resize_gpu_float.cu
#include "cuda_runtime.h"
#include <cuda.h>
#include <time.h>
#include <vector>
#include <opencv2/opencv.hpp>
#include "opencv2/highgui.hpp"
#include <iostream>
#include <math.h>
#include <chrono>
inline __device__
float __char_as_float(uchar b8)
{
return __uint2float_rn(b8) / 127.5f - 1.f;
}
inline __device__
uchar __float2uchar(float f16)
{
return __float2uint_rd(f16);
// return __float2uint_rd(f16 * 255.f);
}
static inline void _safe_cuda_call(cudaError err, const char* msg, const char* file_name, const int line_number)
{
if(err!=cudaSuccess)
{
fprintf(stderr,"%s\n\nFile: %s\n\nLine Number: %d\n\nReason: %s\n",msg,file_name,line_number,cudaGetErrorString(err));
std::cin.get();
exit(EXIT_FAILURE);
}
}
#define SAFE_CALL(call,msg) _safe_cuda_call((call),(msg),__FILE__,__LINE__)
inline __device__
int clip(int x, int a, int b)
{
return x >= a ? (x < b ? x : b - 1) : a;
}
// enlarge the original image k times in x and y direction
// write dataOut coalesced
__global__ void inter_nearest_k(uchar3 *dataIn, uchar3 *dataOut, int imgHeight, int imgWidth, int imgHeight_k, int imgWidth_k, int k)
{
int xIdx = threadIdx.x + blockIdx.x * blockDim.x;
int yIdx = threadIdx.y + blockIdx.y * blockDim.y;
if(xIdx < imgWidth_k && yIdx < imgHeight_k)
{
dataOut[yIdx * imgWidth_k + xIdx] = dataIn[(yIdx / k) * imgWidth + xIdx / k];
}
}
// bilinear interpolation, enlarge k times in x and y direction
// 浮点型乘法
__global__ void inter_liner_k(uchar3 *dataIn, uchar3 *dataOut, int imgHeight, int imgWidth, int imgHeight_k, int imgWidth_k, float scale)
{
// __shared__ float shared_eles[34][34];
int xIdx = threadIdx.x + blockIdx.x * blockDim.x;
int yIdx = threadIdx.y + blockIdx.y * blockDim.y;
if(xIdx < imgWidth_k && yIdx < imgHeight_k)
{
float fx = (float)((xIdx + 0.5f) * scale - 0.5f);
int sx = floorf(fx);
fx -= sx;
sx = min(sx, imgWidth - 1);
int sx2 = min(sx + 1, imgWidth - 1);
if(sx < 0)
sx2 = 0, sx = 0;
float2 cbufx;
cbufx.x = 1.f - fx;
cbufx.y = fx;
float fy = (float)((yIdx + 0.5f) * scale - 0.5f);
int sy = floorf(fy);
fy -= sy;
sy = min(sy, imgHeight - 1);
int sy2 = min(sy + 1, imgHeight - 1);
if(sy < 0)
sy2 = 0, sy = 0;
float2 cbufy;
cbufy.x = 1.f - fy;
cbufy.y = fy;
// if(sx % 2 == 0 || sy % 2 == 0)
// uchar3 s11 = {0,0,0}, s12 = {0,0,0}, s21 = {0,0,0}, s22 = {0,0,0};
// 从global memory加载数据花费1500 us
uchar3 s11 = dataIn[sy * imgWidth + sx];
uchar3 s12 = dataIn[sy * imgWidth + sx2];
uchar3 s21 = dataIn[sy2 * imgWidth + sx];
uchar3 s22 = dataIn[sy2 * imgWidth + sx2];
// __syncthreads();
float h_rst00x, h_rst01x, h_rst00y, h_rst01y, h_rst00z, h_rst01z;
h_rst00x = s11.x * cbufx.x + s12.x * cbufx.y;
h_rst01x = s21.x * cbufx.x + s22.x * cbufx.y;
h_rst00y = s11.y * cbufx.x + s12.y * cbufx.y;
h_rst01y = s21.y * cbufx.x + s22.y * cbufx.y;
h_rst00z = s11.z * cbufx.x + s12.z * cbufx.y;
h_rst01z = s21.z * cbufx.x + s22.z * cbufx.y;
// 写入global memory花费1500 us, 所有其他的运算花费1000 us
dataOut[yIdx*imgWidth_k + xIdx].x = h_rst00x * cbufy.x + h_rst01x * cbufy.y; // B
dataOut[yIdx*imgWidth_k + xIdx].y = h_rst00y * cbufy.x + h_rst01y * cbufy.y; // G
dataOut[yIdx*imgWidth_k + xIdx].z = h_rst00z * cbufy.x + h_rst01z * cbufy.y; // R
}
}
int main(void)
{
int k = 11;
float scale = 1.f / (float) k;
cv::Mat img_ori = cv::imread("lisfan-70.jpg");
int imgWidth = img_ori.cols;
int imgHeight = img_ori.rows;
int imgHeight_k = imgHeight * k;
int imgWidth_k = imgWidth * k;
std::cout << "img_ori.cols: " << img_ori.cols << std::endl; //列
std::cout << "img_ori.rows: " << img_ori.rows << std::endl; //行
cv::Mat img_resize_cpu;
auto start_cpu = std::chrono::steady_clock::now();
// https://docs.opencv.org/4.1.1/da/d54/group__imgproc__transform.html#ga5bb5a1fea74ea38e1a5445ca803ff121
resize(img_ori, img_resize_cpu, cv::Size(imgWidth_k, imgHeight_k), 0, 0, cv::INTER_LINEAR); // INTER_LINEAR INTER_NEAREST
auto end_cpu = std::chrono::steady_clock::now();
std::chrono::duration<double, std::micro> elapsed_cpu = end_cpu - start_cpu; // std::micro 表示以微秒为时间单位
std::cout<< "CPU resize time: " << elapsed_cpu.count() << " us" << std::endl;
// k=1, 525.007 us; k=3, 2533.98 us; k=5, 5612.21 us; k=7, 7895.86 us; k=9, 15263.2 us; k=11, 21533.4 us
cv::imwrite("lisfan-70_" + std::to_string(k) + "_cpu.jpg", img_resize_cpu);
/// GPU //
cudaEvent_t start, stop;
float runtime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord( start, 0 );
uchar3 *d_in;
uchar3 *d_out;
cv::Mat img_resize_gpu(imgHeight_k, imgWidth_k, CV_8UC3);
SAFE_CALL(cudaMalloc((void**)&d_in, imgHeight * imgWidth * sizeof(uchar3)), "cudaMalloc d_in fialed");
SAFE_CALL(cudaMalloc((void**)&d_out, imgHeight_k * imgWidth_k * sizeof(uchar3)), "cudaMalloc d_out failed");
SAFE_CALL(cudaMemcpy(d_in, img_ori.data, imgHeight * imgWidth * sizeof(uchar3), cudaMemcpyHostToDevice), "d_in cudaMemcpyHostToDevice failed");
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgWidth_k + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight_k + threadsPerBlock.y - 1) / threadsPerBlock.y);
inter_liner_k <<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth, imgHeight_k, imgWidth_k, scale);
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &runtime, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
// cudaDeviceSynchronize(); //CPU端计时,需要同步CPU和gpu,否则测速结果为cpu启动内核函数的速度
// k=1, 562.528 us; k=3, 791.072 us; k=5, 1267.71 us; k=7, 1933.28 us; k=9, 2896.38 us; k=11, 3989.31 us
// 不写入global memory,resize完了之后直接用
// k=1, 525.568 us; k=3, 590.449 us; k=5, 754.656 us; k=7, 768.704 us; k=9, 985.76 us; k=11, 1086.08 us
std::cout << "cudaEvent_t time: " << runtime * 1000 << " us" << std::endl;
SAFE_CALL(cudaMemcpy(img_resize_gpu.data, d_out, imgHeight_k * imgWidth_k * sizeof(uchar3), cudaMemcpyDeviceToHost), "d_out cudaMemcpyDeviceToHost failed");
cv::Mat resid(imgHeight_k, imgWidth_k, CV_8UC3);
int diff = 0;
for(int j=0; j<imgHeight_k; j++)
{
for(int i=0; i<imgWidth_k; i++)
{
resid.at<cv::Vec3b>(j, i)[0] = 100 * (img_resize_cpu.at<cv::Vec3b>(j, i)[0] - img_resize_gpu.at<cv::Vec3b>(j, i)[0]);
resid.at<cv::Vec3b>(j, i)[1] = 100 * (img_resize_cpu.at<cv::Vec3b>(j, i)[1] - img_resize_gpu.at<cv::Vec3b>(j, i)[1]);
resid.at<cv::Vec3b>(j, i)[2] = 100 * (img_resize_cpu.at<cv::Vec3b>(j, i)[2] - img_resize_gpu.at<cv::Vec3b>(j, i)[2]);
if(abs(img_resize_cpu.at<cv::Vec3b>(j, i)[0] - img_resize_gpu.at<cv::Vec3b>(j, i)[0]) > 0 ||
abs(img_resize_cpu.at<cv::Vec3b>(j, i)[1] - img_resize_gpu.at<cv::Vec3b>(j, i)[1]) > 0 ||
abs(img_resize_cpu.at<cv::Vec3b>(j, i)[2] - img_resize_gpu.at<cv::Vec3b>(j, i)[2]) > 0) // > 1 diff: 0
{
diff += 1;
}
}
}
std::cout << "diff: " << diff << std::endl;
SAFE_CALL(cudaFree(d_in), "free d_in failed");
SAFE_CALL(cudaFree(d_out), "free d_out failed");
cv::imwrite("lisfan-70_" + std::to_string(k) + "_gpu.jpg", img_resize_gpu);
cv::imwrite("cpu_gpu_residual_" + std::to_string(k) + ".jpg", resid);
return 0;
}
浮点型乘法比opencv自带的版本大概快5倍左右,
img_ori.cols: 720
img_ori.rows: 540
CPU resize time: 23108.1 us
cudaEvent_t time: 4000.26 us
diff: 27521005
===================================================================
resize_gpu_int.cu
#include "cuda_runtime.h"
#include <cuda.h>
#include <time.h>
#include <vector>
#include <opencv2/opencv.hpp>
#include "opencv2/highgui.hpp"
#include <iostream>
#include <math.h>
#include <chrono>
#include <bitset>
inline __device__
float __char_as_float(uchar b8)
{
return __uint2float_rn(b8) / 127.5f - 1.f;
}
inline __device__
uchar __float2uchar(float f16)
{
return __float2uint_rd(f16);
// return __float2uint_rd(f16 * 255.f);
}
inline __device__
int clip(int x, int a, int b)
{
return x >= a ? (x < b ? x : b - 1) : a;
}
// enlarge the original image k times in x and y direction
// write dataOut coalesced
__global__ void inter_nearest_k(uchar3 *dataIn, uchar3 *dataOut, int imgHeight, int imgWidth, int imgHeight_k, int imgWidth_k, int k)
{
int xIdx = threadIdx.x + blockIdx.x * blockDim.x;
int yIdx = threadIdx.y + blockIdx.y * blockDim.y;
if(xIdx < imgWidth_k && yIdx < imgHeight_k)
{
dataOut[yIdx * imgWidth_k + xIdx] = dataIn[(yIdx / k) * imgWidth + xIdx / k];
}
}
// bilinear interpolation, enlarge k times in x and y direction
// 整型移位运算代替浮点型乘法
__global__ void inter_liner_k(uchar3 *dataIn, uchar3 *dataOut, int imgHeight, int imgWidth, int imgHeight_k, int imgWidth_k, float scale)
{
int xIdx = threadIdx.x + blockIdx.x * blockDim.x;
int yIdx = threadIdx.y + blockIdx.y * blockDim.y;
if(xIdx < imgWidth_k && yIdx < imgHeight_k)
{
float fx = (float)((xIdx + 0.5f) * scale - 0.5f);
int sx = floorf(fx);
fx -= sx;
sx = min(sx, imgWidth - 1);
int sx2 = min(sx + 1, imgWidth - 1);
if(sx < 0)
sx2 = 0;
sx = max(0, sx);
int2 cbufx;
cbufx.x = min(__float2uint_rd((1.f - fx) * 2048), 2048);
cbufx.x = max(0, cbufx.x);
cbufx.y = 2048 - cbufx.x;
float fy = (float)((yIdx + 0.5f) * scale - 0.5f);
int sy = floorf(fy);
fy -= sy;
sy = min(sy, imgHeight - 1);
int sy2 = min(sy + 1, imgHeight - 1);
if(sy < 0)
sy2 = 0;
sy = max(0, sy);
int2 cbufy;
cbufy.x = min(__float2uint_rd((1.f - fy) * 2048), 2048);
cbufy.x = max(0, cbufy.x);
cbufy.y = 2048 - cbufy.x;
uchar3 d11 = dataIn[sy * imgWidth + sx];
uchar3 d12 = dataIn[sy * imgWidth + sx2];
uchar3 d21 = dataIn[sy2 * imgWidth + sx];
uchar3 d22 = dataIn[sy2 * imgWidth + sx2];
int h_rst00x, h_rst01x, h_rst00y, h_rst01y, h_rst00z, h_rst01z;
h_rst00x = d11.x * cbufx.x + d12.x * cbufx.y;
h_rst01x = d21.x * cbufx.x + d22.x * cbufx.y;
h_rst00y = d11.y * cbufx.x + d12.y * cbufx.y;
h_rst01y = d21.y * cbufx.x + d22.y * cbufx.y;
h_rst00z = d11.z * cbufx.x + d12.z * cbufx.y;
h_rst01z = d21.z * cbufx.x + d22.z * cbufx.y;
dataOut[yIdx*imgWidth_k + xIdx].x = h_rst00x * cbufy.x + h_rst01x * cbufy.y >> 22; // B
dataOut[yIdx*imgWidth_k + xIdx].y = h_rst00y * cbufy.x + h_rst01y * cbufy.y >> 22; // G
dataOut[yIdx*imgWidth_k + xIdx].z = h_rst00z * cbufy.x + h_rst01z * cbufy.y >> 22; // R
}
}
int main(void)
{
int k = 1;
float scale = 1.f / (float) k;
cv::Mat img_ori = cv::imread("lisfan-70.jpg");
int imgWidth = img_ori.cols;
int imgHeight = img_ori.rows;
int imgHeight_k = imgHeight * k;
int imgWidth_k = imgWidth * k;
std::cout << "img_ori.cols: " << img_ori.cols << std::endl; //列
std::cout << "img_ori.rows: " << img_ori.rows << std::endl; //行
cv::Mat img_resize_cpu;
auto start_cpu = std::chrono::steady_clock::now();
// https://docs.opencv.org/4.1.1/da/d54/group__imgproc__transform.html#ga5bb5a1fea74ea38e1a5445ca803ff121
resize(img_ori, img_resize_cpu, cv::Size(imgWidth_k, imgHeight_k), 0, 0, cv::INTER_LINEAR); // INTER_LINEAR INTER_NEAREST
auto end_cpu = std::chrono::steady_clock::now();
std::chrono::duration<double, std::micro> elapsed_cpu = end_cpu - start_cpu; // std::micro 表示以微秒为时间单位
std::cout<< "CPU resize time: " << elapsed_cpu.count() << " us" << std::endl;
// k=1, 541.203 us; k=3, 2155.18 us; k=5, 5612.21 us; k=7, 9993.43 us; k=9, 16524.4 us; k=11, 21533.4 us
cv::imwrite("lisfan-70_" + std::to_string(k) + "_cpu.jpg", img_resize_cpu);
/// GPU //
cudaEvent_t start, stop;
float runtime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord( start, 0 );
uchar3 *d_in;
uchar3 *d_out;
cv::Mat img_resize_gpu(imgHeight_k, imgWidth_k, CV_8UC3);
cudaMalloc((void**)&d_in, imgHeight * imgWidth * sizeof(uchar3));
cudaMalloc((void**)&d_out, imgHeight_k * imgWidth_k * sizeof(uchar3));
cudaMemcpy(d_in, img_ori.data, imgHeight * imgWidth * sizeof(uchar3), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgWidth_k + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight_k + threadsPerBlock.y - 1) / threadsPerBlock.y);
inter_liner_k <<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth, imgHeight_k, imgWidth_k, scale);
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &runtime, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
// cudaDeviceSynchronize(); //CPU端计时,需要同步CPU和gpu,否则测速结果为cpu启动内核函数的速度
// k=1, 549.528 us; k=3, 736.576 us; k=5, 1135.58 us; k=7, 1718.05 us; k=9, 2472.19 us; k=11, 3426.85 us
// 不写入global memory,resize完了之后直接用
// k=1, 525.568 us; k=3, 590.449 us; k=5, 754.656 us; k=7, 768.704 us; k=9, 985.76 us; k=11, 1086.08 us
std::cout << "cudaEvent_t time: " << runtime * 1000 << " us" << std::endl;
cudaMemcpy(img_resize_gpu.data, d_out, imgHeight_k * imgWidth_k * sizeof(uchar3), cudaMemcpyDeviceToHost);
cv::Mat resid(imgHeight_k, imgWidth_k, CV_8UC3);
int diff = 0;
for(int j=0; j<imgHeight_k; j++)
{
for(int i=0; i<imgWidth_k; i++)
{
resid.at<cv::Vec3b>(j, i)[0] = 100 * (img_resize_cpu.at<cv::Vec3b>(j, i)[0] - img_resize_gpu.at<cv::Vec3b>(j, i)[0]);
resid.at<cv::Vec3b>(j, i)[1] = 100 * (img_resize_cpu.at<cv::Vec3b>(j, i)[1] - img_resize_gpu.at<cv::Vec3b>(j, i)[1]);
resid.at<cv::Vec3b>(j, i)[2] = 100 * (img_resize_cpu.at<cv::Vec3b>(j, i)[2] - img_resize_gpu.at<cv::Vec3b>(j, i)[2]);
if(abs(img_resize_cpu.at<cv::Vec3b>(j, i)[0] - img_resize_gpu.at<cv::Vec3b>(j, i)[0]) > 0 ||
abs(img_resize_cpu.at<cv::Vec3b>(j, i)[1] - img_resize_gpu.at<cv::Vec3b>(j, i)[1]) > 0 ||
abs(img_resize_cpu.at<cv::Vec3b>(j, i)[2] - img_resize_gpu.at<cv::Vec3b>(j, i)[2]) > 0) // > 1 diff: 0
{
diff += 1;
}
}
}
std::cout << "diff: " << diff << std::endl;
cudaFree(d_in);
cudaFree(d_out);
cv::imwrite("lisfan-70_" + std::to_string(k) + "_gpu.jpg", img_resize_gpu);
cv::imwrite("cpu_gpu_residual_" + std::to_string(k) + ".jpg", resid);
return 0;
}
整数移位算法比浮点型乘法快15%,参考【OpenCV】双线性插值法_Y忍冬草-CSDN博客_opencv 双线性插值
我自己写的和opencv 的resize精度有些差别,参考 关于opencv的resize - 知乎
CMakeLists.txt
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
PROJECT(resize_gpu_bilinear)
FIND_PACKAGE(OpenCV REQUIRED)
# ibopencv_imgcodecs
FIND_PACKAGE(CUDA REQUIRED)
CUDA_ADD_EXECUTABLE(resize_gpu_float resize_gpu_float.cu)
TARGET_LINK_LIBRARIES(resize_gpu_float ${OpenCV_LIBS})
CUDA_ADD_EXECUTABLE(resize_gpu_int resize_gpu_int.cu)
TARGET_LINK_LIBRARIES(resize_gpu_int ${OpenCV_LIBS})
# set(CMAKE_CXX_FLAGS "-std=c++11") # 加不加这行都可以
# 直接鼠标右键Run Code不能跑
# mkdir build && cd build
# cd ..
# ./build/display_image
大部分pixel会被加载4次,使用shared memory应该能更快一些,但是先做点别的吧