Getting Started with CUDA Programming (3): Implementing Matrix Multiplication (Part 1)
Contents
- Grid thread-loop matrix multiplication
- Block thread-loop matrix multiplication
- Row shared-memory matrix multiplication
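Note that, to keep the listings short, none of the code below checks CUDA return codes, so a failed allocation, copy, or launch would go unnoticed. In real code it is worth wrapping every runtime call; here is a minimal sketch (the CUDA_CHECK macro name is my own, not part of the listings below):

#include <cstdio>
#include <cstdlib>

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)

// Usage:
//   CUDA_CHECK(cudaMalloc((void**)&d_x, M*L*sizeof(float)));
//   matrix_mul<<<blocks, threads>>>(d_x, d_y, d_z, M, N, L);
//   CUDA_CHECK(cudaGetLastError());       // catches bad launch configurations
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches errors raised during execution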
Grid thread-loop matrix multiplication
In this first version every thread in the grid computes one element of the m*n output: the global thread index is decomposed into a (row, col) pair, and the thread accumulates the dot product of one row of x with one column of y.
#include <iostream>

// Matrix multiplication: x is m*l, y is l*n, z is m*n (row-major)
__global__ void matrix_mul(float* x, float* y, float* z, int m, int n, int l)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = idx / n;  // row-major linear index: idx = row*n + col
    const int col = idx % n;
    if (row < m && col < n) {
        float sum = 0.0f;  // accumulate in a register, then write once
        for (int i = 0; i < l; i++) {
            sum += x[row*l + i] * y[i*n + col];
        }
        z[row*n + col] = sum;
    }
}
int main()
{
    int M = 2048;
    int L = 1024;
    int N = 512;
    // Allocate host memory
    float *x = (float*)malloc(M*L*sizeof(float));
    float *y = (float*)malloc(L*N*sizeof(float));
    float *z = (float*)malloc(M*N*sizeof(float));
    if (x == NULL || y == NULL || z == NULL)
        return -1;
    // Initialize the data
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < L; ++j) {
            x[i*L + j] = 1.1;
        }
    }
    for (int i = 0; i < L; ++i) {
        for (int j = 0; j < N; ++j) {
            y[i*N + j] = 1.1;
        }
    }
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            z[i*N + j] = 0;
        }
    }
    // Allocate device memory
    float *d_x, *d_y, *d_z;
    cudaMalloc((void**)&d_x, M*L*sizeof(float));
    cudaMalloc((void**)&d_y, L*N*sizeof(float));
    cudaMalloc((void**)&d_z, M*N*sizeof(float));
    // Copy the host data to the device
    cudaMemcpy(d_x, x, M*L*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, L*N*sizeof(float), cudaMemcpyHostToDevice);
    // Define the kernel launch configuration: one thread per output
    // element, with the grid size rounded up to cover all M*N elements
    dim3 threads(1024);
    dim3 blocks((M*N + threads.x - 1) / threads.x);
    matrix_mul<<<blocks, threads>>>(d_x, d_y, d_z, M, N, L);
    // Copy the result from the device back to the host
    cudaMemcpy(z, d_z, M*N*sizeof(float), cudaMemcpyDeviceToHost);
    // Print the first 10 values
    for (int i = 0; i < 10; i++) {
        std::cout << z[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "Done!" << std::endl;
    // Free device memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    // Free host memory
    free(x);
    free(y);
    free(z);
    return 0;
}
Profiling with nvprof. Each element of z is a dot product of 1024 terms of 1.1 * 1.1, i.e. 1.1 * 1.1 * 1024 = 1239.04, which matches the printed values:
==73446== NVPROF is profiling process 73446, command: ./matrix_mul
1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04
Done!
==73446== Profiling application: ./matrix_mul
==73446== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 95.86% 60.227ms 1 60.227ms 60.227ms 60.227ms matrix_mul(float*, float*, float*, int, int, int)
3.10% 1.9449ms 2 972.46us 312.74us 1.6322ms [CUDA memcpy HtoD]
1.04% 653.50us 1 653.50us 653.50us 653.50us [CUDA memcpy DtoH]
API calls: 82.68% 327.03ms 3 109.01ms 133.11us 326.75ms cudaMalloc
16.08% 63.619ms 3 21.206ms 539.11us 61.260ms cudaMemcpy
0.55% 2.1690ms 4 542.25us 395.22us 615.75us cuDeviceTotalMem
0.34% 1.3547ms 404 3.3530us 127ns 161.59us cuDeviceGetAttribute
0.28% 1.1255ms 3 375.16us 255.29us 480.30us cudaFree
0.04% 150.00us 4 37.500us 30.600us 50.728us cuDeviceGetName
0.01% 53.870us 1 53.870us 53.870us 53.870us cudaLaunchKernel
0.00% 12.844us 4 3.2110us 1.1030us 8.0160us cuDeviceGetPCIBusId
0.00% 6.2510us 8 781ns 142ns 3.0790us cuDeviceGet
0.00% 979ns 3 326ns 190ns 558ns cuDeviceGetCount
0.00% 882ns 4 220ns 164ns 305ns cuDeviceGetUuid
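The same one-thread-per-element idea is usually written with a 2D grid, which avoids the division and modulo in every thread. Here is a sketch of the equivalent kernel and launch (my variant, not part of the original post):

// 2D variant: thread (col, row) computes z[row*n + col]
__global__ void matrix_mul_2d(float* x, float* y, float* z, int m, int n, int l)
{
    const int row = threadIdx.y + blockIdx.y * blockDim.y;
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    if (row < m && col < n) {
        float sum = 0.0f;
        for (int i = 0; i < l; i++) {
            sum += x[row*l + i] * y[i*n + col];
        }
        z[row*n + col] = sum;
    }
}

// Launch: 16x16 threads per block, grid rounded up in both dimensions
// dim3 threads(16, 16);
// dim3 blocks((N + threads.x - 1) / threads.x, (M + threads.y - 1) / threads.y);
// matrix_mul_2d<<<blocks, threads>>>(d_x, d_y, d_z, M, N, L);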
Block thread-loop matrix multiplication
Here the grid has one block per output row: block bidx computes row bidx, and its threads stride across the n columns in steps of blockDim.x.
#include <iostream>

// Matrix multiplication: x is m*l, y is l*n, z is m*n (row-major)
// Block bidx computes row bidx; its threads stride over the columns.
__global__ void matrix_mul(float* x, float* y, float* z, int m, int n, int l)
{
    int bidx = blockIdx.x;
    int tidx = threadIdx.x;
    // for (; bidx < m; bidx += gridDim.x)  // grid-stride over rows if gridDim.x < m
    {
        for (; tidx < n; tidx += blockDim.x) {
            float sum = 0.0f;  // accumulate in a register, then write once
            for (int i = 0; i < l; i++) {
                sum += x[bidx*l + i] * y[i*n + tidx];
            }
            z[bidx*n + tidx] = sum;
        }
    }
}
int main()
{
    int M = 2048;
    int L = 1024;
    int N = 512;
    // Allocate host memory
    float *x = (float*)malloc(M*L*sizeof(float));
    float *y = (float*)malloc(L*N*sizeof(float));
    float *z = (float*)malloc(M*N*sizeof(float));
    if (x == NULL || y == NULL || z == NULL)
        return -1;
    // Initialize the data
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < L; ++j) {
            x[i*L + j] = 1.1;
        }
    }
    for (int i = 0; i < L; ++i) {
        for (int j = 0; j < N; ++j) {
            y[i*N + j] = 1.1;
        }
    }
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            z[i*N + j] = 0;
        }
    }
    // Allocate device memory
    float *d_x, *d_y, *d_z;
    cudaMalloc((void**)&d_x, M*L*sizeof(float));
    cudaMalloc((void**)&d_y, L*N*sizeof(float));
    cudaMalloc((void**)&d_z, M*N*sizeof(float));
    // Copy the host data to the device
    cudaMemcpy(d_x, x, M*L*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, L*N*sizeof(float), cudaMemcpyHostToDevice);
    // Define the kernel launch configuration: one block per output row
    dim3 blocks(M);
    dim3 threads(1024);
    matrix_mul<<<blocks, threads>>>(d_x, d_y, d_z, M, N, L);
    // Copy the result from the device back to the host
    cudaMemcpy(z, d_z, M*N*sizeof(float), cudaMemcpyDeviceToHost);
    // Print the first 10 values
    for (int i = 0; i < 10; i++) {
        std::cout << z[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "Done!" << std::endl;
    // Free device memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    // Free host memory
    free(x);
    free(y);
    free(z);
    return 0;
}
Profiling results:
==74746== NVPROF is profiling process 74746, command: ./matrix_mul
1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04
Done!
==74746== Profiling application: ./matrix_mul
==74746== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 89.02% 20.431ms 1 20.431ms 20.431ms 20.431ms matrix_mul(float*, float*, float*, int, int, int)
9.16% 2.1021ms 2 1.0510ms 334.56us 1.7675ms [CUDA memcpy HtoD]
1.82% 418.82us 1 418.82us 418.82us 418.82us [CUDA memcpy DtoH]
API calls: 91.47% 299.76ms 3 99.921ms 316.68us 298.24ms cudaMalloc
7.18% 23.523ms 3 7.8409ms 569.60us 20.994ms cudaMemcpy
0.65% 2.1442ms 4 536.06us 389.86us 611.69us cuDeviceTotalMem
0.40% 1.3031ms 404 3.2250us 128ns 153.61us cuDeviceGetAttribute
0.23% 753.67us 3 251.22us 144.11us 405.16us cudaFree
0.04% 143.95us 4 35.987us 29.223us 54.174us cuDeviceGetName
0.02% 54.046us 1 54.046us 54.046us 54.046us cudaLaunchKernel
0.00% 14.751us 4 3.6870us 1.6850us 8.8690us cuDeviceGetPCIBusId
0.00% 4.3010us 8 537ns 139ns 2.8450us cuDeviceGet
0.00% 1.0200us 3 340ns 186ns 599ns cuDeviceGetCount
0.00% 841ns 4 210ns 167ns 313ns cuDeviceGetUuid
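The kernel time drops from about 60 ms in the grid thread-loop version to about 20 ms. All timings here come from nvprof; if you want to measure a kernel from inside the program instead, CUDA events are the standard tool. A minimal sketch, using the same variables as the listing above:

    // Time the kernel with CUDA events instead of nvprof
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matrix_mul<<<blocks, threads>>>(d_x, d_y, d_z, M, N, L);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);  // block the host until the kernel has finished
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    std::cout << "kernel time: " << ms << " ms" << std::endl;
    cudaEventDestroy(start);
    cudaEventDestroy(stop);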
Row shared-memory matrix multiplication
This version keeps the one-block-per-row layout, but the block first stages its row of x in dynamic shared memory, so the l values that every column reuses are read from fast on-chip memory instead of repeatedly from global memory.
#include <iostream>

// Matrix multiplication: x is m*l, y is l*n, z is m*n (row-major)
// Block bidx stages row bidx of x in shared memory, then computes the row.
__global__ void matrix_mul(float* x, float* y, float* z, int m, int n, int l)
{
    int bidx = blockIdx.x;
    int tidx = threadIdx.x;
    extern __shared__ float data[];  // sized to sizeof(float)*l at launch
    for (int i = tidx; i < l; i += blockDim.x) {
        data[i] = x[bidx*l + i];
    }
    // __syncthreads() is required here: all threads of the block cooperate
    // to write row bidx of the m*l matrix into data, and none of them may
    // start reading data until every element has been written.
    __syncthreads();
    // for (; bidx < m; bidx += gridDim.x)  // grid-stride over rows if gridDim.x < m
    {
        for (; tidx < n; tidx += blockDim.x) {
            float sum = 0.0f;  // accumulate in a register, then write once
            for (int i = 0; i < l; i++) {
                sum += data[i] * y[i*n + tidx];
            }
            z[bidx*n + tidx] = sum;
        }
    }
}
int main()
{
    int M = 2048;
    int L = 1024;
    int N = 512;
    // Allocate host memory
    float *x = (float*)malloc(M*L*sizeof(float));
    float *y = (float*)malloc(L*N*sizeof(float));
    float *z = (float*)malloc(M*N*sizeof(float));
    if (x == NULL || y == NULL || z == NULL)
        return -1;
    // Initialize the data
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < L; ++j) {
            x[i*L + j] = 1.1;
        }
    }
    for (int i = 0; i < L; ++i) {
        for (int j = 0; j < N; ++j) {
            y[i*N + j] = 1.1;
        }
    }
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            z[i*N + j] = 0;
        }
    }
    // Allocate device memory
    float *d_x, *d_y, *d_z;
    cudaMalloc((void**)&d_x, M*L*sizeof(float));
    cudaMalloc((void**)&d_y, L*N*sizeof(float));
    cudaMalloc((void**)&d_z, M*N*sizeof(float));
    // Copy the host data to the device
    cudaMemcpy(d_x, x, M*L*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, L*N*sizeof(float), cudaMemcpyHostToDevice);
    // Define the kernel launch configuration: one block per output row,
    // plus sizeof(float)*L bytes of dynamic shared memory for one row of x
    dim3 blocks(M);
    dim3 threads(1024);
    matrix_mul<<<blocks, threads, sizeof(float)*L>>>(d_x, d_y, d_z, M, N, L);
    // Copy the result from the device back to the host
    cudaMemcpy(z, d_z, M*N*sizeof(float), cudaMemcpyDeviceToHost);
    // Print the first 10 values
    for (int i = 0; i < 10; i++) {
        std::cout << z[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "Done!" << std::endl;
    // Free device memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    // Free host memory
    free(x);
    free(y);
    free(z);
    return 0;
}
Profiling results:
==77269== NVPROF is profiling process 77269, command: ./matrix_mul
1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04
Done!
==77269== Profiling application: ./matrix_mul
==77269== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 88.75% 19.976ms 1 19.976ms 19.976ms 19.976ms matrix_mul(float*, float*, float*, int, int, int)
9.33% 2.1003ms 2 1.0502ms 324.61us 1.7757ms [CUDA memcpy HtoD]
1.91% 430.65us 1 430.65us 430.65us 430.65us [CUDA memcpy DtoH]
API calls: 92.37% 333.05ms 3 111.02ms 136.02us 332.75ms cudaMalloc
6.40% 23.063ms 3 7.6878ms 554.56us 20.552ms cudaMemcpy
0.59% 2.1253ms 4 531.34us 383.96us 608.21us cuDeviceTotalMem
0.38% 1.3548ms 404 3.3530us 126ns 159.10us cuDeviceGetAttribute
0.21% 748.13us 3 249.38us 139.12us 407.51us cudaFree
0.04% 140.10us 4 35.025us 29.517us 48.728us cuDeviceGetName
0.02% 59.697us 1 59.697us 59.697us 59.697us cudaLaunchKernel
0.00% 13.002us 4 3.2500us 1.0700us 9.4180us cuDeviceGetPCIBusId
0.00% 4.7540us 8 594ns 138ns 2.6860us cuDeviceGet
0.00% 1.0420us 3 347ns 200ns 617ns cuDeviceGetCount
0.00% 868ns 4 217ns 167ns 273ns cuDeviceGetUuid
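The shared-memory version is only marginally faster here (19.98 ms vs 20.43 ms), likely because all threads of a warp read the same element of x at the same time, so those reads were already well served by the cache. Whichever version you run, it is better to validate the whole result against a CPU reference than to eyeball the first ten values. A minimal host-side check (the verify_result helper is my own, not from the original post; note the full reference costs M*N*L multiply-adds, so it takes a while):

#include <algorithm>
#include <cmath>

// Recompute z on the CPU and check the largest absolute difference.
// A small tolerance is used because float accumulation order may differ.
bool verify_result(const float* x, const float* y, const float* z,
                   int m, int n, int l, float tol = 1e-2f)
{
    float max_err = 0.0f;
    for (int r = 0; r < m; ++r) {
        for (int c = 0; c < n; ++c) {
            float ref = 0.0f;
            for (int i = 0; i < l; i++) {
                ref += x[r*l + i] * y[i*n + c];
            }
            max_err = std::max(max_err, std::fabs(ref - z[r*n + c]));
        }
    }
    return max_err < tol;
}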