Getting Started with CUDA Programming (3): Implementing Matrix Multiplication (Part 1)
Contents
- Grid thread-loop matrix multiplication
- Block thread-loop matrix multiplication
- Row shared-memory matrix multiplication
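Note that, to keep the listings short, none of the code below checks CUDA return codes, so a failed allocation, copy, or launch would go unnoticed. In real code it is worth wrapping every runtime call; here is a minimal sketch (the CUDA_CHECK macro name is my own, not part of the listings below):

#include <cstdio>
#include <cstdlib>

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)

// Usage:
//   CUDA_CHECK(cudaMalloc((void**)&d_x, M*L*sizeof(float)));
//   matrix_mul<<<blocks, threads>>>(d_x, d_y, d_z, M, N, L);
//   CUDA_CHECK(cudaGetLastError());       // catches bad launch configurations
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches errors raised during execution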
Grid thread-loop matrix multiplication
In this first version every thread in the grid computes one element of the m*n output: the global thread index is decomposed into a (row, col) pair, and the thread accumulates the dot product of one row of x with one column of y.
#include <iostream>

// Matrix multiplication: x is m*l, y is l*n, z is m*n (row-major)
__global__ void matrix_mul(float* x, float* y, float* z, int m, int n, int l)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = idx / n;  // row-major linear index: idx = row*n + col
    const int col = idx % n;
    if (row < m && col < n) {
        float sum = 0.0f;  // accumulate in a register, then write once
        for (int i = 0; i < l; i++) {
            sum += x[row*l + i] * y[i*n + col];
        }
        z[row*n + col] = sum;
    }
}
int main()
{
    int M = 2048;
    int L = 1024;
    int N = 512;
    // Allocate host memory
    float *x = (float*)malloc(M*L*sizeof(float));
    float *y = (float*)malloc(L*N*sizeof(float));
    float *z = (float*)malloc(M*N*sizeof(float));
    if (x == NULL || y == NULL || z == NULL)
        return -1;
    // Initialize the data
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < L; ++j) {
            x[i*L + j] = 1.1;
        }
    }
    for (int i = 0; i < L; ++i) {
        for (int j = 0; j < N; ++j) {
            y[i*N + j] = 1.1;
        }
    }
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            z[i*N + j] = 0;
        }
    }
    // Allocate device memory
    float *d_x, *d_y, *d_z;
    cudaMalloc((void**)&d_x, M*L*sizeof(float));
    cudaMalloc((void**)&d_y, L*N*sizeof(float));
    cudaMalloc((void**)&d_z, M*N*sizeof(float));
    // Copy the host data to the device
    cudaMemcpy(d_x, x, M*L*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, L*N*sizeof(float), cudaMemcpyHostToDevice);
    // Define the kernel launch configuration: one thread per output
    // element, with the grid size rounded up to cover all M*N elements
    dim3 threads(1024);
    dim3 blocks((M*N + threads.x - 1) / threads.x);
    matrix_mul<<<blocks, threads>>>(d_x, d_y, d_z, M, N, L);
    // Copy the result from the device back to the host
    cudaMemcpy(z, d_z, M*N*sizeof(float), cudaMemcpyDeviceToHost);
    // Print the first 10 values
    for (int i = 0; i < 10; i++) {
        std::cout << z[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "Done!" << std::endl;
    // Free device memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    // Free host memory
    free(x);
    free(y);
    free(z);
    return 0;
}
Profiling with nvprof. Each element of z is a dot product of 1024 terms of 1.1 * 1.1, i.e. 1.1 * 1.1 * 1024 = 1239.04, which matches the printed values:
==73446== NVPROF is profiling process 73446, command: ./matrix_mul
1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04
Done!
==73446== Profiling application: ./matrix_mul
==73446== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 95.86% 60.227ms 1 60.227ms 60.227ms 60.227ms matrix_mul(float*, float*, float*, int, int, int)
3.10% 1.9449ms 2 972.46us 312.74us 1.6322ms [CUDA memcpy HtoD]
1.04% 653.50us 1 653.50us 653.50us 653.50us [CUDA memcpy DtoH]
API calls: 82.68% 327.03ms 3 109.01ms 133.11us 326.75ms cudaMalloc
16.08% 63.619ms 3 21.206ms 539.11us 61.260ms cudaMemcpy
0.55% 2.1690ms 4 542.25us 395.22us 615.75us cuDeviceTotalMem
0.34% 1.3547ms 404 3.3530us 127ns 161.59us cuDeviceGetAttribute
0.28% 1.1255ms 3 375.16us 255.29us 480.30us cudaFree
0.04% 150.00us 4 37.500us 30.600us 50.728us cuDeviceGetName
0.01% 53.870us 1 53.870us 53.870us 53.870us cudaLaunchKernel
0.00% 12.844us 4 3.2110us 1.1030us 8.0160us cuDeviceGetPCIBusId
0.00% 6.2510us 8 781ns 142ns 3.0790us cuDeviceGet
0.00% 979ns 3 326ns 190ns 558ns cuDeviceGetCount
0.00% 882ns 4 220ns 164ns 305ns cuDeviceGetUuid
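The same one-thread-per-element idea is usually written with a 2D grid, which avoids the division and modulo in every thread. Here is a sketch of the equivalent kernel and launch (my variant, not part of the original post):

// 2D variant: thread (col, row) computes z[row*n + col]
__global__ void matrix_mul_2d(float* x, float* y, float* z, int m, int n, int l)
{
    const int row = threadIdx.y + blockIdx.y * blockDim.y;
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    if (row < m && col < n) {
        float sum = 0.0f;
        for (int i = 0; i < l; i++) {
            sum += x[row*l + i] * y[i*n + col];
        }
        z[row*n + col] = sum;
    }
}

// Launch: 16x16 threads per block, grid rounded up in both dimensions
// dim3 threads(16, 16);
// dim3 blocks((N + threads.x - 1) / threads.x, (M + threads.y - 1) / threads.y);
// matrix_mul_2d<<<blocks, threads>>>(d_x, d_y, d_z, M, N, L);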
Block thread-loop matrix multiplication
Here the grid has one block per output row: block bidx computes row bidx, and its threads stride across the n columns in steps of blockDim.x.
#include <iostream>

// Matrix multiplication: x is m*l, y is l*n, z is m*n (row-major)
// Block bidx computes row bidx; its threads stride over the columns.
__global__ void matrix_mul(float* x, float* y, float* z, int m, int n, int l)
{
    int bidx = blockIdx.x;
    int tidx = threadIdx.x;
    // for (; bidx < m; bidx += gridDim.x)  // grid-stride over rows if gridDim.x < m
    {
        for (; tidx < n; tidx += blockDim.x) {
            float sum = 0.0f;  // accumulate in a register, then write once
            for (int i = 0; i < l; i++) {
                sum += x[bidx*l + i] * y[i*n + tidx];
            }
            z[bidx*n + tidx] = sum;
        }
    }
}
int main()
{
    int M = 2048;
    int L = 1024;
    int N = 512;
    // Allocate host memory
    float *x = (float*)malloc(M*L*sizeof(float));
    float *y = (float*)malloc(L*N*sizeof(float));
    float *z = (float*)malloc(M*N*sizeof(float));
    if (x == NULL || y == NULL || z == NULL)
        return -1;
    // Initialize the data
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < L; ++j) {
            x[i*L + j] = 1.1;
        }
    }
    for (int i = 0; i < L; ++i) {
        for (int j = 0; j < N; ++j) {
            y[i*N + j] = 1.1;
        }
    }
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            z[i*N + j] = 0;
        }
    }
    // Allocate device memory
    float *d_x, *d_y, *d_z;
    cudaMalloc((void**)&d_x, M*L*sizeof(float));
    cudaMalloc((void**)&d_y, L*N*sizeof(float));
    cudaMalloc((void**)&d_z, M*N*sizeof(float));
    // Copy the host data to the device
    cudaMemcpy(d_x, x, M*L*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, L*N*sizeof(float), cudaMemcpyHostToDevice);
    // Define the kernel launch configuration: one block per output row
    dim3 blocks(M);
    dim3 threads(1024);
    matrix_mul<<<blocks, threads>>>(d_x, d_y, d_z, M, N, L);
    // Copy the result from the device back to the host
    cudaMemcpy(z, d_z, M*N*sizeof(float), cudaMemcpyDeviceToHost);
    // Print the first 10 values
    for (int i = 0; i < 10; i++) {
        std::cout << z[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "Done!" << std::endl;
    // Free device memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    // Free host memory
    free(x);
    free(y);
    free(z);
    return 0;
}
Profiling results:
==74746== NVPROF is profiling process 74746, command: ./matrix_mul
1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04
Done!
==74746== Profiling application: ./matrix_mul
==74746== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 89.02% 20.431ms 1 20.431ms 20.431ms 20.431ms matrix_mul(float*, float*, float*, int, int, int)
9.16% 2.1021ms 2 1.0510ms 334.56us 1.7675ms [CUDA memcpy HtoD]
1.82% 418.82us 1 418.82us 418.82us 418.82us [CUDA memcpy DtoH]
API calls: 91.47% 299.76ms 3 99.921ms 316.68us 298.24ms cudaMalloc
7.18% 23.523ms 3 7.8409ms 569.60us 20.994ms cudaMemcpy
0.65% 2.1442ms 4 536.06us 389.86us 611.69us cuDeviceTotalMem
0.40% 1.3031ms 404 3.2250us 128ns 153.61us cuDeviceGetAttribute
0.23% 753.67us 3 251.22us 144.11us 405.16us cudaFree
0.04% 143.95us 4 35.987us 29.223us 54.174us cuDeviceGetName
0.02% 54.046us 1 54.046us 54.046us 54.046us cudaLaunchKernel
0.00% 14.751us 4 3.6870us 1.6850us 8.8690us cuDeviceGetPCIBusId
0.00% 4.3010us 8 537ns 139ns 2.8450us cuDeviceGet
0.00% 1.0200us 3 340ns 186ns 599ns cuDeviceGetCount
0.00% 841ns 4 210ns 167ns 313ns cuDeviceGetUuid
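The kernel time drops from about 60 ms in the grid thread-loop version to about 20 ms. All timings here come from nvprof; if you want to measure a kernel from inside the program instead, CUDA events are the standard tool. A minimal sketch, using the same variables as the listing above:

    // Time the kernel with CUDA events instead of nvprof
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matrix_mul<<<blocks, threads>>>(d_x, d_y, d_z, M, N, L);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);  // block the host until the kernel has finished
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    std::cout << "kernel time: " << ms << " ms" << std::endl;
    cudaEventDestroy(start);
    cudaEventDestroy(stop);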
Row shared-memory matrix multiplication
This version keeps the one-block-per-row layout, but the block first stages its row of x in dynamic shared memory, so the l values that every column reuses are read from fast on-chip memory instead of repeatedly from global memory.
#include <iostream>

// Matrix multiplication: x is m*l, y is l*n, z is m*n (row-major)
// Block bidx stages row bidx of x in shared memory, then computes the row.
__global__ void matrix_mul(float* x, float* y, float* z, int m, int n, int l)
{
    int bidx = blockIdx.x;
    int tidx = threadIdx.x;
    extern __shared__ float data[];  // sized to sizeof(float)*l at launch
    for (int i = tidx; i < l; i += blockDim.x) {
        data[i] = x[bidx*l + i];
    }
    // __syncthreads() is required here: all threads of the block cooperate
    // to write row bidx of the m*l matrix into data, and none of them may
    // start reading data until every element has been written.
    __syncthreads();
    // for (; bidx < m; bidx += gridDim.x)  // grid-stride over rows if gridDim.x < m
    {
        for (; tidx < n; tidx += blockDim.x) {
            float sum = 0.0f;  // accumulate in a register, then write once
            for (int i = 0; i < l; i++) {
                sum += data[i] * y[i*n + tidx];
            }
            z[bidx*n + tidx] = sum;
        }
    }
}
int main()
{
    int M = 2048;
    int L = 1024;
    int N = 512;
    // Allocate host memory
    float *x = (float*)malloc(M*L*sizeof(float));
    float *y = (float*)malloc(L*N*sizeof(float));
    float *z = (float*)malloc(M*N*sizeof(float));
    if (x == NULL || y == NULL || z == NULL)
        return -1;
    // Initialize the data
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < L; ++j) {
            x[i*L + j] = 1.1;
        }
    }
    for (int i = 0; i < L; ++i) {
        for (int j = 0; j < N; ++j) {
            y[i*N + j] = 1.1;
        }
    }
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            z[i*N + j] = 0;
        }
    }
    // Allocate device memory
    float *d_x, *d_y, *d_z;
    cudaMalloc((void**)&d_x, M*L*sizeof(float));
    cudaMalloc((void**)&d_y, L*N*sizeof(float));
    cudaMalloc((void**)&d_z, M*N*sizeof(float));
    // Copy the host data to the device
    cudaMemcpy(d_x, x, M*L*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, L*N*sizeof(float), cudaMemcpyHostToDevice);
    // Define the kernel launch configuration: one block per output row,
    // plus sizeof(float)*L bytes of dynamic shared memory for one row of x
    dim3 blocks(M);
    dim3 threads(1024);
    matrix_mul<<<blocks, threads, sizeof(float)*L>>>(d_x, d_y, d_z, M, N, L);
    // Copy the result from the device back to the host
    cudaMemcpy(z, d_z, M*N*sizeof(float), cudaMemcpyDeviceToHost);
    // Print the first 10 values
    for (int i = 0; i < 10; i++) {
        std::cout << z[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "Done!" << std::endl;
    // Free device memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    // Free host memory
    free(x);
    free(y);
    free(z);
    return 0;
}
Profiling results:
==77269== NVPROF is profiling process 77269, command: ./matrix_mul
1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04 1239.04
Done!
==77269== Profiling application: ./matrix_mul
==77269== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 88.75% 19.976ms 1 19.976ms 19.976ms 19.976ms matrix_mul(float*, float*, float*, int, int, int)
9.33% 2.1003ms 2 1.0502ms 324.61us 1.7757ms [CUDA memcpy HtoD]
1.91% 430.65us 1 430.65us 430.65us 430.65us [CUDA memcpy DtoH]
API calls: 92.37% 333.05ms 3 111.02ms 136.02us 332.75ms cudaMalloc
6.40% 23.063ms 3 7.6878ms 554.56us 20.552ms cudaMemcpy
0.59% 2.1253ms 4 531.34us 383.96us 608.21us cuDeviceTotalMem
0.38% 1.3548ms 404 3.3530us 126ns 159.10us cuDeviceGetAttribute
0.21% 748.13us 3 249.38us 139.12us 407.51us cudaFree
0.04% 140.10us 4 35.025us 29.517us 48.728us cuDeviceGetName
0.02% 59.697us 1 59.697us 59.697us 59.697us cudaLaunchKernel
0.00% 13.002us 4 3.2500us 1.0700us 9.4180us cuDeviceGetPCIBusId
0.00% 4.7540us 8 594ns 138ns 2.6860us cuDeviceGet
0.00% 1.0420us 3 347ns 200ns 617ns cuDeviceGetCount
0.00% 868ns 4 217ns 167ns 273ns cuDeviceGetUuid
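The shared-memory version is only marginally faster here (19.98 ms vs 20.43 ms), likely because all threads of a warp read the same element of x at the same time, so those reads were already well served by the cache. Whichever version you run, it is better to validate the whole result against a CPU reference than to eyeball the first ten values. A minimal host-side check (the verify_result helper is my own, not from the original post; note the full reference costs M*N*L multiply-adds, so it takes a while):

#include <algorithm>
#include <cmath>

// Recompute z on the CPU and check the largest absolute difference.
// A small tolerance is used because float accumulation order may differ.
bool verify_result(const float* x, const float* y, const float* z,
                   int m, int n, int l, float tol = 1e-2f)
{
    float max_err = 0.0f;
    for (int r = 0; r < m; ++r) {
        for (int c = 0; c < n; ++c) {
            float ref = 0.0f;
            for (int i = 0; i < l; i++) {
                ref += x[r*l + i] * y[i*n + c];
            }
            max_err = std::max(max_err, std::fabs(ref - z[r*n + c]));
        }
    }
    return max_err < tol;
}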