Shared-memory optimization for CUDA merge sort
The previous GPU merge sort program did not use shared memory.
On closer analysis, the first several merge steps read the same local region of the array many times, so loading that data into shared memory should give some speedup.
The whole merge process can be divided into two phases (as shown in the figure below):
- Phase 1: merging small segments
  - done inside a block: the data the block works on is loaded into shared memory, and the threads must synchronize between merge steps
  - the parallelism of each step drops progressively, because fewer and fewer segments remain to be merged, so more and more threads sit idle.
- Phase 2: merging large segments, using parallelism across blocks. Efficiency likewise drops step by step: as the merge proceeds, the number of segments shrinks, so fewer and fewer blocks can do useful work at the same time.
The merge process of merge sort is quite similar to a reduce operation: reduce collapses many numbers into a single number, while merge sort collapses many sorted segments into a single sorted segment.
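To make the analogy concrete, here is a small host-side sketch (purely illustrative, not part of the program below; the array size is made up) of how the number of independent merge tasks halves at each step, just like the number of active lanes in a tree reduction:

#include <cstdint>
#include <cstdio>

int main() {
    // Illustration: schedule of merge steps for an array of n = 2^k elements.
    // At each step the segment size doubles and the number of independent
    // merge tasks (threads in phase 1, blocks in phase 2) halves.
    const int64_t n = 1 << 10;               // 1024 elements, for illustration
    for (int64_t seg = 2; seg <= n; seg <<= 1) {
        int64_t tasks = n / seg;              // independent merges at this step
        std::printf("segment size %5lld -> %5lld parallel merge tasks\n",
                    (long long)seg, (long long)tasks);
    }
    return 0;
}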
Based on the steps above, the code is as follows:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <vector>
#include <memory>
#include <iostream>
#include <algorithm>
#define BIG (1e7)
// #define DEBUG
using namespace std;
template<typename theIterator> void print(theIterator begin, theIterator end);
template<typename T>
__device__ void
mergeVec_half_device(T *A, T *tmp, const int64_t vSize) {
/* split the vector A into two halves
* and merge these two halves together
*
* tmp is a temporary vector to
* receive the merge result
*/
int64_t left = 0;
int64_t right = left + vSize - 1;
int64_t mid = (left + right) / 2;
int64_t i = left, j = mid + 1, k = left; // index of left half, right half, and the mergeVec
while ((i <= mid) && (j <= right)) {
if (A[i] <= A[j]) {
tmp[k] = A[i];
++i; ++k;
} else {
tmp[k] = A[j];
++j; ++k;
}
}
if (i > mid) {
for (; j <= right; ++j, ++k) {
tmp[k] = A[j];
}
} else {
for (; i <= mid; ++i, ++k) {
tmp[k] = A[i];
}
}
/// copy tmp to A
for (k = left; k <= right; ++k) {
A[k] = tmp[k];
}
}
template<typename T>
__global__ void
mergeVec_half_global(T *A, T *tmp, const int64_t vSize) {
/* split the vector A into two halves
* and merge these two halves together
*
* tmp is a temporary vector to
* receive the merge result
*/
int64_t left = blockIdx.x * vSize;
int64_t right = left + vSize - 1;
int64_t mid = (left + right) / 2;
int64_t i = left, j = mid + 1, k = left; // index of left half, right half, and the mergeVec
while ((i <= mid) && (j <= right)) {
if (A[i] <= A[j]) {
tmp[k] = A[i];
++i; ++k;
} else {
tmp[k] = A[j];
++j; ++k;
}
}
if (i > mid) {
for (; j <= right; ++j, ++k) {
tmp[k] = A[j];
}
} else {
for (; i <= mid; ++i, ++k) {
tmp[k] = A[i];
}
}
/// copy tmp to A
for (k = left; k <= right; ++k) {
A[k] = tmp[k];
}
}
template<typename T>
__global__ void
mergeSort_power2n_shared(T *A, T *tmp, const int64_t vSize) {
/*
* sort each block (of size power(2, n)) of the vector in parallel
* each block is sorted hierarchically, synchronizing the threads
* with __syncthreads() at every merge step;
* when the kernel finishes, every block is sorted.
*/
/// copy A and tmp to the shared memory
extern __shared__ T sharedArray[];
T *s_A = &(sharedArray[0]);
T *s_tmp = &(sharedArray[blockDim.x]);
// __shared__ T s_A[blockDim.x], s_tmp[blockDim.x];
s_A[threadIdx.x] = A[blockIdx.x * blockDim.x + threadIdx.x];
s_tmp[threadIdx.x] = tmp[blockIdx.x * blockDim.x + threadIdx.x];
__syncthreads();
/// merge hierarchically
for (int64_t i = 2; i <= blockDim.x; i <<= 1) { // i is the size of vector to be sorted at each step
if (threadIdx.x % i == 0) {
mergeVec_half_device(&(s_A[threadIdx.x]), &(s_tmp[threadIdx.x]), i);
}
__syncthreads();
}
/// data from shared_memory to global memory
A[blockIdx.x * blockDim.x + threadIdx.x] = s_A[threadIdx.x];
tmp[blockIdx.x * blockDim.x + threadIdx.x] = s_tmp[threadIdx.x];
__syncthreads();
}
template<typename theIterator, typename T> void
mergeSort_power2n(theIterator begin, theIterator end, T args) {
/*
sort a vector with size of power(2, n)
*/
clock_t begT, endT;
cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, 0);
T *dataA, *dataTmp;
const int64_t vSize = end - begin;
cudaMalloc((void**)&dataA, sizeof(*begin) * vSize);
cudaMalloc((void**)&dataTmp, sizeof(*begin) * vSize);
#ifdef DEBUG
int64_t n = 0;
if (vSize >= 2) {
for (int64_t i = 1; i < vSize; i <<= 1) {
n += 1;
}
} else {
return;
}
/// check whether n is correct
if (((int64_t)1 << n) > vSize) {
cerr << "\033[31;1m error! vSize != 2 ** n \033[0m";
exit(-1);
}
#endif
begT = clock();
cudaMemcpy(dataA, &(*begin), sizeof(*begin) * vSize, cudaMemcpyHostToDevice);
/// merge hierarchically
int64_t shared_nums = devProp.sharedMemPerBlock / sizeof(*begin);
int64_t blockSize, maxBlockSize; // threads per block
int64_t gridSize = 1; // blocks per grid
/* each thread needs room for 2 elements of shared memory (one in s_A, one in s_tmp),
so the maximum shared memory per block bounds the block size */
maxBlockSize = shared_nums / 2; // shared memory is split between the array and tmp
/* the maximum block size can not exceed maxThreadsPerBlock of device */
maxBlockSize = min(maxBlockSize, (int64_t)devProp.maxThreadsPerBlock);
if (vSize > maxBlockSize) { // vSize very big
for (blockSize = vSize; blockSize > maxBlockSize; ) {
blockSize >>= 1;
}
gridSize = (vSize + blockSize - 1) / blockSize;
} else { // vSize not that big
blockSize = vSize;
gridSize = 1;
}
#ifdef DEBUG
cout << "devProp.sharedMemPerBlock = " << devProp.sharedMemPerBlock << endl;
cout << "blockSize = " << blockSize << ", gridSize = " << gridSize << endl;
cout << "vSize = " << vSize << ", maxBlockSize = " << maxBlockSize << endl;
cout << "sizeof(*begin) * blockSize * 2 = " << sizeof(*begin) * blockSize * 2 << endl;
#endif
/// sort each block respectively
mergeSort_power2n_shared <<<gridSize, blockSize, sizeof(*begin) * blockSize * 2>>> (dataA, dataTmp, vSize);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
cerr << "mergeSort_power2n_shared, CUDA Error: " << cudaGetErrorString(err) << endl;
// Possibly: exit(-1) if program cannot continue....
}
/// then merge all blocks in parallel (phase 2: block-level parallelism)
for (int64_t i = blockSize * 2; i <= vSize; i <<= 1) {
mergeVec_half_global <<<vSize / i, 1>>> (dataA, dataTmp, i);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
cerr << "mergeVec_half_global, CUDA Error: " << cudaGetErrorString(err) << endl;
// Possibly: exit(-1) if program cannot continue....
}
// cudaDeviceSynchronize(); // no need to synchronize here, because kernel2 will wait for kernel1
#ifdef DEBUG
cudaMemcpy(&(*begin), dataA, sizeof(*begin) * vSize, cudaMemcpyDeviceToHost);
cout << "merging Vector, vec = ";
print(begin, end);
#endif
}
/// data from device to host
cudaMemcpy(&(*begin), dataA, sizeof(*begin) * vSize, cudaMemcpyDeviceToHost);
endT = clock();
cout << "inside GPU operation, time = " << endT - begT << endl;
cudaFree(dataA);
cudaFree(dataTmp);
}
template<typename theIterator> inline void
mergeSort_power2n(theIterator begin, theIterator end) {
mergeSort_power2n(begin, end, *begin);
}
template<typename theIterator, typename T> void
mergeVec(
theIterator beg1, theIterator end1,
theIterator beg2, theIterator end2,
T args
) {
/*
* merge two sorted ranges,
* each of arbitrary length
*/
vector<T> tmp((end1 - beg1) + (end2 - beg2));
theIterator i = beg1, j = beg2;
theIterator k = tmp.begin();
while(i != end1 && j != end2) {
if (*i <= *j) {
*k = *i;
++i; ++k;
} else {
*k = *j;
++j; ++k;
}
}
if (i == end1) {
while (j != end2) {
*k = *j;
++j; ++k;
}
} else {
while (i != end1) {
*k = *i;
++i; ++k;
}
}
/// copy tmp to original vectors
k = tmp.begin();
for (i = beg1; i != end1; ++i, ++k) {
*i = *k;
}
for (j = beg2; j != end2; ++j, ++k) {
*j = *k;
}
}
template<typename theIterator> inline void
mergeVec(theIterator beg1, theIterator end1, theIterator beg2, theIterator end2) {
mergeVec(beg1, end1, beg2, end2, *beg1);
}
template<typename vec> void
mergeSort_gpu(vec &A) {
/* can deal with a vector of arbitrary size */
vector<bool> binA;
int64_t vSize = A.size(), n = A.size();
int64_t one = 1;
while (n > 0) {
if (n & one) {
binA.push_back(true);
} else {
binA.push_back(false);
}
n >>= 1;
}
vector<int64_t> idxVec;
idxVec.push_back(0);
for (int64_t i = 0; i != binA.size(); ++i) {
if (binA[i]) {
idxVec.push_back(idxVec.back() + (one << i));
}
}
for (int64_t i = 0; i != idxVec.size() - 1; ++i) {
mergeSort_power2n(A.begin() + idxVec[i], A.begin() + idxVec[i + 1]);
}
/// merge all ranges of vector
for (int64_t i = 1; i != idxVec.size() - 1; ++i) {
mergeVec(
A.begin(), A.begin() + idxVec[i],
A.begin() + idxVec[i], A.begin() + idxVec[i + 1]
);
}
}
template<typename theIterator, typename T> void
mergeSort_cpu(theIterator begin, theIterator end, T args) {
/* cpu version of the merge sort */
if (end - 1 - begin < 1) return;
vector<T> tmp(end - begin, 0);
theIterator left = begin, right = end - 1;
theIterator mid = left + (right - left) / 2;
mergeSort_cpu(begin, mid + 1, args);
mergeSort_cpu(mid + 1, end, args);
theIterator i = begin;
theIterator j = mid + 1;
theIterator k = tmp.begin();
while(i <= mid && j < end) {
if (*i <= *j) {
*k = *i;
++i; ++k;
} else {
*k = *j;
++j; ++k;
}
}
if (i > mid) {
for (; j < end; ++j, ++k) {
*k = *j;
}
} else {
for (; i <= mid; ++i, ++k) {
*k = *i;
}
}
for (i = begin, k = tmp.begin(); i != end; ++i, ++k) {
*i = *k;
}
}
template<typename theIterator> inline void
mergeSort_cpu(theIterator begin, theIterator end) {
mergeSort_cpu(begin, end, *begin);
}
template<typename theIterator> void
print(theIterator begin, theIterator end) {
int64_t showNums = 10;
if (end - begin <= showNums) {
for (theIterator i = begin; i != end; ++i) {
cout << *i << ", ";
} cout << endl;
} else {
for (theIterator i = begin; i != begin + showNums / 2; ++i) {
cout << *i << ", ";
} cout << "......, ";
for (theIterator i = end - showNums / 2; i != end; ++i) {
cout << *i << ", ";
} cout << endl;
}
}
int main() {
clock_t start, end;
// vector<double> A(pow(2, 20) * 16), B(pow(2, 20) * 16);
// vector<double> A(19), B(19);
vector<long long> A(BIG), B(BIG), C(BIG);
for (int64_t i = A.size() - 1; i != -1; --i) {
// A[i] = A.size() - 1 - i;
A[i] = rand();
C[i] = B[i] = A[i];
}
cout << "initially, A = ";
print(A.begin(), A.end());
start = clock(); // begin cuda computation
mergeSort_gpu(A);
end = clock(); // end cuda computation
cout << "using GPU, consuming time = " << (end - start) * 1000. / CLOCKS_PER_SEC << " ms" << endl;
cout << "after sort, A = ";
print(A.begin(), A.end());
/// use cpu to sort
start = clock();
mergeSort_cpu(B.begin(), B.end());
end = clock();
cout << "using CPU, consuming time = " << (end - start) * 1000. / CLOCKS_PER_SEC << " ms" << endl;
cout << "after sort, B = ";
print(B.begin(), B.end());
/// use sort algorithm of stl
start = clock();
stable_sort(C.begin(), C.end());
end = clock();
cout << "using CPU, stl::stable_sort, consuming time = " << (end - start) * 1000. / CLOCKS_PER_SEC << " ms" << endl;
cout << "after sort, C = ";
print(C.begin(), C.end());
/// check whether A equals C
bool equal = true;
for (int64_t i = 0; i != A.size(); ++i) {
if (A[i] != C[i]) {
equal = false;
break;
}
}
if (!equal) {
cerr << "\033[31;1m there is a bug in the program, A != C. \033[0m" << endl;
} else {
cout << "\033[32;1m very good, A == C. \033[0m" << endl;
}
}
Testing shows that the optimization gain is not significant. The main reason is the shared-memory size limit: only about the first 10 merge steps can run inside a block. The later, inter-block merges also differ from a reduce: as the merge sort proceeds, every segment keeps growing, so the amount of data a single thread has to handle keeps growing and can no longer fit in shared memory, which is why only block-level parallelism is used there.
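A quick back-of-the-envelope check of that "about 10 steps" figure, using the same sizing rule as the host code above (the 48 KB of shared memory and 1024 threads per block used here are typical values assumed for illustration, not queried from a real device):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed typical device limits (the real code queries cudaDeviceProp).
    const int64_t sharedMemPerBlock = 48 * 1024;   // bytes, assumption
    const int64_t maxThreadsPerBlock = 1024;       // assumption
    const int64_t elemSize = sizeof(long long);    // 8 bytes, matches main()

    // Same sizing rule as mergeSort_power2n: two shared arrays per block.
    int64_t maxBlockSize = std::min(sharedMemPerBlock / (2 * elemSize),
                                    maxThreadsPerBlock);        // -> 1024

    // Number of in-block (shared-memory) merge steps = log2(blockSize).
    int64_t inBlockSteps = 0;
    for (int64_t s = 2; s <= maxBlockSize; s <<= 1) ++inBlockSteps;  // -> 10

    // The largest power-of-two segment of a 1e7-element array has 2^23 elements,
    // so roughly 23 - 10 = 13 merge steps remain in the global-memory phase.
    std::printf("maxBlockSize = %lld, in-block merge steps = %lld\n",
                (long long)maxBlockSize, (long long)inBlockSteps);
    return 0;
}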
At its root, merge sort has rather low efficiency (= speedup / number of cores) as a parallel algorithm. To get a sort that is much faster than the STL, one probably needs bitonic sort; I'll leave that as a placeholder to fill in later. I recently tried a GPU implementation of bitonic sort and found that sorting an array of 100 million elements can be tens of times faster than the STL sort (although bitonic sort's total work complexity is higher, N * log N * log N, its parallelism is extremely high, and the speedup can approach p, where p is the number of cores, so in practice it is much faster than other comparison-based sorts). I have been short on time lately and have not had time to blog; I plan to clean up my bitonic sort notes and code and post them later.
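For reference, the textbook complexity figures for a bitonic sorting network (quoted here from the standard analysis, not derived from the code in this post) are

$$ \text{work} = \Theta(N \log^2 N), \qquad \text{depth} = \Theta(\log^2 N), $$

so with $p$ processors the running time is roughly $\Theta\!\big(\tfrac{N \log^2 N}{p} + \log^2 N\big)$. For $N = 10^8$, $\log_2 N \approx 27$: the extra $\log N$ factor over merge sort's $\Theta(N \log N)$ work is modest, while every step of the network exposes $N/2$-way parallelism.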