Shared-memory optimization for CUDA merge sort
The previous GPU merge sort program did not use shared memory.
On closer analysis, the first several merge steps read the same local region of the array many times, so loading that data into shared memory should give some speedup.
The whole merge process can be divided into two phases (as shown in the figure below):
- Phase 1: merging small segments
  - done inside a block: the data the block works on is loaded into shared memory, and the threads must synchronize between merge steps
  - the parallelism of each step drops progressively, because fewer and fewer segments remain to be merged, so more and more threads sit idle.
- Phase 2: merging large segments, using parallelism across blocks. Efficiency likewise drops step by step: as the merge proceeds, the number of segments shrinks, so fewer and fewer blocks can do useful work at the same time.
The merge process of merge sort is quite similar to a reduce operation: reduce collapses many numbers into a single number, while merge sort collapses many sorted segments into a single sorted segment.
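To make the analogy concrete, here is a small host-side sketch (purely illustrative, not part of the program below; the array size is made up) of how the number of independent merge tasks halves at each step, just like the number of active lanes in a tree reduction:

#include <cstdint>
#include <cstdio>

int main() {
    // Illustration: schedule of merge steps for an array of n = 2^k elements.
    // At each step the segment size doubles and the number of independent
    // merge tasks (threads in phase 1, blocks in phase 2) halves.
    const int64_t n = 1 << 10;               // 1024 elements, for illustration
    for (int64_t seg = 2; seg <= n; seg <<= 1) {
        int64_t tasks = n / seg;              // independent merges at this step
        std::printf("segment size %5lld -> %5lld parallel merge tasks\n",
                    (long long)seg, (long long)tasks);
    }
    return 0;
}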
Based on the steps above, the code is as follows:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <vector>
#include <memory>
#include <iostream>
#include <algorithm>
#define BIG (1e7)
// #define DEBUG
using namespace std;
template<typename theIterator> void print(theIterator begin, theIterator end);
template<typename T>
__device__ void
mergeVec_half_device(T *A, T *tmp, const int64_t vSize) {
/* split the vector A into two halves
* and merge these two halves together
*
* tmp is a temporary vector to
* receive the merge result
*/
int64_t left = 0;
int64_t right = left + vSize - 1;
int64_t mid = (left + right) / 2;
int64_t i = left, j = mid + 1, k = left; // index of left half, right half, and the mergeVec
while ((i <= mid) && (j <= right)) {
if (A[i] <= A[j]) {
tmp[k] = A[i];
++i; ++k;
} else {
tmp[k] = A[j];
++j; ++k;
}
}
if (i > mid) {
for (; j <= right; ++j, ++k) {
tmp[k] = A[j];
}
} else {
for (; i <= mid; ++i, ++k) {
tmp[k] = A[i];
}
}
/// copy tmp to A
for (k = left; k <= right; ++k) {
A[k] = tmp[k];
}
}
template<typename T>
__global__ void
mergeVec_half_global(T *A, T *tmp, const int64_t vSize) {
/* split the vector A into two halves
* and merge these two halves together
*
* tmp is a temporary vector to
* receive the merge result
*/
int64_t left = blockIdx.x * vSize;
int64_t right = left + vSize - 1;
int64_t mid = (left + right) / 2;
int64_t i = left, j = mid + 1, k = left; // index of left half, right half, and the mergeVec
while ((i <= mid) && (j <= right)) {
if (A[i] <= A[j]) {
tmp[k] = A[i];
++i; ++k;
} else {
tmp[k] = A[j];
++j; ++k;
}
}
if (i > mid) {
for (; j <= right; ++j, ++k) {
tmp[k] = A[j];
}
} else {
for (; i <= mid; ++i, ++k) {
tmp[k] = A[i];
}
}
/// copy tmp to A
for (k = left; k <= right; ++k) {
A[k] = tmp[k];
}
}
template<typename T>
__global__ void
mergeSort_power2n_shared(T *A, T *tmp, const int64_t vSize) {
/*
* sort each block (of size power(2, n)) of the vector in parallel
* each block is sorted hierarchically, synchronizing the threads
* with __syncthreads() at every merge step;
* when the kernel finishes, every block is sorted.
*/
/// copy A and tmp to the shared memory
extern __shared__ T sharedArray[];
T *s_A = &(sharedArray[0]);
T *s_tmp = &(sharedArray[blockDim.x]);
// __shared__ T s_A[blockDim.x], s_tmp[blockDim.x];
s_A[threadIdx.x] = A[blockIdx.x * blockDim.x + threadIdx.x];
s_tmp[threadIdx.x] = tmp[blockIdx.x * blockDim.x + threadIdx.x];
__syncthreads();
/// merge hierarchically
for (int64_t i = 2; i <= blockDim.x; i <<= 1) { // i is the size of vector to be sorted at each step
if (threadIdx.x % i == 0) {
mergeVec_half_device(&(s_A[threadIdx.x]), &(s_tmp[threadIdx.x]), i);
}
__syncthreads();
}
/// data from shared_memory to global memory
A[blockIdx.x * blockDim.x + threadIdx.x] = s_A[threadIdx.x];
tmp[blockIdx.x * blockDim.x + threadIdx.x] = s_tmp[threadIdx.x];
__syncthreads();
}
template<typename theIterator, typename T> void
mergeSort_power2n(theIterator begin, theIterator end, T args) {
/*
sort a vector with size of power(2, n)
*/
clock_t begT, endT;
cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, 0);
T *dataA, *dataTmp;
const int64_t vSize = end - begin;
cudaMalloc((void**)&dataA, sizeof(*begin) * vSize);
cudaMalloc((void**)&dataTmp, sizeof(*begin) * vSize);
#ifdef DEBUG
int64_t n = 0;
if (vSize >= 2) {
for (int64_t i = 1; i < vSize; i <<= 1) {
n += 1;
}
} else {
return;
}
/// check whether n is correct
if (((int64_t)1 << n) > vSize) {
cerr << "\033[31;1m error! vSize != 2 ** n \033[0m";
exit(-1);
}
#endif
begT = clock();
cudaMemcpy(dataA, &(*begin), sizeof(*begin) * vSize, cudaMemcpyHostToDevice);
/// merge hierarchically
int64_t shared_nums = devProp.sharedMemPerBlock / sizeof(*begin);
int64_t blockSize, maxBlockSize; // threads per block
int64_t gridSize = 1; // blocks per grid
/* each thread needs room for 2 elements of shared memory (one in s_A, one in s_tmp),
so the maximum shared memory per block bounds the block size */
maxBlockSize = shared_nums / 2; // shared memory is split between the array and tmp
/* the maximum block size can not exceed maxThreadsPerBlock of device */
maxBlockSize = min(maxBlockSize, (int64_t)devProp.maxThreadsPerBlock);
if (vSize > maxBlockSize) { // vSize very big
for (blockSize = vSize; blockSize > maxBlockSize; ) {
blockSize >>= 1;
}
gridSize = (vSize + blockSize - 1) / blockSize;
} else { // vSize not that big
blockSize = vSize;
gridSize = 1;
}
#ifdef DEBUG
cout << "devProp.sharedMemPerBlock = " << devProp.sharedMemPerBlock << endl;
cout << "blockSize = " << blockSize << ", gridSize = " << gridSize << endl;
cout << "vSize = " << vSize << ", maxBlockSize = " << maxBlockSize << endl;
cout << "sizeof(*begin) * blockSize * 2 = " << sizeof(*begin) * blockSize * 2 << endl;
#endif
/// sort each block respectively
mergeSort_power2n_shared <<<gridSize, blockSize, sizeof(*begin) * blockSize * 2>>> (dataA, dataTmp, vSize);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
cerr << "mergeSort_power2n_shared, CUDA Error: " << cudaGetErrorString(err) << endl;
// Possibly: exit(-1) if program cannot continue....
}
/// then merge all blocks in parallel (phase 2: block-level parallelism)
for (int64_t i = blockSize * 2; i <= vSize; i <<= 1) {
mergeVec_half_global <<<vSize / i, 1>>> (dataA, dataTmp, i);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
cerr << "mergeVec_half_global, CUDA Error: " << cudaGetErrorString(err) << endl;
// Possibly: exit(-1) if program cannot continue....
}
// cudaDeviceSynchronize(); // no need to synchronize here, because kernel2 will wait for kernel1
#ifdef DEBUG
cudaMemcpy(&(*begin), dataA, sizeof(*begin) * vSize, cudaMemcpyDeviceToHost);
cout << "merging Vector, vec = ";
print(begin, end);
#endif
}
/// data from device to host
cudaMemcpy(&(*begin), dataA, sizeof(*begin) * vSize, cudaMemcpyDeviceToHost);
endT = clock();
cout << "inside GPU operation, time = " << endT - begT << endl;
cudaFree(dataA);
cudaFree(dataTmp);
}
template<typename theIterator> inline void
mergeSort_power2n(theIterator begin, theIterator end) {
mergeSort_power2n(begin, end, *begin);
}
template<typename theIterator, typename T> void
mergeVec(
theIterator beg1, theIterator end1,
theIterator beg2, theIterator end2,
T args
) {
/*
* merge two sorted ranges,
* each of arbitrary length
*/
vector<T> tmp((end1 - beg1) + (end2 - beg2));
theIterator i = beg1, j = beg2;
theIterator k = tmp.begin();
while(i != end1 && j != end2) {
if (*i <= *j) {
*k = *i;
++i; ++k;
} else {
*k = *j;
++j; ++k;
}
}
if (i == end1) {
while (j != end2) {
*k = *j;
++j; ++k;
}
} else {
while (i != end1) {
*k = *i;
++i; ++k;
}
}
/// copy tmp to original vectors
k = tmp.begin();
for (i = beg1; i != end1; ++i, ++k) {
*i = *k;
}
for (j = beg2; j != end2; ++j, ++k) {
*j = *k;
}
}
template<typename theIterator> inline void
mergeVec(theIterator beg1, theIterator end1, theIterator beg2, theIterator end2) {
mergeVec(beg1, end1, beg2, end2, *beg1);
}
template<typename vec> void
mergeSort_gpu(vec &A) {
/* can deal with a vector of arbitrary size */
vector<bool> binA;
int64_t vSize = A.size(), n = A.size();
int64_t one = 1;
while (n > 0) {
if (n & one) {
binA.push_back(true);
} else {
binA.push_back(false);
}
n >>= 1;
}
vector<int64_t> idxVec;
idxVec.push_back(0);
for (int64_t i = 0; i != binA.size(); ++i) {
if (binA[i]) {
idxVec.push_back(idxVec.back() + (one << i));
}
}
for (int64_t i = 0; i != idxVec.size() - 1; ++i) {
mergeSort_power2n(A.begin() + idxVec[i], A.begin() + idxVec[i + 1]);
}
/// merge all ranges of vector
for (int64_t i = 1; i != idxVec.size() - 1; ++i) {
mergeVec(
A.begin(), A.begin() + idxVec[i],
A.begin() + idxVec[i], A.begin() + idxVec[i + 1]
);
}
}
template<typename theIterator, typename T> void
mergeSort_cpu(theIterator begin, theIterator end, T args) {
/* cpu version of the merge sort */
if (end - 1 - begin < 1) return;
vector<T> tmp(end - begin, 0);
theIterator left = begin, right = end - 1;
theIterator mid = left + (right - left) / 2;
mergeSort_cpu(begin, mid + 1, args);
mergeSort_cpu(mid + 1, end, args);
theIterator i = begin;
theIterator j = mid + 1;
theIterator k = tmp.begin();
while(i <= mid && j < end) {
if (*i <= *j) {
*k = *i;
++i; ++k;
} else {
*k = *j;
++j; ++k;
}
}
if (i > mid) {
for (; j < end; ++j, ++k) {
*k = *j;
}
} else {
for (; i <= mid; ++i, ++k) {
*k = *i;
}
}
for (i = begin, k = tmp.begin(); i != end; ++i, ++k) {
*i = *k;
}
}
template<typename theIterator> inline void
mergeSort_cpu(theIterator begin, theIterator end) {
mergeSort_cpu(begin, end, *begin);
}
template<typename theIterator> void
print(theIterator begin, theIterator end) {
int64_t showNums = 10;
if (end - begin <= showNums) {
for (theIterator i = begin; i != end; ++i) {
cout << *i << ", ";
} cout << endl;
} else {
for (theIterator i = begin; i != begin + showNums / 2; ++i) {
cout << *i << ", ";
} cout << "......, ";
for (theIterator i = end - showNums / 2; i != end; ++i) {
cout << *i << ", ";
} cout << endl;
}
}
int main() {
clock_t start, end;
// vector<double> A(pow(2, 20) * 16), B(pow(2, 20) * 16);
// vector<double> A(19), B(19);
vector<long long> A(BIG), B(BIG), C(BIG);
for (int64_t i = A.size() - 1; i != -1; --i) {
// A[i] = A.size() - 1 - i;
A[i] = rand();
C[i] = B[i] = A[i];
}
cout << "initially, A = ";
print(A.begin(), A.end());
start = clock(); // begin cuda computation
mergeSort_gpu(A);
end = clock(); // end cuda computation
cout << "using GPU, consuming time = " << (end - start) * 1000. / CLOCKS_PER_SEC << " ms" << endl;
cout << "after sort, A = ";
print(A.begin(), A.end());
/// use cpu to sort
start = clock();
mergeSort_cpu(B.begin(), B.end());
end = clock();
cout << "using CPU, consuming time = " << (end - start) * 1000. / CLOCKS_PER_SEC << " ms" << endl;
cout << "after sort, B = ";
print(B.begin(), B.end());
/// use sort algorithm of stl
start = clock();
stable_sort(C.begin(), C.end());
end = clock();
cout << "using CPU, stl::stable_sort, consuming time = " << (end - start) * 1000. / CLOCKS_PER_SEC << " ms" << endl;
cout << "after sort, C = ";
print(C.begin(), C.end());
/// check whether A equals C
bool equal = true;
for (int64_t i = 0; i != A.size(); ++i) {
if (A[i] != C[i]) {
equal = false;
break;
}
}
if (!equal) {
cerr << "\033[31;1m there is a bug in the program, A != C. \033[0m" << endl;
} else {
cout << "\033[32;1m very good, A == C. \033[0m" << endl;
}
}
Testing shows that the optimization gain is not significant. The main reason is the shared-memory size limit: only about the first 10 merge steps can run inside a block. The later, inter-block merges also differ from a reduce: as the merge sort proceeds, every segment keeps growing, so the amount of data a single thread has to handle keeps growing and can no longer fit in shared memory, which is why only block-level parallelism is used there.
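A quick back-of-the-envelope check of that "about 10 steps" figure, using the same sizing rule as the host code above (the 48 KB of shared memory and 1024 threads per block used here are typical values assumed for illustration, not queried from a real device):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed typical device limits (the real code queries cudaDeviceProp).
    const int64_t sharedMemPerBlock = 48 * 1024;   // bytes, assumption
    const int64_t maxThreadsPerBlock = 1024;       // assumption
    const int64_t elemSize = sizeof(long long);    // 8 bytes, matches main()

    // Same sizing rule as mergeSort_power2n: two shared arrays per block.
    int64_t maxBlockSize = std::min(sharedMemPerBlock / (2 * elemSize),
                                    maxThreadsPerBlock);        // -> 1024

    // Number of in-block (shared-memory) merge steps = log2(blockSize).
    int64_t inBlockSteps = 0;
    for (int64_t s = 2; s <= maxBlockSize; s <<= 1) ++inBlockSteps;  // -> 10

    // The largest power-of-two segment of a 1e7-element array has 2^23 elements,
    // so roughly 23 - 10 = 13 merge steps remain in the global-memory phase.
    std::printf("maxBlockSize = %lld, in-block merge steps = %lld\n",
                (long long)maxBlockSize, (long long)inBlockSteps);
    return 0;
}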
At its root, merge sort has rather low efficiency (= speedup / number of cores) as a parallel algorithm. To get a sort that is much faster than the STL, one probably needs bitonic sort; I'll leave that as a placeholder to fill in later. I recently tried a GPU implementation of bitonic sort and found that sorting an array of 100 million elements can be tens of times faster than the STL sort (although bitonic sort's total work complexity is higher, N * log N * log N, its parallelism is extremely high, and the speedup can approach p, where p is the number of cores, so in practice it is much faster than other comparison-based sorts). I have been short on time lately and have not had time to blog; I plan to clean up my bitonic sort notes and code and post them later.
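For reference, the textbook complexity figures for a bitonic sorting network (quoted here from the standard analysis, not derived from the code in this post) are

$$ \text{work} = \Theta(N \log^2 N), \qquad \text{depth} = \Theta(\log^2 N), $$

so with $p$ processors the running time is roughly $\Theta\!\big(\tfrac{N \log^2 N}{p} + \log^2 N\big)$. For $N = 10^8$, $\log_2 N \approx 27$: the extra $\log N$ factor over merge sort's $\Theta(N \log N)$ work is modest, while every step of the network exposes $N/2$-way parallelism.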