奇偶排序使用CUDA加速

tech2024-07-29  64

奇偶排序

可以证明奇偶排序是一种使序列逆序数降为0(全局最优)的迭代算法,当连续一轮奇排列和偶排列完成后,交换的元素个数都为0个,则排序已收敛到全局最优。 考虑到每次奇或偶排列需要LENGTH/2组比较,且在每次排序中,组之间没有依赖关系,于是在序列长度较大时,使用CUDA对其加速成为可能,将每一组比较对作为一个CUDA线程执行,于是奇排序有如下代码:

int curLocation = *tid % *length; if(!(curLocation % 2) && curLocation != *length-1) if(array[curLocation] > array[curLocation+1]) { int temp = array[curLocation]; atomicExch(&array[curLocation], array[curLocation+1]); atomicExch(&array[curLocation+1], temp); atomicAdd(vOdd, 1); }

偶排序有如下代码:

int curLocation = *tid % *length; if(curLocation % 2 == 1 && curLocation != *length-1) if(array[curLocation] > array[curLocation+1]) { int temp = array[curLocation]; atomicExch(&array[curLocation], array[curLocation+1]); atomicExch(&array[curLocation+1], temp); atomicAdd(vEven, 1); }

每个线程执行一组奇、偶排序,考虑到偶排序需要在奇排序完成之后才能执行,因此需要使用线程同步函数,由于GRID中只设置了一个BLOCK,于是线程同步函数就可以保证所有分组比较全部完成,代码如下:

//printf("4\n"); int tId = threadIdx.x + blockIdx.x * blockDim.x; //printf("tID: %d\n", tId); oddSort(array, length, &tId, vOdd); // 等待Block内所有线程完成奇排序 __syncthreads(); evenSort(array, length, &tId, vEven); // 等待Block内所有线程完成偶排序 __syncthreads();

于是整个CUDA并行加速奇偶排序算法如下所示:

#include <cuda.h> #include <cuda_runtime_api.h> #include <cuda_runtime.h> #include <device_launch_parameters.h> #include <iostream> using namespace std; #define LENGTH 1024 __device__ void oddSort(int *array, int *length, int* tid, int *vOdd) { int curLocation = *tid % *length; if(!(curLocation % 2) && curLocation != *length-1) if(array[curLocation] > array[curLocation+1]) { int temp = array[curLocation]; atomicExch(&array[curLocation], array[curLocation+1]); atomicExch(&array[curLocation+1], temp); atomicAdd(vOdd, 1); } } __device__ void evenSort(int *array, int *length, int* tid, int *vEven) { int curLocation = *tid % *length; if(curLocation % 2 == 1 && curLocation != *length-1) if(array[curLocation] > array[curLocation+1]) { int temp = array[curLocation]; atomicExch(&array[curLocation], array[curLocation+1]); atomicExch(&array[curLocation+1], temp); atomicAdd(vEven, 1); } } __global__ void oddEvenSort(int *array, int *length, int *vOdd, int* vEven) { //printf("4\n"); int tId = threadIdx.x + blockIdx.x * blockDim.x; //printf("tID: %d\n", tId); oddSort(array, length, &tId, vOdd); // 等待Block内所有线程完成奇排序 __syncthreads(); evenSort(array, length, &tId, vEven); // 等待Block内所有线程完成偶排序 __syncthreads(); } extern "C" void hostOddEvenSort(int *array, int length, int odd, int even) { int *arrayDev, *lengthDev, *vOdd, *vEven; cudaMalloc(&arrayDev, length*sizeof(int)); cudaMemcpy(arrayDev, array, length*sizeof(int), cudaMemcpyHostToDevice); cudaMalloc(&lengthDev, sizeof(int)); cudaMemcpy(lengthDev, &length, sizeof(int), cudaMemcpyHostToDevice); cudaMalloc(&vOdd, sizeof(int)); cudaMalloc(&vEven, sizeof(int)); int *arrayHost, *vOddHost, *vEvenHost; arrayHost = (int*)malloc(length*sizeof(int)); vOddHost = (int*)malloc(sizeof(int)); vEvenHost = (int*)malloc(sizeof(int)); do { cudaMemcpy(vOdd, &odd, sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(vEven, &even, sizeof(int), cudaMemcpyHostToDevice); //cout<< 2<< endl; oddEvenSort<<<1, LENGTH>>>(arrayDev, lengthDev, vOdd, vEven); //cout<< 3<< endl; cudaMemcpy(vOddHost, vOdd, sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(vEvenHost, vEven, sizeof(int), cudaMemcpyDeviceToHost); cout<< "odd: "<< *vOddHost<< "even: "<< *vEvenHost<< endl; }while(*vOddHost!=0 || *vEvenHost!=0); cudaMemcpy(arrayHost, arrayDev, length*sizeof(int), cudaMemcpyDeviceToHost); for(int i=0; i<length; i++) cout<< arrayHost[i]<< ","; cout<< "."<< endl; } int main() { cout<< "begin: "<< endl; int *array = new int[LENGTH]; // cout<< -1<< endl; // 逆序数输入 for(int i=0; i<LENGTH; i++) array[i] = LENGTH - i; // cout<< 0<< endl; // 奇偶排序后顺序输出 hostOddEvenSort(array, LENGTH, 0, 0); return 0; }
最新回复(0)