Next we implement this program with CUDA on the GPU, and use the example to get a first feel for how CUDA differs from OpenACC. To write an OpenACC program you essentially just write the original C++ program and then add compiler directives to produce a GPU implementation. The advantage is that this is easy to program; the disadvantage is that for logically complex programs OpenACC always feels a bit constraining, because the GPU code is generated by the compiler and you never see what it actually emits. Sometimes that kind of "artificial intelligence" behaves more like artificial stupidity, so the result may not be efficient. In such cases, writing CUDA directly scratches exactly the right itch.
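For contrast, here is a minimal OpenACC sketch of the same divisor search. This is a reconstruction for illustration only, not the code from the earlier post; the single-loop structure and the use of a max reduction are assumptions. It should compile with an OpenACC compiler such as NVIDIA's nvc++ with the -acc flag.

#include <math.h>
#include <stdio.h>

const long long N = 600851475143LL;
const long long D = 775146LL; // floor(sqrt(N))

#pragma acc routine seq
bool isprime(long long n)
{
    if ((n & 1) == 0)
        return false;
    long long d = (long long)sqrt((double)n);
    for (long long i = 3; i <= d; i += 2)
        if (n % i == 0)
            return false;
    return true;
}

int main(void)
{
    long long nMaxPrime = 2;
    // One directive replaces the explicit kernel, thread indexing and
    // shared-memory reduction of the CUDA version below.
    #pragma acc parallel loop reduction(max:nMaxPrime)
    for (long long n = 2; n <= D; ++n)
    {
        if (N % n == 0)
        {
            long long K = N / n; // cofactor; if prime, it beats any divisor <= D
            if (isprime(K))
                nMaxPrime = (K > nMaxPrime) ? K : nMaxPrime;
            else if (isprime(n))
                nMaxPrime = (n > nMaxPrime) ? n : nMaxPrime;
        }
    }
    printf("%lld\n", nMaxPrime);
    return 0;
}

Those two pragmas are the whole GPU story in OpenACC; that compactness is exactly the convenience, and the opacity, described above.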
The code below is written against NVIDIA's CUDA API and was compiled and run on a GTX 970 GPU under Ubuntu 20.04.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
const long long N = 600851475143LL; // the number to factor
const long long D = 775146LL;       // floor(sqrt(N)): trial divisors only need to go this far
const int nBlockDim = 128;          // threads per block
const int nBlocks = 128;            // blocks per kernel launch
// Trial-division primality test; N is odd, so every value passed in here is odd.
__device__ bool isprime(long long n)
{
    if ((n & 1) == 0)
        return false;
    long long d = (long long)sqrt((double)n);
    for (long long i = 3; i <= d; i += 2)
    {
        if (n % i == 0)
            return false;
    }
    return true;
}
// Each thread tests one candidate divisor n of N; the block then reduces its
// threads' best candidates to a single value in shared memory.
__global__ void findprime(long long s, long long* r)
{
    __shared__ long long lm[nBlockDim]; // one slot per thread
    long long M = 0;
    long long n = s + threadIdx.x + blockIdx.x * nBlockDim;
    if (n <= D)
    {
        if (N % n == 0)
        {
            long long K = N / n; // the cofactor: if it is prime, it beats any divisor <= D
            if (isprime(K))
                M = (K > M) ? K : M;
            else if (isprime(n))
                M = (n > M) ? n : M;
        }
    }
    lm[threadIdx.x] = M;
    __syncthreads();
    // Tree reduction: halve the active range each step (64, 32, ..., 1),
    // keeping the pairwise maximum, so lm[0] ends up as the block's best.
    int m = nBlockDim / 2;
    while (m > 0)
    {
        if (threadIdx.x < m)
        {
            if (lm[threadIdx.x] < lm[threadIdx.x + m])
                lm[threadIdx.x] = lm[threadIdx.x + m];
        }
        m /= 2;
        __syncthreads();
    }
    if (threadIdx.x == 0)
        r[blockIdx.x] = lm[0];
}
int main(void)
{
    long long* pM = NULL;   // host-side (pinned) buffer for per-block results
    long long* p_dM = NULL; // device-side buffer for per-block results
    long long s = 2, nMaxPrime = 2;
    cudaError_t cudaStatus;
    int nPId = 0;
    cudaStatus = cudaSetDevice(nPId); // select the compute device
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    cudaDeviceProp deviceProp;
    cudaStatus = cudaGetDeviceProperties(&deviceProp, nPId);
    if (cudaStatus != cudaSuccess)
        goto Error;
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", nPId, deviceProp.name, deviceProp.major, deviceProp.minor);
    // Allocate pinned (page-locked) memory on the host
    cudaStatus = cudaHostAlloc((void**)&pM, nBlocks * sizeof(long long), cudaHostAllocDefault);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "CUDA alloc memory failed!");
        goto Error;
    }
    // Allocate memory on the device
    cudaStatus = cudaMalloc((void**)&p_dM, nBlocks * sizeof(long long));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    while (s <= D)
    {
        findprime<<<nBlocks, nBlockDim>>>(s, p_dM); // launch the kernel on the GPU
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "findprime launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
        cudaStatus = cudaDeviceSynchronize(); // wait for the GPU to finish
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching findprime!\n", cudaStatus);
            goto Error;
        }
        cudaStatus = cudaMemcpy(pM, p_dM, nBlocks * sizeof(long long), cudaMemcpyDeviceToHost); // copy the per-block results back from the device
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        for (int i = 0; i < nBlocks; ++i) // merge the per-block results on the host
            nMaxPrime = (nMaxPrime > pM[i]) ? nMaxPrime : pM[i];
        s += nBlocks * nBlockDim; // advance to the next batch of 16384 candidates
    }
printf("%lld\n", nMaxPrime);
Error:
if (p_dM != NULL)
{
cudaFree(p_dM);
p_dM = NULL;
}
if (pM != NULL)
{
cudaFreeHost(pM);
pM = NULL;
}
return 0;
}
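The listing is built with NVIDIA's nvcc compiler. A plausible command line for the GTX 970, which has compute capability 5.2 (the source file name here is an assumption):

nvcc -O2 -arch=sm_52 findprime.cu -o findprime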
The running time is 0.001818 seconds, and the answer is 6857. Here, at least, it seems the "artificial intelligence" still cannot beat the human.