Next, let's implement the same program with CUDA on the GPU, and use this example to get a first feel for how CUDA differs from OpenACC. With OpenACC you essentially just write the ordinary C++ program and then add compiler directives to obtain a GPU implementation. The upside is that it is easy to program; the downside is that for logic-heavy programs OpenACC always feels a bit constraining, because the actual GPU code is generated by the compiler and you never get to see what it really is. Sometimes that "artificial intelligence" turns out to be "artificial stupidity", so the resulting efficiency may not be high. That is when writing the kernel yourself in CUDA scratches exactly the right itch.
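To make the contrast concrete, here is a minimal sketch (not from the original post) of what the directive-based approach looks like for the same search. It assumes an OpenACC-capable compiler such as nvc++ from the NVIDIA HPC SDK, and the names simply mirror the CUDA version below.

#include <cstdio>
#include <cmath>

// Trial-division primality test; "routine seq" tells the OpenACC compiler
// to generate a sequential device version callable from the parallel loop.
#pragma acc routine seq
bool isprime(long long n)
{
    if ((n & 1) == 0) return false;
    long long d = (long long)sqrt((double)n);
    for (long long i = 3; i <= d; i += 2)
        if (n % i == 0) return false;
    return true;
}

int main()
{
    const long long N = 600851475143LL;
    const long long D = 775146LL;        // just above sqrt(N)
    long long nMaxPrime = 2;

    // The compiler maps this loop onto the GPU and combines the
    // per-thread maxima via the reduction clause.
#pragma acc parallel loop reduction(max:nMaxPrime)
    for (long long n = 3; n <= D; n += 2)
    {
        if (N % n == 0)
        {
            long long K = N / n;         // cofactor of n
            if (isprime(K))
                nMaxPrime = (K > nMaxPrime) ? K : nMaxPrime;
            else if (isprime(n))
                nMaxPrime = (n > nMaxPrime) ? n : nMaxPrime;
        }
    }
    printf("%lld\n", nMaxPrime);
    return 0;
}

Everything GPU-specific lives in the two #pragma acc lines; how the loop is mapped onto threads and how the partial maxima are combined is decided by the compiler, which is exactly the part the CUDA version below spells out by hand.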
The code below is written against NVIDIA's CUDA API and was compiled and run on a GTX 970 under Ubuntu 20.04.
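(As a reference point, on such a setup the build command would be something along the lines of nvcc -O2 -arch=sm_52 findprime.cu -o findprime, where findprime.cu is an assumed file name and sm_52 matches the GTX 970's compute capability 5.2.)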
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <stdio.h>
- const long long N = 600851475143LL;
- const long long D = 775146LL;
- const int nBlockDim = 128;
- const int nBlocks = 128;
- __device__ bool isprime(long long n)
- {
- if ((n & 1) == 0)
- return false;
- long long d = (long long)sqrt((double)n);
- for (long long i = 3; i <= d; i += 2)
- {
- if (n % i == 0)
- return false;
- }
- return true;
- }
- __global__ void findprime(long long s, long long* r)
- {
- __shared__ long long lm[nBlocks];
- long long M = 0;
- int n = s + threadIdx.x + blockIdx.x * nBlockDim;
- if (n <= D)
- {
- if (N % n == 0)
- {
- long long K = N / n;
- if (isprime(K))
- M = (K > M) ? K : M;
- else if (isprime(n))
- M = (n > M) ? n : M;
- }
- }
- lm[threadIdx.x] = M;
- __syncthreads();
- n = nBlockDim / 2;
- while (n > 0)
- {
- if (threadIdx.x < n)
- {
- if (lm[threadIdx.x] < lm[threadIdx.x + n])
- lm[threadIdx.x] = lm[threadIdx.x + n];
- }
- n /= 2;
- __syncthreads();
- }
- if (threadIdx.x == 0)
- r[blockIdx.x] = lm[0];
- }
- int main(void)
- {
- long long* pM = NULL;
- long long* p_dM = NULL;
- long long s = 2, nMaxPrime = 2;
- cudaError_t cudaStatus;
- int nPId = 0;
- cudaStatus = cudaSetDevice(nPId);//设置计算卡
- if (cudaStatus != cudaSuccess)
- {
- fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
- goto Error;
- }
- cudaDeviceProp deviceProp;
- cudaStatus = cudaGetDeviceProperties(&deviceProp, nPId);
- if (cudaStatus != cudaSuccess)
- goto Error;
- printf("GPU Device %d: "%s" with compute capability %d.%d\n\n", nPId, deviceProp.name, deviceProp.major, deviceProp.minor);
- cudaStatus = cudaSetDevice(0);
- //在主机上分配锁定内存
- cudaStatus = cudaHostAlloc((void**)&pM, nBlocks * sizeof(long long), cudaHostAllocDefault);
- if (cudaStatus != cudaSuccess)
- {
- fprintf(stderr, "CUDA alloc memory failed!");
- goto Error;
- }
- //在显卡上分配内存
- cudaStatus = cudaMalloc((void**)&p_dM, nBlocks * sizeof(long long));
- if (cudaStatus != cudaSuccess)
- {
- fprintf(stderr, "cudaMalloc failed!");
- goto Error;
- }
- while (s <= D)
- {
- findprime << <nBlocks, nBlockDim >> > (s, p_dM);//进行显卡计算
- cudaStatus = cudaGetLastError();
- if (cudaStatus != cudaSuccess)
- {
- fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
- goto Error;
- }
- cudaStatus = cudaDeviceSynchronize();//等待显卡的计算完成
- if (cudaStatus != cudaSuccess)
- {
- fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
- goto Error;
- }
- cudaStatus = cudaMemcpy(pM, p_dM, nBlocks * sizeof(long long), cudaMemcpyDeviceToHost);//从显卡取回计算的结果
- if (cudaStatus != cudaSuccess)
- {
- fprintf(stderr, "cudaMemcpy failed!");
- goto Error;
- }
- for (int i = 0; i < nBlocks; ++i) //归并结果
- nMaxPrime = (nMaxPrime > pM[i]) ? nMaxPrime : pM[i];
- s += nBlocks * nBlockDim;
- }
- printf("%lld\n", nMaxPrime);
- Error:
- if (p_dM != NULL)
- {
- cudaFree(p_dM);
- p_dM = NULL;
- }
- if (pM != NULL)
- {
- cudaFreeHost(pM);
- pM = NULL;
- }
- return 0;
- }
The run time is 0.001818 seconds and the answer is 6857. In this case, at least, the "artificial intelligence" still loses to the human.