|
发表于 2022-1-2 18:25:54
|
显示全部楼层
本帖最后由 guosl 于 2022-1-2 18:33 编辑
本题最快的算法是应用筛法。如果应用朴素的穷举法,在主机(CPU)上运行的时间基本上都在0.1秒左右。但如果用GPU进行编程则我们可以得到眼前一亮的结果。
- /*
- 答案:142913828922
- 耗时:0.014134(GTX970)
- */
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <stdio.h>
- const int D = 2000000;
- const int nBlockDim = 128;
- const int nBlocks = 128;
- __device__ bool isprime(int n)
- {
- if ((n & 1) == 0)
- return false;
- int d = (int)sqrt((double)n);
- for (int i = 3; i <= d; i += 2)
- {
- if ((n % i) == 0)
- return false;
- }
- return true;
- }
- __global__ void findprime(int s, int* r)
- {
- __shared__ int lm[nBlockDim];
- lm[threadIdx.x] = 0;
- int n = s + 2 * (threadIdx.x + blockIdx.x * nBlockDim);
- if (n <= D)
- {
- if (isprime(n))
- lm[threadIdx.x] = n;
- }
- __syncthreads();
- int k = nBlockDim / 2;
- while (k > 0)
- {
- if (threadIdx.x < k)
- lm[threadIdx.x] += lm[threadIdx.x + k];
- __syncthreads();
- k /= 2;
- }
- if (threadIdx.x == 0)
- r[blockIdx.x] = lm[0];
- }
- int main(void)
- {
- int* pM = NULL;
- int* p_dM = NULL;
- int s = 3;
- long long nSum = 2;
- cudaError_t cudaStatus;
- int nPId = 0;
- cudaStatus = cudaSetDevice(nPId);//设置计算卡
- if (cudaStatus != cudaSuccess)
- {
- fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
- goto Error;
- }
- cudaDeviceProp deviceProp;
- cudaGetDeviceProperties(&deviceProp, nPId);
- printf("GPU Device %d: "%s" with compute capability %d.%d\n\n",
- nPId, deviceProp.name, deviceProp.major, deviceProp.minor);
- //在显卡上分配内存
- cudaStatus = cudaSetDevice(0);
- cudaStatus = cudaHostAlloc((void**)&pM, nBlocks * sizeof(int), cudaHostAllocDefault);
- if (cudaStatus != cudaSuccess)
- {
- fprintf(stderr, "CUDA alloc memory failed!");
- goto Error;
- }
- cudaStatus = cudaMalloc((void**)&p_dM, nBlocks * sizeof(int));
- if (cudaStatus != cudaSuccess)
- {
- fprintf(stderr, "cudaMalloc failed!");
- goto Error;
- }
- while (s <= D)
- {
- findprime << <nBlocks, nBlockDim >> > (s, p_dM);
- cudaDeviceSynchronize();
- cudaMemcpy(pM, p_dM, nBlocks * sizeof(int), cudaMemcpyDeviceToHost);
- for (int i = 0; i < nBlocks; ++i)
- nSum += pM[i];
- s += 2 * nBlocks * nBlockDim;
- }
- printf("%lld\n", nSum);
- Error:
- if (p_dM != NULL)
- {
- cudaFree(p_dM);
- p_dM = NULL;
- }
- if (pM != NULL)
- {
- cudaFreeHost(pM);
- pM = NULL;
- }
- return 0;
- }
复制代码 |
|