|
发表于 2020-5-8 22:12:13
|
显示全部楼层
本帖最后由 guosl 于 2022-10-16 11:16 编辑
我们用CUDA进行死算。
答案:16576
- /*
- 答案:16576
- 耗时:184.763907s(cmp30hx,最近矿卡非常便宜 )
- */
- #include <cstdio>
- #include <cuda_runtime.h>
- #include <device_launch_parameters.h>
- #include <helper_cuda.h>
- #include <omp.h>
- using namespace std;
// Upper end of the range 1..N whose members are multiplied together.
const long long N = 1000000000000LL;
// Length of the segment of consecutive integers handled by one kernel launch.
const int nStep = 16 * 46080;
// Number of threads in each block.
const int nBlocksSize = 128;
// Number of blocks needed to cover one segment (nStep / nBlocksSize, rounded up).
const int nBlocks = (nStep - 1) / nBlocksSize + 1;
// Device-side buffer that receives one partial product per block.
int* result_d = NULL;
// Host-side copy of the per-block partial products.
int* result = NULL;
/*
 * Returns 2^k modulo 100000 by recursive square-and-multiply.
 *
 * k is expected to be non-negative: the recursion only terminates at
 * k == 0 or k == 1.
 */
int qpow(long long k)
{
    const long long kMod = 100000;

    // Base cases: 2^0 and 2^1.
    if (k == 0)
        return 1;
    if (k == 1)
        return 2;

    // Odd exponent: peel off one factor of 2.
    if (k & 1)
        return (int)((2LL * qpow(k - 1)) % kMod);

    // Even exponent: square the half power. The half power is below 100000,
    // so its square fits comfortably in a long long.
    const long long half = qpow(k >> 1);
    return (int)((half * half) % kMod);
}
/*
 * Per-block partial product kernel.
 *
 * Each thread handles the integer idx = b + blockIdx.x * nBlocksSize + threadIdx.x:
 * it divides out every factor of 2 and 5 and reduces the remainder modulo
 * 100000. Threads whose idx exceeds N contribute the neutral value 1.
 * The block then multiplies its values together modulo 100000 in shared
 * memory, and thread 0 stores the block's product in result[blockIdx.x].
 *
 * Launch requirements: 1-D grid, nBlocksSize threads per block (the halving
 * reduction below relies on nBlocksSize being a power of two), and
 * nBlocksSize * sizeof(long long) bytes of dynamic shared memory.
 *
 * b      - first integer of the segment covered by this launch (must be >= 1)
 * result - device array with one int slot per block
 */
__global__ void getpr(long long b, int *result)
{
    extern __shared__ long long nVal[];
    const unsigned int tid = threadIdx.x;
    const long long idx = blockIdx.x * nBlocksSize + tid + b; // integer owned by this thread

    long long v = 1; // neutral element for threads past the end of the range
    if (idx <= N)
    {
        v = idx;
        while (!(v & 1))
            v >>= 1;
        while (!(v % 5))
            v /= 5;
        v %= 100000;
    }
    nVal[tid] = v;
    __syncthreads();

    // Multiply the block's values together: each pass folds the upper half of
    // the active range onto the lower half.
    for (int half = nBlocksSize / 2; half >= 1; half /= 2)
    {
        if (tid < half)
            nVal[tid] = (nVal[tid] * nVal[tid + half]) % 100000;
        __syncthreads(); // reached by every thread, outside the branch
    }

    // Thread 0 publishes the block's product.
    if (tid == 0)
        result[blockIdx.x] = (int)nVal[0];
}
/*
 * Computes 2^(c2 - c5) * prod_{k=1..N} (k with all factors 2 and 5 removed)
 * modulo 100000, i.e. the last five digits of N! after its trailing zeros
 * are removed, and prints the result followed by the elapsed wall-clock time.
 *
 * The range 1..N is processed in segments of nStep integers; each segment is
 * reduced on the GPU to nBlocks partial products which are then folded into
 * the running product on the host.
 *
 * Returns 0 on success and 1 if any CUDA call fails (the error is reported on
 * stderr).
 */
int main(void)
{
    // All locals are declared before the first goto so that no jump to `end`
    // bypasses an initialization.
    long long k = 1, ppr = 1;
    long long c2 = 0, c5 = 0, n = N;
    int exitCode = 1; // set to 0 only after the whole range has been processed
    double t = omp_get_wtime();
    cudaDeviceProp deviceProp;
    int devID = 0;

    cudaError_t cudaStatus = cudaSetDevice(devID); // select the compute device
    if (cudaStatus != cudaSuccess)
        goto end;
    cudaStatus = cudaGetDeviceProperties(&deviceProp, devID);
    if (cudaStatus != cudaSuccess)
        goto end;
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    // Exponent of 2 in N! (Legendre's formula: sum of floor(N / 2^i)).
    while (n > 1)
    {
        c2 += n / 2;
        n = n >> 1;
    }
    // Exponent of 5 in N!, computed the same way.
    n = N;
    while (n > 1)
    {
        c5 += n / 5;
        n /= 5;
    }
    // Every 5 pairs with a 2 to form a trailing zero; the surplus 2s remain.
    c2 -= c5;
    ppr = qpow(c2);

    // Plain new throws on failure instead of returning NULL, so no NULL check.
    result = new int[nBlocks];
    cudaStatus = cudaMalloc((void**)&result_d, nBlocks * sizeof(int));
    if (cudaStatus != cudaSuccess)
        goto end;

    while (k <= N)
    {
        // No cudaMemset is needed: thread 0 of every block overwrites its slot.
        getpr<<<nBlocks, nBlocksSize, nBlocksSize * sizeof(long long)>>>(k, result_d);
        cudaStatus = cudaGetLastError(); // launch-configuration errors
        if (cudaStatus != cudaSuccess)
            goto end;
        cudaStatus = cudaDeviceSynchronize(); // wait for the kernel; surfaces execution errors
        if (cudaStatus != cudaSuccess)
            goto end;
        // Copy the per-block partial products back to host memory.
        cudaStatus = cudaMemcpy(result, result_d, nBlocks * sizeof(int), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess)
            goto end;
        for (int i = 0; i < nBlocks; ++i)
        {
            ppr *= (long long)result[i];
            ppr %= 100000;
        }
        k += nStep;
    }

    t = omp_get_wtime() - t;
    printf("%lld\n%lf\n", ppr, t);
    exitCode = 0;

end:
    if (cudaStatus != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(cudaStatus));
    delete[] result; // delete[] on NULL is a no-op
    result = NULL;
    if (result_d != NULL)
    {
        cudaFree(result_d);
        result_d = NULL;
    }
    // Reset the device only after the device buffer has been released.
    cudaDeviceReset();
    return exitCode;
}
复制代码 |
|