/*
本帖最后由 guosl 于 2022-10-16 11:16 编辑
我们用CUDA进行死算。
答案:16576
耗时:184.763907s(cmp30hx,最近矿卡非常便宜)
*/
#include <cstdio>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <helper_cuda.h>
#include <omp.h>
using namespace std;
const long long N = 1000000000000LL;
const int nStep = 46080 * 16; //分段的区间长度
const int nBlocksSize = 128;
const int nBlocks = (nStep + nBlocksSize - 1) / nBlocksSize;
int* result_d = NULL;//存放结果的显卡内存
int* result = NULL;//存放结果的主机内存
int qpow(long long k)
{
long long p;
if (k == 0)
return 1;
if (k == 1)
return 2;
if ((k & 1) == 0)
{
p = qpow(k >> 1);
p *= p;
p %= 100000;
}
else
{
p = qpow(k - 1);
p *= 2;
p %= 100000;
}
return (int)p;
}
__global__ void getpr(long long b, int *result)
{
extern __shared__ long long nVal[];
nVal[threadIdx.x] = 1;
long long k = blockIdx.x * nBlocksSize + threadIdx.x + b;//当前的计算序号
if (k <= N)
{
long long p = k;
while ((p & 1) == 0)
p = p >> 1;
while (p % 5 == 0)
p /= 5;
p %= 100000;
nVal[threadIdx.x] = p;
}
__syncthreads();
//累乘本block内得到的值
for (int k = nBlocksSize >> 1; k > 0; k = k >> 1)
{
if (threadIdx.x < k)
{
nVal[threadIdx.x] *= nVal[threadIdx.x + k];
nVal[threadIdx.x] %= 100000;
}
__syncthreads();
}
if (threadIdx.x == 0)
result[blockIdx.x] = (int)nVal[threadIdx.x];//放入返回的变量中
}
int main(void)
{
long long k = 1, ppr = 1;
long long c2 = 0, c5 = 0, n = N;
double t = omp_get_wtime();
cudaDeviceProp deviceProp;
int devID = 0;
cudaError_t cudaStatus = cudaSetDevice(devID);//设置计算卡
if (cudaStatus != cudaSuccess)
goto end;
cudaStatus = cudaGetDeviceProperties(&deviceProp, devID);
if (cudaStatus != cudaSuccess)
goto end;
printf("GPU Device %d: "%s" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
while (n > 1)
{
c2 += n / 2;
n = n >> 1;
}
n = N;
while (n > 1)
{
c5 += n / 5;
n /= 5;
}
c2 -= c5;
ppr = qpow(c2);
result = new int[nBlocks];
if (result == NULL)
goto end;
cudaStatus = cudaMalloc((void**)&result_d, nBlocks * sizeof(int));
if (cudaStatus != cudaSuccess)
goto end;
while (k <= N)
{
cudaStatus = cudaMemset(result_d, 0, nBlocks * sizeof(int));//将返回的数组清零
if (cudaStatus != cudaSuccess)
goto end;
getpr << <nBlocks, nBlocksSize, nBlocksSize * sizeof(long long) >> > (k, result_d);
cudaError_t cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
goto end;
cudaDeviceSynchronize();//等待计算完成
//将区间和复制回主机内存
cudaStatus = cudaMemcpy(result, result_d, nBlocks * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
goto end;
for (int i = 0; i < nBlocks; ++i)
{
ppr *= (long long)result[i];
ppr %= 100000;
}
k += nStep;
}
cudaDeviceReset();
t = omp_get_wtime() - t;
printf("%lld\n%lf\n", ppr, t);
end:
if (result != NULL)
delete[] result;
if (result_d != NULL)
cudaFree(result_d);
return 0;
}
|