题目160:阶乘数字的结尾数字
Factorial trailing digitsFor any N, let f(N) be the last five digits before the trailing zeroes in N!.
For example,
9! = 362880 so f(9)=36288
10! = 3628800 so f(10)=36288
20! = 2432902008176640000 so f(20)=17664
Find f(1,000,000,000,000)
题目:
对任意的 N,令定义 f(N) 为 N! ( N 的阶乘)除末尾的 0 以外的最后 5 个数字,比如
9! = 362880所以f(9)=36288
10! = 3628800所以f(10)=36288
20! = 2432902008176640000所以f(20)=17664
求 f(1,000,000,000,000)
本帖最后由 guosl 于 2022-10-16 11:16 编辑
我们用CUDA进行死算。{:10_266:}
答案:16576
/*
答案:16576
耗时:184.763907s(cmp30hx,最近矿卡非常便宜 )
*/
#include <cstdio>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <helper_cuda.h>
#include <omp.h>
using namespace std;
const long long N = 1000000000000LL;
const int nStep = 46080 * 16; //分段的区间长度
const int nBlocksSize = 128;
const int nBlocks = (nStep + nBlocksSize - 1) / nBlocksSize;
int* result_d = NULL;//存放结果的显卡内存
int* result = NULL;//存放结果的主机内存
int qpow(long long k)
{
long long p;
if (k == 0)
return 1;
if (k == 1)
return 2;
if ((k & 1) == 0)
{
p = qpow(k >> 1);
p *= p;
p %= 100000;
}
else
{
p = qpow(k - 1);
p *= 2;
p %= 100000;
}
return (int)p;
}
__global__ void getpr(long long b, int *result)
{
extern __shared__ long long nVal[];
nVal = 1;
long long k = blockIdx.x * nBlocksSize + threadIdx.x + b;//当前的计算序号
if (k <= N)
{
long long p = k;
while ((p & 1) == 0)
p = p >> 1;
while (p % 5 == 0)
p /= 5;
p %= 100000;
nVal = p;
}
__syncthreads();
//累乘本block内得到的值
for (int k = nBlocksSize >> 1; k > 0; k = k >> 1)
{
if (threadIdx.x < k)
{
nVal *= nVal;
nVal %= 100000;
}
__syncthreads();
}
if (threadIdx.x == 0)
result = (int)nVal;//放入返回的变量中
}
int main(void)
{
long long k = 1, ppr = 1;
long long c2 = 0, c5 = 0, n = N;
double t = omp_get_wtime();
cudaDeviceProp deviceProp;
int devID = 0;
cudaError_t cudaStatus = cudaSetDevice(devID);//设置计算卡
if (cudaStatus != cudaSuccess)
goto end;
cudaStatus = cudaGetDeviceProperties(&deviceProp, devID);
if (cudaStatus != cudaSuccess)
goto end;
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
while (n > 1)
{
c2 += n / 2;
n = n >> 1;
}
n = N;
while (n > 1)
{
c5 += n / 5;
n /= 5;
}
c2 -= c5;
ppr = qpow(c2);
result = new int;
if (result == NULL)
goto end;
cudaStatus = cudaMalloc((void**)&result_d, nBlocks * sizeof(int));
if (cudaStatus != cudaSuccess)
goto end;
while (k <= N)
{
cudaStatus = cudaMemset(result_d, 0, nBlocks * sizeof(int));//将返回的数组清零
if (cudaStatus != cudaSuccess)
goto end;
getpr << <nBlocks, nBlocksSize, nBlocksSize * sizeof(long long) >> > (k, result_d);
cudaError_t cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
goto end;
cudaDeviceSynchronize();//等待计算完成
//将区间和复制回主机内存
cudaStatus = cudaMemcpy(result, result_d, nBlocks * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
goto end;
for (int i = 0; i < nBlocks; ++i)
{
ppr *= (long long)result;
ppr %= 100000;
}
k += nStep;
}
cudaDeviceReset();
t = omp_get_wtime() - t;
printf("%lld\n%lf\n", ppr, t);
end:
if (result != NULL)
delete[] result;
if (result_d != NULL)
cudaFree(result_d);
return 0;
} 本帖最后由 guosl 于 2022-9-28 07:17 编辑
我们再给一种解法:
当n%5==0时有公式:n!=n-!*5^(n/5)*(n/5)!这个可以应用数学归纳法来证明。
其中n-!为普通阶乘中去掉所有5的倍数,例如:10-!= 1*2*3*4*6*7*8*9
由于10^(n/5)只产生一串零,故在计算中舍去。只需计算:n-!/(2^(n/5))*(n/5)!
对于n-!/(2^(n/5))的计算
我们把n个数分成若干段,每段有如下形式
(kM+0)*(kM+1)*...*(kM+4)*(kM+6)*...*(kM+9)*(kM+11)*...*(kM+i)*...*(kM+M-1)
这样的段有n/M段,而每段在%M的作用下都变成了
(1*...*4*6*...*9*11*...*i*...*(M-1))
整个计算变成了
(1*...*4*6*...*9*11*...*i*...*(M-1))^(n/M)
而根据扩展欧拉定理:i^(n/M)=i^(Phi(M)+(n/M)%Phi(M)),其中Phi()为欧拉函数
所以:f(n)=(g(n)*f(n/5))%100000
我们可以根据这个得到下面的递归计算
答案:16576,计算耗时可以忽略不记。{:10_257:}
#include <iostream>
using namespace std;
const long long N = 1000000000000LL;
const int M = 100000, Phi_M = 40000;//Phi_M为M的欧拉函数值
int qpow(int b, int n)//快速幂
{
long long x = b;
if (n == 0)
return 1;
if (n == 1)
return int(x % M);
if ((n & 1) == 0)
{
x = qpow(b, n >> 1);
x *= x;
return int(x % M);
}
else
{
x = qpow(b, n - 1);
x *= b;
return int(x % M);
}
}
int g(long long n)//计算n-!/2^(n/5)的值
{
//下面计算的依据是扩展欧拉定理
long long m1 = n / M;
if (m1 > Phi_M)
m1 = m1 % Phi_M + Phi_M;
long long m2 = (m1 + 1) / 2;
if (m2 > Phi_M)
m2 = m2 % Phi_M + Phi_M;
long long m3 = m1 / 2;
if (m3 > Phi_M)
m3 = m3 % Phi_M + Phi_M;
long long r = 1;
for (int i = 0; i < M; ++i)
{
if (i % 5 == 0)
continue;
//我们将2^(-n/5)分配到乘积中,具体就是个位数是2或4的数除个2
if (i % 10 == 2)
{
r *= qpow(int(i / 2), (int)m2);//处理其中除2的因子,这个要具体计算
r %= M;
r *= qpow(int((M + i) / 2), (int)m3);
}
else if (i % 10 == 4)
{
r *= qpow(int(i / 2), (int)m2);//处理其中除2的因子
r %= M;
r *= qpow(int((M + i) / 2), (int)m3);
}
else
r *= qpow(int(i), (int)m1);
r %= M;
}
return (int)r;
}
int f(long long n)
{
long long r = 1;
if (n % M == 0)
{
r = g(n);
r *= f(n / 5);//递归
r %= M;
return (int)r;
}
//n的值已经减小到可以直接计算了
int c2 = 0;
{
int k = (int)n, c5 = 0;
while (k >= 2)
{
c2 += k / 2;
k /= 2;
}
k = (int)n;
while (k >= 5)
{
c5 += k / 5;
k /= 5;
}
c2 -= c5;
}
r = qpow(2, c2);
for (int i = 1; i <= n; ++i)
{
int k = i;
while ((k & 1) == 0)
k /= 2;
while (k % 5 == 0)
k /= 5;
k %= M;
r *= k;
r %= M;
}
return (int)r;
}
int main(void)
{
cout << f(N) << endl;//调用递归计算
return 0;
}
页:
[1]