大数据代码优化求指导
本帖最后由 flywalt 于 2020-9-9 14:14 编辑
import numpy as np
import pandas as pd
import math
import datetime
# Valuation table: after loading, rows are keyed by the 'remain' column.
valdf = pd.read_csv(
    'C:/Users/htlh/Documents/PRIVATE/实验室/定价分析/test.csv'
).set_index('remain', drop=True)

# Quote table: after loading, rows are keyed by the 'code' column.
origindf = pd.read_csv(
    'C:/Users/htlh/Documents/PRIVATE/实验室/定价分析/test3.csv'
).set_index('code', drop=True)

# Empty result frames with the same row/column labels as the quote table:
# zdf receives the quote-minus-valuation differences, remaindf the remaining terms.
zdf = pd.DataFrame(index=origindf.index, columns=origindf.columns)
remaindf = pd.DataFrame(index=origindf.index, columns=origindf.columns)
def get_val(date, remain):  # get valuation
    """Linearly interpolate the valuation for ``remain`` on the ``date`` column.

    ``valdf`` is indexed by ``remain`` (see ``set_index('remain')``); the sample
    table uses a 0.01 step, which is the spacing this formula hard-codes.

    NOTE(review): the ``[row, column]`` subscripts after ``valdf.loc`` were
    missing from the pasted source; they are reconstructed here from how
    ``y1``/``y2`` are used -- confirm against the author's original.

    Args:
        date: Column label of ``valdf`` to read from.
        remain: Remaining term to interpolate at.

    Returns:
        The interpolated value between the two neighbouring grid rows.

    Raises:
        KeyError: If either neighbouring grid point or ``date`` is not a
            label of ``valdf``.
    """
    # Neighbouring grid points: round down / up to two decimals.
    remain1 = math.floor(remain * 100) / 100
    remain2 = math.ceil(remain * 100) / 100
    y1 = valdf.loc[remain1, date]
    y2 = valdf.loc[remain2, date]
    # When remain is exactly on the grid, remain1 == remain2 and this is y1.
    val = (remain - remain1) / 0.01 * (y2 - y1) + y1
    return val
def days(str1, str2, fmt="%Y-%m-%d"):
    """Return the signed time from ``str2`` to ``str1`` in years.

    The whole-day difference ``str1 - str2`` is divided by 365, so the
    result is negative when ``str1`` is earlier than ``str2``.

    Args:
        str1: End date string.
        str2: Start date string.
        fmt: ``strptime`` format shared by both strings. Defaults to the
            original hard-coded ``"%Y-%m-%d"``; pass ``"%Y/%m/%d"`` for
            slash-separated dates such as ``2016/1/4``.

    Returns:
        ``(str1 - str2)`` in days divided by 365, as a float.

    Raises:
        ValueError: If either string does not match ``fmt``.
    """
    date1 = datetime.datetime.strptime(str1, fmt)  # get days(remain)
    date2 = datetime.datetime.strptime(str2, fmt)
    num = (date1 - date2).days / 365
    return num
# Fill remaindf (remaining term) and zdf (quote minus interpolated valuation)
# cell by cell for every non-empty quote in origindf.
#
# NOTE(review): the pasted source had lost its indentation and the `[i, j]`
# subscripts after `.loc`; both are reconstructed here from how the values
# are used -- confirm against the author's original.
count = 0  # progress counter: rows processed so far
total_rows = len(origindf.index)  # loop-invariant, computed once
for i in origindf.index:
    exercise = origindf.loc[i, 'exercise']
    for j in origindf.columns:
        # The 'exercise' column is not a quote date; passing it to days()
        # as a date string would raise ValueError.
        if j == 'exercise':
            continue
        quote = origindf.loc[i, j]
        if pd.isnull(quote):  # skip empty cells
            continue
        remain = days(exercise, j)  # exercise/expiry date minus column date = remaining term
        remaindf.loc[i, j] = remain
        zdf.loc[i, j] = float(quote) - get_val(j, remain)  # difference
    count += 1  # progress counter
    print(count / total_rows)
大致是这样的函数,其中test.csv(即代码中的valdf)的样式如下
remain 2016/1/4 2016/1/5 2016/1/6 2016/1/7 2016/1/8
0 2.2488 2.2518 2.2338 2.2198 2.2118
0.01 2.5134 2.5124 2.4843 2.4578 2.4064
0.02 2.7779 2.7729 2.7349 2.6959 2.6009
0.03 2.7696 2.7661 2.7311 2.6961 2.6011
0.04 2.7613 2.7593 2.7273 2.6963 2.6013
0.05 2.7467 2.7452 2.721 2.6963 2.6014
0.06 2.7322 2.7312 2.7146 2.6963 2.6015
0.07 2.7176 2.7171 2.7083 2.6964 2.6017
0.08 2.703 2.703 2.702 2.6964 2.6018
0.09 2.7034 2.7034 2.7023 2.6966 2.6048
0.1 2.7038 2.7037 2.7025 2.6969 2.6077
0.11 2.7053 2.7049 2.7033 2.6977 2.6154
0.12 2.7067 2.706 2.704 2.6986 2.6231
0.13 2.7082 2.7072 2.7048 2.6994 2.6308
0.14 2.7097 2.7083 2.7055 2.7003 2.6385
test3的样式如下
code exercise 2016/1/4 2016/1/5 2016/1/6 2016/1/7 2016/1/8 2016/1/11 2016/1/12 2016/1/13
167246.SH 2022/7/22 3.5035 3.4179 3.3838
167282.SH 2023/7/21 2.7691 2.7742 2.6709 2.6438
167260.SH 2023/7/22 4.5035 4.4179 4.3838
114784.SZ 2022/7/21 3.343 3.3271 3.2579 3.1995
102001408.IB 2023/7/23 4.3787 4.3097 4.2306
167286.SH 2023/7/23 5.4579 5.3884 5.3303
102001405.IB 2023/7/23 2.8623 2.7976 2.7916 2.7005 2.6806
102001404.IB 2023/7/23 3.4623 3.4249 3.4034 3.3175 3.3039
102001402.IB 2023/7/23 3.7877 3.7167 3.7022 3.6165 3.582
041659001.IB 2017/1/7 5.4727 5.4558 5.3862 5.3287
011699006.IB 2016/10/2 2.9077 2.8419 2.7977 2.7126 2.7
011699002.IB 2016/10/2 3.1123 3.0564 3.0394 2.95 2.9409
011699004.IB 2016/10/1 3.1213 3.0771 3.0114 2.9938 2.9083 2.8907
041662001.IB 2017/1/5 3.618 3.5885 3.5376 3.5223 3.4519 3.3948
011699005.IB 2016/8/22 6.3755 6.3143 6.2502 6.2681 6.1798 6.1379
011699003.IB 2016/8/3 3.3939 3.3295 3.3187 3.2292 3.1852
刚学会pandas,写的这个for循环效率真的让人捉急,完整数据20000*15000,估计能跑10小时。。。查了一下说把for替换成ndarray的运算效率会大幅提升,但是还不知道该怎么写,求指导,谢谢~~~
经过一天的钻研搞明白了一些思路,但是有了个新问题
现在正在用apply,比如我想df.apply(def)
这个def是我定义的包含(x,y)两个参数(或者用lambda表达也行),但是在对df应用apply的时候,x不是对应每一列么,怎么才能引用出来y呢?而且我想要引用的y恰好是这一列的列标题
例如:
exercise 2016-01-04 ... 2016-01-19 2016-01-20
code ...
167246.SH 2.0 8.550685... 8.509589 8.506849
167282.SH 2.0 9.547945... 9.506849 9.504110
167260.SH 2.0 9.550685... 9.509589 9.506849
114784.SZ 2.0 8.547945... 8.506849 8.504110
102001408.IB 2.0 9.553425... 9.512329 9.509589
167286.SH 2.0 9.553425... 9.512329 9.509589
102001405.IB 2.0 9.553425... 9.512329 9.509589
102001404.IB 2.0 9.553425... 9.512329 9.509589
102001402.IB 2.0 9.553425... 9.512329 9.509589
041659001.IB 2.0 3.010959... 2.969863 2.967123
011699006.IB 2.0 2.745205... 2.704110 2.701370
011699002.IB 2.0 2.745205... 2.704110 2.701370
011699004.IB 2.0 2.742466... 2.701370 2.698630
041662001.IB 2.0 3.005479... 2.964384 2.961644
011699005.IB 2.0 2.632877... 2.591781 2.589041
011699003.IB 2.0 2.580822... 2.539726 2.536986
我就想把每一列的数值和对应的日期一起丢进apply里面运算,请问有什么办法实现么?
页:
[1]