|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import re
import os
import math
# Module-level accumulators: get_segments() appends to these lists and returns
# them, so repeated calls keep adding to the same two collections.
spli_segments = []
nonspli_segments = []

_SEGMENT_LENGTH = 9      # width of every extracted window
_UPSTREAM = 3            # bases kept in front of an annotated position
_NONSPLICE_STEP = 1000   # stride between sampled background windows
_POSITION_RE = re.compile(r'.*?\.+(\d+)')
_NON_BASE_RE = re.compile(r'[^atcg]')


def _is_clean_window(segment):
    """Return True when *segment* is a full-width window of only a/t/c/g."""
    return (len(segment) == _SEGMENT_LENGTH
            and _NON_BASE_RE.search(segment) is None)


def get_segments(Trainingset_path):
    """Collect 9-base windows from every regular file in a directory.

    Each file is read as: line 1 ignored, line 2 scanned for the numbers that
    follow a run of dots (e.g. the ``12`` in ``1..12``), and all remaining
    lines stripped and concatenated into one sequence string.

    For every number ``x`` found on line 2 the window ``sequence[x-3:x+6]`` is
    appended to the module-level ``spli_segments`` list.  Background windows
    are sampled from the same sequence every 1000 characters and appended to
    ``nonspli_segments`` unless they coincide with one of those windows.

    Only windows that are exactly 9 characters long and made solely of the
    lowercase letters ``a``, ``t``, ``c``, ``g`` are kept.

    Args:
        Trainingset_path: directory holding the input files.

    Returns:
        The tuple ``(spli_segments, nonspli_segments)`` -- the same
        module-level lists that were appended to.
    """
    # Sorted so the order of the collected windows is reproducible;
    # os.listdir() returns entries in arbitrary order.
    for file_name in sorted(os.listdir(Trainingset_path)):
        file_path = os.path.join(Trainingset_path, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be opened as sequence files.
            continue

        # Reset per file so a short file neither raises NameError nor reuses
        # the positions parsed from the previous file.
        positions = []
        chunks = []
        line_count = 0
        with open(file_path, 'r') as handle:
            for line_count, line in enumerate(handle, 1):
                if line_count == 1:
                    continue
                if line_count == 2:
                    # Convert to int once: the values are used both as slice
                    # bounds and for the membership test further down.
                    positions = [int(p) for p in _POSITION_RE.findall(line)]
                else:
                    chunks.append(line.strip())
        sequence = ''.join(chunks)

        for position in positions:
            start = position - _UPSTREAM
            if start < 0:
                # A negative start would wrap around to the end of the string.
                continue
            segment = sequence[start:start + _SEGMENT_LENGTH]
            # The length check rejects windows truncated by the end of the
            # sequence; those used to slip through and later caused
            # "IndexError: string index out of range" during encoding.
            if _is_clean_window(segment):
                spli_segments.append(segment)

        annotated = set(positions)
        # NOTE: the scan starts at an offset equal to the number of lines
        # read, exactly as the original did by reusing its line counter.
        offset = line_count
        while offset + _SEGMENT_LENGTH < len(sequence):
            segment = sequence[offset:offset + _SEGMENT_LENGTH]
            # offset + _UPSTREAM is the position whose window would start
            # here, so this test skips windows already collected above.
            if _is_clean_window(segment) and offset + _UPSTREAM not in annotated:
                nonspli_segments.append(segment)
            offset += _NONSPLICE_STEP
    return spli_segments, nonspli_segments
# Collect the windows, report how many of each class were found, then write
# them one-hot encoded to the file 'trainingdata' in the current directory.
spli_segments, nonspli_segments = get_segments('/home/pengt/project1/Training Set')
# Parenthesised single-argument form prints the same on Python 2 and 3.
print(len(spli_segments))
print(len(nonspli_segments))

# One-hot code for each base; 9 bases * 4 values = 36 features per sample.
coding_dic = {'a': [0, 0, 0, 1], 't': [0, 0, 1, 0], 'c': [0, 1, 0, 0], 'g': [1, 0, 0, 0]}
index = range(1, 37)
_BASES_PER_SEGMENT = 9


def _format_sample(label, segment):
    """Return one output line: ``<label> 1:v1 2:v2 ... 36:v36`` plus newline.

    The ``label index:value`` layout looks like the sparse text format read by
    LIBSVM-style tools.
    """
    coding_value = []
    for base in segment:
        coding_value += coding_dic[base]
    features = ' '.join(
        str(feature_index) + ':' + str(value)
        for feature_index, value in zip(index, coding_value)
    )
    return label + ' ' + features + '\n'


with open('trainingdata', 'w') as f:
    # Label '1' marks the windows from spli_segments, '-1' the background.
    for label, segments in (('1', spli_segments), ('-1', nonspli_segments)):
        for segment in segments:
            if len(segment) != _BASES_PER_SEGMENT:
                # A shorter window cannot fill all 36 features; indexing it
                # by position is what raised
                # "IndexError: string index out of range".
                continue
            # The original negative-class loop read an undefined name
            # `value` here; both classes now share one encoder.
            f.write(_format_sample(label, segment))
# No explicit f.close(): the with-statement has already closed the file.
IndexError: string index out of range
|
|