【6.1】blast结果提取

blast outfmt格式为5得到的比对结果的信息是非常全面的,我们需要从这个**.xml文件中提取需要的信息。今天用python提取了一下。

blast_extract.pl代码如下,下载地址:

https://github.com/tiehan/python

import os
os.chdir('E:\\py\\nr提取') #更改文件所在的位置
import re

num=[]
num1=[]
num2=[]
num3=[]
num4=[]
num5=[]
num6=[]
nn=0
#syn1.xml为blast出来outfmt为5的结果,所以有两个地方要改
if os.path.exists("syn1.xml"):
 data=open("syn1.xml")
 out=open('nr_syn.txt','w')
 print("num\tquery\thit\tannoation\tscore\tHsp_identity\tHsp_align-len\tidentity_bili",file=out)
 for each_line in data:
 each_line=each_line.strip()
 if '<Iteration_query-def>' in each_line:
 m=re.search(r"<Iteration_query-def>(.*)</Iteration_query-def>",each_line)
 print(m.group(1),file=out)
 elif '<Hit_num>' in each_line:
 m=re.search(r"<Hit_num>(.*)</Hit_num>",each_line)
 print('\t'+m.group(1),end='\t',file=out)
 elif '<Hit_id>' in each_line:
 m=re.search(r"<Hit_id>(.*)</Hit_id>",each_line)
 print(m.group(1),end='\t',file=out)
 elif '<Hit_def>' in each_line:
 m=re.search(r"<Hit_def>(.*)</Hit_def>",each_line)
 print(m.group(1),end='\t',file=out)
 elif 'Hsp_bit-score>' in each_line:
 m=re.search(r"<Hsp_bit-score>(.*)</Hsp_bit-score>",each_line)
 print(m.group(1),end='\t',file=out)
 elif '<Hsp_identity>' in each_line:
 m=re.search(r"<Hsp_identity>(.*)</Hsp_identity>",each_line)
 num5.append(m.group(1))
 print(m.group(1),end='\t',file=out)
 elif '<Hsp_align-len>' in each_line:
 m=re.search(r"<Hsp_align-len>(.*)</Hsp_align-len>",each_line)
 num6.append(m.group(1))
 print(m.group(1),end='\t',file=out)
 numm= "%.0f"% (int(num5[nn])/int(num6[nn])*100)
 print(numm,file=out)
 nn=nn+1
 data.close()
 out.close()
else:
 print('The data file is missing')

print(len(num))
print(len(num1))
print(len(num2))
print(len(num3))
print(len(num4))
print(len(num5))
print(len(num6))
药企,独角兽,苏州。团队长期招人,感兴趣的都可以发邮件聊聊:tiehan@sina.cn
个人公众号,比较懒,很少更新,可以在上面提问题,如果回复不及时,可发邮件给我: tiehan@sina.cn