pdb文件是蛋白质结构存储的标准文件,用txt打开其主要内容如下所示
上图部分是model1下的蛋白原子分布,序号1所在列是原子序号,序号2所在列是氨基酸种类,序号3所在列是氨基酸序号,序号4所在三列是原子的三维坐标,序号5所在列是原子种类
用python实现提取序列:
from Bio import PDB
'''本文件用于提取单个pdb文件的氨基酸'''
parser = PDB.PDBParser()
io = PDB.PDBIO()
fileName_save = 'HHH_b1_00224.fasta'
nameOfFile_read = 'HHH_b1_00224.pdb'
'''下面将氨基酸全称改为简写'''
longToShort = {'GLY': 'G', 'ALA': 'A', 'VAL': 'V', 'LEU': 'L', 'ILE': 'I', 'PHE': 'F', 'TRP': 'W', 'TYR': 'Y','ASP': 'D' ,'HIS': 'H', 'ASN': 'N', 'GLU': 'E', 'LYS': 'K', 'GLN': 'Q', 'MET': 'M', 'ARG': 'R', 'SER': 'S', 'THR': 'T', 'CYS': 'C', 'PRO': 'P', 'SEC': 'U', 'PYL': 'O'}
struct = parser.get_structure('aaaa', nameOfFile_read) '''此处的aaaa是结构名,无实际意义'''
with open(fileName_save, 'a+') as f_w:
f_w.write('>sequence' + str(i+1) + '\n')
for model in struct: #遍历所有model
for chain in model: #遍历model中的所有chain
for residue in chain: #遍历chain中的所有残基
resname = residue.get_resname() #读取氨基酸
f_w.write(longToShort[resname])
若要读取原子坐标,只需在 for residue in chain下增加一个遍历所有原子即可,下述代码实现读取骨架原子坐标
for model in struct:
for chain in model:
for residue in chain:
for atom in residue:
if(num < 3):
x,y,z = atom.get_coord()
with open(filename, 'a+') as file_object:
#file_object.write('NaN' + ", " + 'NaN' + ", " + 'NaN' + "], [")
file_object.write(str(x) + ", " + str(y) + ", " + str(z) + "], [")
num = num + 1
elif(num == 3):
x, y, z = atom.get_coord()
with open(filename, 'a+') as file_object:
#file_object.write('NaN' + ", " + 'NaN' + ", " + 'NaN' + "]]")
file_object.write(str(x) + ", " + str(y) + ", " + str(z) + "]]")
num = num + 1