|
import argparse |
|
|
|
|
|
def p_m_c(input_path, output_path, ca_only): |
|
import numpy as np |
|
import os, time, gzip, json |
|
import glob |
|
|
|
|
|
save_path = output_path |
|
|
|
|
|
|
|
alpha_1 = list("ARNDCQEGHILKMFPSTWYV-") |
|
states = len(alpha_1) |
|
alpha_3 = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', |
|
'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL', 'GAP'] |
|
|
|
aa_1_N = {a: n for n, a in enumerate(alpha_1)} |
|
aa_3_N = {a: n for n, a in enumerate(alpha_3)} |
|
aa_N_1 = {n: a for n, a in enumerate(alpha_1)} |
|
aa_1_3 = {a: b for a, b in zip(alpha_1, alpha_3)} |
|
aa_3_1 = {b: a for a, b in zip(alpha_1, alpha_3)} |
|
|
|
def AA_to_N(x): |
|
|
|
x = np.array(x); |
|
if x.ndim == 0: x = x[None] |
|
return [[aa_1_N.get(a, states - 1) for a in y] for y in x] |
|
|
|
def N_to_AA(x): |
|
|
|
x = np.array(x); |
|
if x.ndim == 1: x = x[None] |
|
return ["".join([aa_N_1.get(a, "-") for a in y]) for y in x] |
|
|
|
def parse_PDB_biounits(x, atoms=['N', 'CA', 'C'], chain=None): |
|
''' |
|
input: x = PDB filename |
|
atoms = atoms to extract (optional) |
|
output: (length, atoms, coords=(x,y,z)), sequence |
|
''' |
|
xyz, seq, min_resn, max_resn = {}, {}, 1e6, -1e6 |
|
for line in open(x, "rb"): |
|
line = line.decode("utf-8", "ignore").rstrip() |
|
|
|
if line[:6] == "HETATM" and line[17:17 + 3] == "MSE": |
|
line = line.replace("HETATM", "ATOM ") |
|
line = line.replace("MSE", "MET") |
|
|
|
if line[:4] == "ATOM": |
|
ch = line[21:22] |
|
if ch == chain or chain is None: |
|
atom = line[12:12 + 4].strip() |
|
resi = line[17:17 + 3] |
|
resn = line[22:22 + 5].strip() |
|
x, y, z = [float(line[i:(i + 8)]) for i in [30, 38, 46]] |
|
|
|
if resn[-1].isalpha(): |
|
resa, resn = resn[-1], int(resn[:-1]) - 1 |
|
else: |
|
resa, resn = "", int(resn) - 1 |
|
|
|
if resn < min_resn: |
|
min_resn = resn |
|
if resn > max_resn: |
|
max_resn = resn |
|
if resn not in xyz: |
|
xyz[resn] = {} |
|
if resa not in xyz[resn]: |
|
xyz[resn][resa] = {} |
|
if resn not in seq: |
|
seq[resn] = {} |
|
if resa not in seq[resn]: |
|
seq[resn][resa] = resi |
|
|
|
if atom not in xyz[resn][resa]: |
|
xyz[resn][resa][atom] = np.array([x, y, z]) |
|
|
|
|
|
seq_, xyz_ = [], [] |
|
try: |
|
for resn in range(min_resn, max_resn + 1): |
|
if resn in seq: |
|
for k in sorted(seq[resn]): seq_.append(aa_3_N.get(seq[resn][k], 20)) |
|
else: |
|
seq_.append(20) |
|
if resn in xyz: |
|
for k in sorted(xyz[resn]): |
|
for atom in atoms: |
|
if atom in xyz[resn][k]: |
|
xyz_.append(xyz[resn][k][atom]) |
|
else: |
|
xyz_.append(np.full(3, np.nan)) |
|
else: |
|
for atom in atoms: xyz_.append(np.full(3, np.nan)) |
|
return np.array(xyz_).reshape(-1, len(atoms), 3), N_to_AA(np.array(seq_)) |
|
except TypeError: |
|
return 'no_chain', 'no_chain' |
|
|
|
pdb_dict_list = [] |
|
c = 0 |
|
|
|
|
|
|
|
|
|
init_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', |
|
'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', |
|
'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] |
|
extra_alphabet = [str(item) for item in list(np.arange(300))] |
|
chain_alphabet = init_alphabet + extra_alphabet |
|
|
|
|
|
for input in input_path: |
|
my_dict = {} |
|
s = 0 |
|
concat_seq = '' |
|
concat_N = [] |
|
concat_CA = [] |
|
concat_C = [] |
|
concat_O = [] |
|
concat_mask = [] |
|
coords_dict = {} |
|
for letter in chain_alphabet: |
|
if ca_only: |
|
sidechain_atoms = ['CA'] |
|
else: |
|
sidechain_atoms = ['N', 'CA', 'C', 'O'] |
|
xyz, seq = parse_PDB_biounits(input.name, atoms=sidechain_atoms, chain=letter) |
|
if type(xyz) != str: |
|
concat_seq += seq[0] |
|
my_dict['seq_chain_' + letter] = seq[0] |
|
coords_dict_chain = {} |
|
if ca_only: |
|
coords_dict_chain['CA_chain_' + letter] = xyz.tolist() |
|
else: |
|
coords_dict_chain['N_chain_' + letter] = xyz[:, 0, :].tolist() |
|
coords_dict_chain['CA_chain_' + letter] = xyz[:, 1, :].tolist() |
|
coords_dict_chain['C_chain_' + letter] = xyz[:, 2, :].tolist() |
|
coords_dict_chain['O_chain_' + letter] = xyz[:, 3, :].tolist() |
|
my_dict['coords_chain_' + letter] = coords_dict_chain |
|
s += 1 |
|
na = input.name |
|
|
|
fi = na.rfind("/") |
|
my_dict['name'] = na[(fi + 1):(fi+5)] |
|
my_dict['num_of_chains'] = s |
|
my_dict['seq'] = concat_seq |
|
if s < len(chain_alphabet): |
|
pdb_dict_list.append(my_dict) |
|
c += 1 |
|
|
|
with open(save_path, 'w') as f: |
|
for entry in pdb_dict_list: |
|
f.write(json.dumps(entry) + '\n') |
|
return save_path |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|