Upload 5 files
Browse files- ProteinMPNN-main/helper_scripts/assign_fixed_chains.py +40 -0
- ProteinMPNN-main/helper_scripts/make_bias_AA.py +28 -0
- ProteinMPNN-main/helper_scripts/make_fixed_positions_dict.py +60 -0
- ProteinMPNN-main/helper_scripts/make_tied_positions_dict.py +61 -0
- ProteinMPNN-main/helper_scripts/parse_multiple_chains.py +167 -0
ProteinMPNN-main/helper_scripts/assign_fixed_chains.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
def a_f_c(input_path, output_path, chain_list):
    """Assign designed vs. fixed chains for each parsed PDB entry.

    Reads a .jsonl file of parsed PDB dictionaries (one JSON object per
    line), splits each entry's chains into (designed, fixed), and writes
    the mapping {name: (designed_chains, fixed_chains)} as a single JSON
    line to ``output_path``.

    Args:
        input_path: object with a ``.name`` attribute pointing at the
            parsed-PDBs .jsonl file (e.g. an uploaded-file handle).
        output_path: path of the JSON dictionary to write.
        chain_list: whitespace-separated chain letters to design, e.g.
            "A B"; when empty, chain "A" is designed by default.

    Returns:
        output_path, for convenient chaining.
    """
    import json

    with open(input_path.name, 'r') as handle:
        entries = list(handle)

    # Chains the user asked to design; an empty string means "use default".
    requested_chains = []
    if chain_list != '':
        requested_chains = [str(token) for token in chain_list.split()]

    assignments = {}
    for raw_line in entries:
        parsed = json.loads(raw_line)
        # Chain letters are recovered from the 'seq_chain_X' keys.
        chains_present = [key[-1:] for key in list(parsed) if key[:9] == 'seq_chain']
        if len(requested_chains) > 0:
            designed = requested_chains
        else:
            # Fallback when no chains were specified.
            designed = ["A"]
        # Everything not designed stays fixed (will not be redesigned).
        fixed = [letter for letter in chains_present if letter not in designed]
        assignments[parsed['name']] = (designed, fixed)

    with open(output_path, 'w') as out_file:
        out_file.write(json.dumps(assignments) + '\n')
    return output_path
|
27 |
+
|
28 |
+
|
29 |
+
# if __name__ == "__main__":
|
30 |
+
# argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
31 |
+
# argparser.add_argument("--input_path", type=str, help="Path to the parsed PDBs")
|
32 |
+
# argparser.add_argument("--output_path", type=str, help="Path to the output dictionary")
|
33 |
+
# argparser.add_argument("--chain_list", type=str, default='', help="List of the chains that need to be designed")
|
34 |
+
#
|
35 |
+
# args = argparser.parse_args()
|
36 |
+
# main(args)
|
37 |
+
|
38 |
+
# Output looks like this:
|
39 |
+
# {"5TTA": [["A"], ["B"]], "3LIS": [["A"], ["B"]]}
|
40 |
+
|
ProteinMPNN-main/helper_scripts/make_bias_AA.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
def m_b_A(output_path, AA_list, bias_list):
    """Write a per-amino-acid bias dictionary as a JSON file.

    Args:
        output_path: path of the JSON dictionary to write.
        AA_list: whitespace-separated one-letter amino-acid codes,
            e.g. "A G".
        bias_list: whitespace-separated bias strengths matching
            ``AA_list`` one-to-one, e.g. "-0.01 0.02".

    Returns:
        output_path, for convenient chaining.

    Raises:
        ValueError: if ``AA_list`` and ``bias_list`` have different
            numbers of entries.
    """
    import json

    bias_values = [float(item) for item in bias_list.split()]
    amino_acids = [str(item) for item in AA_list.split()]

    # zip() silently truncates on unequal lengths, which would drop
    # user-supplied biases without warning -- fail loudly instead.
    if len(amino_acids) != len(bias_values):
        raise ValueError(
            f"AA_list has {len(amino_acids)} entries but bias_list has "
            f"{len(bias_values)}; they must match one-to-one"
        )

    my_dict = dict(zip(amino_acids, bias_values))

    with open(output_path, 'w') as f:
        f.write(json.dumps(my_dict) + '\n')
    return output_path
|
16 |
+
|
17 |
+
|
18 |
+
# if __name__ == "__main__":
|
19 |
+
# argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
20 |
+
# argparser.add_argument("--output_path", type=str, help="Path to the output dictionary")
|
21 |
+
# argparser.add_argument("--AA_list", type=str, default='', help="List of AAs to be biased")
|
22 |
+
# argparser.add_argument("--bias_list", type=str, default='', help="AA bias strengths")
|
23 |
+
#
|
24 |
+
# args = argparser.parse_args()
|
25 |
+
# main(args)
|
26 |
+
|
27 |
+
#e.g. output
|
28 |
+
#{"A": -0.01, "G": 0.02}
|
ProteinMPNN-main/helper_scripts/make_fixed_positions_dict.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
def m_f_p_d(input_path, output_path, chain_list, position_list, specify_non_fixed):
    """Build a fixed-positions dictionary for parsed PDB entries.

    Reads a .jsonl file of parsed PDB dictionaries and writes, per entry,
    a {chain: [fixed residue numbers]} mapping as one JSON line.

    Args:
        input_path: object with a ``.name`` attribute pointing at the
            parsed-PDBs .jsonl file (e.g. an uploaded-file handle).
        output_path: path of the JSON dictionary to write.
        chain_list: whitespace-separated chain letters the position
            groups refer to, e.g. "A B".
        position_list: comma-separated groups of whitespace-separated
            residue numbers, one group per chain in ``chain_list``,
            e.g. "1 2 3, 10 11".
        specify_non_fixed: when False, ``position_list`` names the
            residues to FIX; when True, it names the residues to DESIGN
            and everything else (including unlisted chains) is fixed.

    Returns:
        output_path, for convenient chaining.

    Raises:
        IndexError: if ``chain_list`` has more chains than
            ``position_list`` has groups.
    """
    import json

    with open(input_path.name, 'r') as json_file:
        json_list = list(json_file)

    # "1 2 3, 10 11" -> [[1, 2, 3], [10, 11]]; group i pairs with chain i.
    fixed_list = [[int(item) for item in one.split()] for one in position_list.split(",")]
    global_designed_chain_list = [str(item) for item in chain_list.split()]
    my_dict = {}

    for json_str in json_list:
        result = json.loads(json_str)
        # Chain letters are recovered from the 'seq_chain_X' keys.
        all_chain_list = [item[-1:] for item in list(result) if item[:9] == 'seq_chain']
        fixed_position_dict = {}
        if not specify_non_fixed:
            # The listed positions are fixed; unlisted chains are fully
            # designable (empty fixed list).
            for i, chain in enumerate(global_designed_chain_list):
                fixed_position_dict[chain] = fixed_list[i]
            for chain in all_chain_list:
                if chain not in global_designed_chain_list:
                    fixed_position_dict[chain] = []
        else:
            # The listed positions are designed; the complement of each
            # group is fixed, and unlisted chains are fixed entirely.
            for chain in all_chain_list:
                seq_length = len(result[f'seq_chain_{chain}'])
                all_residue_list = list(range(1, seq_length + 1))
                if chain not in global_designed_chain_list:
                    fixed_position_dict[chain] = all_residue_list
                else:
                    idx = global_designed_chain_list.index(chain)
                    # sorted() keeps the output deterministic (plain
                    # set difference has no guaranteed order).
                    fixed_position_dict[chain] = sorted(
                        set(all_residue_list) - set(fixed_list[idx])
                    )
        my_dict[result['name']] = fixed_position_dict

    with open(output_path, 'w') as f:
        f.write(json.dumps(my_dict) + '\n')
    return output_path
|
46 |
+
|
47 |
+
#e.g. output
|
48 |
+
#{"5TTA": {"A": [1, 2, 3, 7, 8, 9, 22, 25, 33], "B": []}, "3LIS": {"A": [], "B": []}}
|
49 |
+
|
50 |
+
# if __name__ == "__main__":
|
51 |
+
# argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
52 |
+
# argparser.add_argument("--input_path", type=str, help="Path to the parsed PDBs")
|
53 |
+
# argparser.add_argument("--output_path", type=str, help="Path to the output dictionary")
|
54 |
+
# argparser.add_argument("--chain_list", type=str, default='', help="List of the chains that need to be fixed")
|
55 |
+
# argparser.add_argument("--position_list", type=str, default='', help="Position lists, e.g. 11 12 14 18, 1 2 3 4 for first chain and the second chain")
|
56 |
+
# argparser.add_argument("--specify_non_fixed", action="store_true", default=False, help="Allows specifying just residues that need to be designed (default: false)")
|
57 |
+
#
|
58 |
+
# args = argparser.parse_args()
|
59 |
+
# main(args)
|
60 |
+
|
ProteinMPNN-main/helper_scripts/make_tied_positions_dict.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
def m_t_p_d(input_path, output_path, chain_list, position_list, homooligomer):
    """Build a tied-positions dictionary for parsed PDB entries.

    Reads a .jsonl file of parsed PDB dictionaries and writes, per entry,
    a list of {chain: [position]} dictionaries; positions appearing in
    the same dictionary are designed together (tied).

    Args:
        input_path: object with a ``.name`` attribute pointing at the
            parsed-PDBs .jsonl file (e.g. an uploaded-file handle).
        output_path: path of the JSON dictionary to write.
        chain_list: whitespace-separated chain letters, e.g. "A B";
            only used when ``homooligomer`` is 0.
        position_list: comma-separated groups of whitespace-separated
            residue numbers, one group per chain, tied index-by-index:
            "1 2, 1 2" with chains "A B" ties A1-B1 and A2-B2. Only
            used when ``homooligomer`` is 0.
        homooligomer: 0 to tie only the listed positions; any other
            value ties every position across ALL chains (assumes all
            chains share the first chain's length).

    Returns:
        output_path, for convenient chaining.
    """
    import json

    with open(input_path.name, 'r') as json_file:
        json_list = list(json_file)

    my_dict = {}
    if int(homooligomer) == 0:
        # Explicit tying: group j of position_list pairs with chain j.
        tied_list = [[int(item) for item in one.split()] for one in position_list.split(",")]
        global_designed_chain_list = [str(item) for item in chain_list.split()]
        for json_str in json_list:
            result = json.loads(json_str)
            tied_positions_list = []
            for i in range(len(tied_list[0])):
                temp_dict = {}
                for j, chain in enumerate(global_designed_chain_list):
                    temp_dict[chain] = [tied_list[j][i]]  # values must be lists
                tied_positions_list.append(temp_dict)
            my_dict[result['name']] = tied_positions_list
    else:
        # Homooligomer mode: tie position i across every chain found.
        for json_str in json_list:
            result = json.loads(json_str)
            all_chain_list = sorted(
                [item[-1:] for item in list(result) if item[:9] == 'seq_chain']
            )  # A, B, C, ...
            tied_positions_list = []
            chain_length = len(result[f"seq_chain_{all_chain_list[0]}"])
            for i in range(1, chain_length + 1):
                temp_dict = {}
                for chain in all_chain_list:
                    temp_dict[chain] = [i]  # values must be lists
                tied_positions_list.append(temp_dict)
            my_dict[result['name']] = tied_positions_list

    with open(output_path, 'w') as f:
        f.write(json.dumps(my_dict) + '\n')
    return output_path
|
47 |
+
# if __name__ == "__main__":
|
48 |
+
# argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
49 |
+
# argparser.add_argument("--input_path", type=str, help="Path to the parsed PDBs")
|
50 |
+
# argparser.add_argument("--output_path", type=str, help="Path to the output dictionary")
|
51 |
+
# argparser.add_argument("--chain_list", type=str, default='', help="List of the chains that need to be fixed")
|
52 |
+
# argparser.add_argument("--position_list", type=str, default='', help="Position lists, e.g. 11 12 14 18, 1 2 3 4 for first chain and the second chain")
|
53 |
+
# argparser.add_argument("--homooligomer", type=int, default=0, help="If 0 do not use, if 1 then design homooligomer")
|
54 |
+
#
|
55 |
+
# args = argparser.parse_args()
|
56 |
+
# main(args)
|
57 |
+
|
58 |
+
|
59 |
+
#e.g. output
|
60 |
+
#{"5TTA": [], "3LIS": [{"A": [1], "B": [1]}, {"A": [2], "B": [2]}, {"A": [3], "B": [3]}, {"A": [4], "B": [4]}, {"A": [5], "B": [5]}, {"A": [6], "B": [6]}, {"A": [7], "B": [7]}, {"A": [8], "B": [8]}, {"A": [9], "B": [9]}, {"A": [10], "B": [10]}, {"A": [11], "B": [11]}, {"A": [12], "B": [12]}, {"A": [13], "B": [13]}, {"A": [14], "B": [14]}, {"A": [15], "B": [15]}, {"A": [16], "B": [16]}, {"A": [17], "B": [17]}, {"A": [18], "B": [18]}, {"A": [19], "B": [19]}, {"A": [20], "B": [20]}, {"A": [21], "B": [21]}, {"A": [22], "B": [22]}, {"A": [23], "B": [23]}, {"A": [24], "B": [24]}, {"A": [25], "B": [25]}, {"A": [26], "B": [26]}, {"A": [27], "B": [27]}, {"A": [28], "B": [28]}, {"A": [29], "B": [29]}, {"A": [30], "B": [30]}, {"A": [31], "B": [31]}, {"A": [32], "B": [32]}, {"A": [33], "B": [33]}, {"A": [34], "B": [34]}, {"A": [35], "B": [35]}, {"A": [36], "B": [36]}, {"A": [37], "B": [37]}, {"A": [38], "B": [38]}, {"A": [39], "B": [39]}, {"A": [40], "B": [40]}, {"A": [41], "B": [41]}, {"A": [42], "B": [42]}, {"A": [43], "B": [43]}, {"A": [44], "B": [44]}, {"A": [45], "B": [45]}, {"A": [46], "B": [46]}, {"A": [47], "B": [47]}, {"A": [48], "B": [48]}, {"A": [49], "B": [49]}, {"A": [50], "B": [50]}, {"A": [51], "B": [51]}, {"A": [52], "B": [52]}, {"A": [53], "B": [53]}, {"A": [54], "B": [54]}, {"A": [55], "B": [55]}, {"A": [56], "B": [56]}, {"A": [57], "B": [57]}, {"A": [58], "B": [58]}, {"A": [59], "B": [59]}, {"A": [60], "B": [60]}, {"A": [61], "B": [61]}, {"A": [62], "B": [62]}, {"A": [63], "B": [63]}, {"A": [64], "B": [64]}, {"A": [65], "B": [65]}, {"A": [66], "B": [66]}, {"A": [67], "B": [67]}, {"A": [68], "B": [68]}, {"A": [69], "B": [69]}, {"A": [70], "B": [70]}, {"A": [71], "B": [71]}, {"A": [72], "B": [72]}, {"A": [73], "B": [73]}, {"A": [74], "B": [74]}, {"A": [75], "B": [75]}, {"A": [76], "B": [76]}, {"A": [77], "B": [77]}, {"A": [78], "B": [78]}, {"A": [79], "B": [79]}, {"A": [80], "B": [80]}, {"A": [81], "B": [81]}, {"A": [82], "B": [82]}, {"A": [83], "B": [83]}, 
{"A": [84], "B": [84]}, {"A": [85], "B": [85]}, {"A": [86], "B": [86]}, {"A": [87], "B": [87]}, {"A": [88], "B": [88]}, {"A": [89], "B": [89]}, {"A": [90], "B": [90]}, {"A": [91], "B": [91]}, {"A": [92], "B": [92]}, {"A": [93], "B": [93]}, {"A": [94], "B": [94]}, {"A": [95], "B": [95]}, {"A": [96], "B": [96]}]}
|
61 |
+
|
ProteinMPNN-main/helper_scripts/parse_multiple_chains.py
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
|
4 |
+
def p_m_c(input_path, output_path, ca_only):
    """Parse one or more PDB files into a ProteinMPNN .jsonl dictionary.

    For every uploaded PDB, each chain found is recorded as
    ``seq_chain_X`` (one-letter sequence, '-' for gaps/unknown residues)
    plus ``coords_chain_X`` (backbone coordinates; NaN where an atom is
    missing), along with the concatenated sequence ``seq``, the chain
    count ``num_of_chains``, and a ``name`` derived from the file name.
    One JSON object per input file is written to ``output_path``.

    Args:
        input_path: iterable of objects with a ``.name`` attribute
            (paths to .pdb files), e.g. uploaded-file handles.
        output_path: path of the .jsonl file to write.
        ca_only: when True store only CA coordinates, otherwise N/CA/C/O.

    Returns:
        output_path, for convenient chaining.
    """
    import numpy as np
    import json

    save_path = output_path

    alpha_1 = list("ARNDCQEGHILKMFPSTWYV-")
    alpha_3 = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
               'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL', 'GAP']

    aa_3_N = {a: n for n, a in enumerate(alpha_3)}  # 3-letter name -> index
    aa_N_1 = {n: a for n, a in enumerate(alpha_1)}  # index -> 1-letter code

    def N_to_AA(x):
        # [[0,1,2,3]] -> ["ARND"]; unknown indices map to '-'.
        x = np.array(x)
        if x.ndim == 1:
            x = x[None]
        return ["".join([aa_N_1.get(a, "-") for a in y]) for y in x]

    def parse_PDB_biounits(pdb_file, atoms=('N', 'CA', 'C'), chain=None):
        '''
        input:  pdb_file = PDB filename
                atoms    = atom names to extract (optional)
                chain    = single-character chain ID, or None for all
        output: (length, atoms, coords=(x,y,z)), sequence
                or ('no_chain', 'no_chain') if the chain is absent
        '''
        xyz, seq, min_resn, max_resn = {}, {}, 1e6, -1e6
        for line in open(pdb_file, "rb"):
            line = line.decode("utf-8", "ignore").rstrip()

            # Treat selenomethionine as a regular methionine ATOM record.
            if line[:6] == "HETATM" and line[17:17 + 3] == "MSE":
                line = line.replace("HETATM", "ATOM  ")
                line = line.replace("MSE", "MET")

            if line[:4] == "ATOM":
                ch = line[21:22]
                if ch == chain or chain is None:
                    atom = line[12:12 + 4].strip()
                    resi = line[17:17 + 3]          # 3-letter residue name
                    resn = line[22:22 + 5].strip()  # residue number (+ optional insertion code)
                    # Renamed from x, y, z: the original shadowed the
                    # filename parameter with the coordinates.
                    px, py, pz = [float(line[i:(i + 8)]) for i in [30, 38, 46]]

                    # Split off a trailing insertion code, if present.
                    if resn[-1].isalpha():
                        resa, resn = resn[-1], int(resn[:-1]) - 1
                    else:
                        resa, resn = "", int(resn) - 1
                    if resn < min_resn:
                        min_resn = resn
                    if resn > max_resn:
                        max_resn = resn
                    if resn not in xyz:
                        xyz[resn] = {}
                    if resa not in xyz[resn]:
                        xyz[resn][resa] = {}
                    if resn not in seq:
                        seq[resn] = {}
                    if resa not in seq[resn]:
                        seq[resn][resa] = resi

                    if atom not in xyz[resn][resa]:
                        xyz[resn][resa][atom] = np.array([px, py, pz])

        # Convert to numpy arrays, filling numbering gaps with 'GAP'
        # residues (index 20) and NaN coordinates.
        seq_, xyz_ = [], []
        try:
            for resn in range(min_resn, max_resn + 1):
                if resn in seq:
                    for k in sorted(seq[resn]):
                        seq_.append(aa_3_N.get(seq[resn][k], 20))
                else:
                    seq_.append(20)
                if resn in xyz:
                    for k in sorted(xyz[resn]):
                        for atom in atoms:
                            if atom in xyz[resn][k]:
                                xyz_.append(xyz[resn][k][atom])
                            else:
                                xyz_.append(np.full(3, np.nan))
                else:
                    for atom in atoms:
                        xyz_.append(np.full(3, np.nan))
            return np.array(xyz_).reshape(-1, len(atoms), 3), N_to_AA(np.array(seq_))
        except TypeError:
            # min_resn/max_resn stayed float sentinels: no atoms matched
            # this chain, so range() raised TypeError.
            return 'no_chain', 'no_chain'

    pdb_dict_list = []

    # Candidate chain IDs: letters first, then numeric "chains" 0..299.
    init_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
                     'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
                     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    extra_alphabet = [str(item) for item in list(np.arange(300))]
    chain_alphabet = init_alphabet + extra_alphabet

    # Renamed from `input`: the original shadowed the builtin.
    for uploaded in input_path:
        my_dict = {}
        s = 0  # number of chains found in this file
        concat_seq = ''
        for letter in chain_alphabet:
            if ca_only:
                sidechain_atoms = ['CA']
            else:
                sidechain_atoms = ['N', 'CA', 'C', 'O']
            xyz, seq = parse_PDB_biounits(uploaded.name, atoms=sidechain_atoms, chain=letter)
            if type(xyz) != str:  # str means 'no_chain' sentinel
                concat_seq += seq[0]
                my_dict['seq_chain_' + letter] = seq[0]
                coords_dict_chain = {}
                if ca_only:
                    coords_dict_chain['CA_chain_' + letter] = xyz.tolist()
                else:
                    coords_dict_chain['N_chain_' + letter] = xyz[:, 0, :].tolist()
                    coords_dict_chain['CA_chain_' + letter] = xyz[:, 1, :].tolist()
                    coords_dict_chain['C_chain_' + letter] = xyz[:, 2, :].tolist()
                    coords_dict_chain['O_chain_' + letter] = xyz[:, 3, :].tolist()
                my_dict['coords_chain_' + letter] = coords_dict_chain
                s += 1
        na = uploaded.name
        # BUGFIX: the original searched only for the Windows separator
        # "\\", so on POSIX paths rfind returned -1 and the entry name
        # became the first 4 chars of the full path. Handle both
        # separators; keep the 4-character (PDB-code) slice.
        fi = max(na.rfind("\\"), na.rfind("/"))
        my_dict['name'] = na[(fi + 1):(fi + 5)]
        my_dict['num_of_chains'] = s
        my_dict['seq'] = concat_seq
        if s < len(chain_alphabet):
            pdb_dict_list.append(my_dict)

    with open(save_path, 'w') as f:
        for entry in pdb_dict_list:
            f.write(json.dumps(entry) + '\n')
    return save_path
|
156 |
+
|
157 |
+
|
158 |
+
# if __name__ == "__main__":
|
159 |
+
# argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
160 |
+
#
|
161 |
+
# argparser.add_argument("--input_path", type=str, help="Path to a folder with pdb files, e.g. /home/my_pdbs/")
|
162 |
+
# argparser.add_argument("--output_path", type=str, help="Path where to save .jsonl dictionary of parsed pdbs")
|
163 |
+
# argparser.add_argument("--ca_only", action="store_true", default=False,
|
164 |
+
# help="parse a backbone-only structure (default: false)")
|
165 |
+
#
|
166 |
+
# args = argparser.parse_args()
|
167 |
+
# main(args)
|