|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os, re, pysam, sys, math, argparse |
|
from RUST.methods import * |
|
|
|
try: |
|
import matplotlib as mpl |
|
|
|
mpl.use("Agg") |
|
import matplotlib.pyplot as plt |
|
from pylab import MaxNLocator |
|
except: |
|
pass |
|
|
|
|
|
def RUST_metagene_plot(infileopen36, ax36): |
|
infileopen36.seek(0) |
|
infileopen36.readline() |
|
while 1: |
|
line = infileopen36.readline() |
|
linesplit = line.split(",") |
|
if len(linesplit) == 1: |
|
break |
|
codon = linesplit[0] |
|
coverage = list(map(float, linesplit[1:])) |
|
coverage_a = coverage[0] |
|
if coverage_a == 0: |
|
continue |
|
coverage_n = [n / coverage_a for n in coverage[1:]] |
|
log2_values = [math.log(n, 2) if n != 0 else float("-inf") for n in coverage_n] |
|
ax36.plot(log2_values, color="gray") |
|
|
|
line = infileopen36.readline() |
|
linesplit = line.split(",") |
|
if "NA" not in line: |
|
coverage = map(float, linesplit[2:]) |
|
ax2 = ax36.twinx() |
|
ax2.plot(coverage, color="blue") |
|
for tl in ax2.get_yticklabels(): |
|
tl.set_color("blue") |
|
tl.set_rotation(0) |
|
|
|
ax2.yaxis.set_major_locator(MaxNLocator(3)) |
|
ax2.set_ylim(0, 1.0) |
|
ax2.set_ylim(-2, 1.0) |
|
ax2.set_yticks([0, 1], minor=False) |
|
ax2.set_yticklabels(["0", "1"]) |
|
ax2.set_ylabel("Kullback-Leibler divergence", color="blue") |
|
|
|
ax36.set_xticks([5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55]) |
|
ax36.set_xticklabels([-35, -30, -25, -20, -15, -10, -5, 0, 5, 10, 15]) |
|
ax36.set_xlabel("distance from A-site [codon]") |
|
ax36.set_ylabel("Codon RUST ratio (observed/expected), log2") |
|
ax36.axvline(40, color="red") |
|
|
|
|
|
def A_site_plot(infileopen35, dict_codon75, axis_Asite53, loc2): |
|
codon_to_amino_dict = {} |
|
amino_to_codons_dict = {} |
|
for amino_acid, codons in dict_codon75.items(): |
|
for codon in codons: |
|
codon_to_amino_dict[codon] = amino_acid |
|
amino_to_codons_dict.setdefault(amino_acid, []).append(codon) |
|
|
|
list1 = [] |
|
list2 = [] |
|
infileopen35.seek(0) |
|
infileopen35.readline() |
|
dict_amino_value = {} |
|
for line in infileopen35: |
|
linesplit = line.split(",") |
|
if len(linesplit) == 1: |
|
break |
|
codon = linesplit[0] |
|
if codon in ["TAA", "TAG", "TGA"]: |
|
continue |
|
list1.append(linesplit[0]) |
|
coverage = list(map(float, linesplit[1:])) |
|
coverage_a = coverage[0] |
|
coverage_n = [n / coverage_a for n in coverage[1:]] |
|
list2.append(float(coverage_n[loc2])) |
|
|
|
amino = codon_to_amino_dict[linesplit[0]] |
|
if amino in dict_amino_value: |
|
dict_amino_value[codon_to_amino_dict[linesplit[0]]].append( |
|
float(coverage_n[loc2]) |
|
) |
|
else: |
|
dict_amino_value[codon_to_amino_dict[linesplit[0]]] = [ |
|
float(coverage_n[loc2]) |
|
] |
|
|
|
list_amino_sorted = [] |
|
for key, value in dict_amino_value.items(): |
|
list_amino_sorted.append((mean_value(value), key)) |
|
list_amino_sorted.sort() |
|
|
|
A_site_value_norm = [n / min(list2) for n in list2] |
|
list3 = list(zip(A_site_value_norm, list1)) |
|
list3.sort() |
|
A_site_value_norm_dict = {} |
|
for tupel in list3: |
|
A_site_value_norm_dict[tupel[1]] = tupel[0] |
|
|
|
used_codons = [] |
|
xloc = [] |
|
xtick_label = [] |
|
n1 = 0 |
|
|
|
for _, amino_acid in list_amino_sorted: |
|
if amino_acid in used_codons: |
|
continue |
|
used_codons.append(amino_acid) |
|
n1 += 1 |
|
|
|
xloc.append(n1) |
|
for amino_acid_codon in amino_to_codons_dict[amino_acid]: |
|
axis_Asite53.scatter( |
|
n1, |
|
A_site_value_norm_dict[amino_acid_codon], |
|
color="gray", |
|
s=50, |
|
edgecolor="gray", |
|
) |
|
xtick_label.append(amino_acid) |
|
|
|
axis_Asite53.set_xticks(xloc) |
|
axis_Asite53.set_xticklabels(xtick_label, rotation=90) |
|
for tick in axis_Asite53.get_xticklabels(): |
|
if tick.get_text() in ["Phe", "Tyr", "Trp"]: |
|
a2 = tick.set_backgroundcolor("lightgreen") |
|
|
|
if tick.get_text() in ["Val", "Ala", "Leu", "Met", "Ile"]: |
|
tick.set_backgroundcolor("lightgrey") |
|
|
|
if tick.get_text() in ["Ser", "Asn", "Thr", "Gln"]: |
|
tick.set_backgroundcolor("ForestGreen") |
|
tick.set_color("white") |
|
|
|
if tick.get_text() in ["His", "Lys", "Arg"]: |
|
tick.set_backgroundcolor("blue") |
|
tick.set_color("white") |
|
if tick.get_text() in ["Glu", "Asp"]: |
|
tick.set_backgroundcolor("red") |
|
tick.set_color("white") |
|
axis_Asite53.set_xlim(0, n1 + 1) |
|
axis_Asite53.set_ylabel("A-site codon RUST ratio") |
|
|
|
red = mpl.patches.Rectangle((0, 0), 1, 1, fc="r") |
|
blue = mpl.patches.Rectangle((0, 0), 1, 1, fc="b") |
|
fgreen = mpl.patches.Rectangle((0, 0), 1, 1, fc="ForestGreen") |
|
lgreen = mpl.patches.Rectangle((0, 0), 1, 1, fc="lightGreen") |
|
grey = mpl.patches.Rectangle((0, 0), 1, 1, fc="lightgrey") |
|
|
|
axis_Asite53.legend( |
|
[red, grey, lgreen, blue, fgreen], |
|
["acidic", "aliphatic", "aromatic", "basic", "polar\nuncharged"], |
|
bbox_to_anchor=(0, 0, 0.8, 1.12), |
|
ncol=3, |
|
) |
|
|
|
|
|
def main(args): |
|
|
|
universal_code = { |
|
"Ala": ["GCT", "GCC", "GCG", "GCA"], |
|
"Gly": ["GGT", "GGC", "GGG", "GGA"], |
|
"Pro": ["CCT", "CCC", "CCG", "CCA"], |
|
"Thr": ["ACT", "ACC", "ACG", "ACA"], |
|
"Val": ["GTT", "GTC", "GTG", "GTA"], |
|
"Ser": ["TCT", "TCC", "TCG", "TCA", "AGT", "AGC"], |
|
"Arg": ["CGT", "CGC", "CGG", "CGA", "AGG", "AGA"], |
|
"Leu": ["CTT", "CTC", "CTG", "CTA", "TTG", "TTA"], |
|
"Phe": ["TTT", "TTC"], |
|
"Asn": ["AAT", "AAC"], |
|
"Lys": ["AAG", "AAA"], |
|
"Asp": ["GAT", "GAC"], |
|
"Glu": ["GAG", "GAA"], |
|
"His": ["CAT", "CAC"], |
|
"Gln": ["CAG", "CAA"], |
|
"Ile": ["ATT", "ATC", "ATA"], |
|
"Met": ["ATG"], |
|
"Tyr": ["TAT", "TAC"], |
|
"Cys": ["TGT", "TGC"], |
|
"Trp": ["TGG"], |
|
"Stop": ["TGA", "TAG", "TAA"], |
|
} |
|
|
|
mRNA_sequences = args.transcriptome |
|
in_seq_handle = open(mRNA_sequences) |
|
cds_start_dict = {} |
|
cds_end_dict = {} |
|
seq_dict = {} |
|
for line in in_seq_handle: |
|
if line[0] != ">": |
|
seq_dict.setdefault(transcript, "") |
|
seq_dict[transcript] += line[:-1] |
|
continue |
|
try: |
|
transcript_split = line[:-1].split("\t") |
|
transcript = transcript_split[0][1:] |
|
cds_start_dict[transcript] = int(transcript_split[1]) |
|
cds_end_dict[transcript] = int(transcript_split[2]) |
|
except: |
|
pass |
|
in_seq_handle.close() |
|
|
|
offset = args.offset |
|
readlen_range = args.lengths |
|
|
|
readlen_rangesplit = readlen_range.split(":") |
|
if len(readlen_rangesplit) == 1: |
|
accepted_read_lengths = [int(readlen_rangesplit[0])] |
|
length_values = "%s" % int(readlen_rangesplit[0]) |
|
elif len(readlen_rangesplit) == 2: |
|
accepted_read_lengths = [ |
|
readlen |
|
for readlen in range( |
|
int(readlen_rangesplit[0]), int(readlen_rangesplit[1]) + 1 |
|
) |
|
] |
|
length_values = "%s_%s" % ( |
|
int(readlen_rangesplit[0]), |
|
int(readlen_rangesplit[1]), |
|
) |
|
else: |
|
stop_err( |
|
"Lengths of footprints parameter not in correct format, it should be either colon seperated with the second value greater or equal to the first, (28:32) or a single interger (31)" |
|
) |
|
if len(accepted_read_lengths) == 0: |
|
stop_err( |
|
"Lengths of footprints parameter not in correct format, it should be either colon seperated with the second value greater or equal to the first, (28:32) or a single interger (31)" |
|
) |
|
|
|
nts = ["A", "G", "C", "T"] |
|
aligments_A1 = pysam.Samfile( |
|
args.alignment, "rb" |
|
) |
|
|
|
codon_enrichment_dict = {} |
|
codon_enrichment_expected_dict = {} |
|
for nt in nts: |
|
for nt2 in nts: |
|
for nt3 in nts: |
|
codon = "%s%s%s" % (nt, nt2, nt3) |
|
codon_enrichment_dict[codon] = {} |
|
codon_enrichment_expected_dict[codon] = [] |
|
for number in range(0, 60, 1): |
|
codon_enrichment_dict[codon][number] = [0.0, 0.0] |
|
|
|
list_transcripts = seq_dict.keys() |
|
number_transcripts = 0 |
|
list_10_percentile = [] |
|
for value in range(1, 10): |
|
list_10_percentile.append((len(list_transcripts) * value) / 10) |
|
for transcript in list_transcripts: |
|
number_transcripts += 1 |
|
if number_transcripts in list_10_percentile: |
|
sys.stdout.write( |
|
"%s percent\n" |
|
% ((list_10_percentile.index(number_transcripts) + 1) * 10) |
|
) |
|
|
|
try: |
|
cds_start = cds_start_dict[transcript] |
|
cds_end = cds_end_dict[transcript] |
|
if cds_end < cds_start: |
|
raise Exception |
|
except Exception: |
|
transcript_seq = seq_dict[transcript] |
|
cds_start = -1 |
|
start_post = [] |
|
end_post = [] |
|
for match in re.finditer(r"(?=(%s))" % re.escape("ATG"), transcript_seq): |
|
start_post.append(match.start()) |
|
for match in re.finditer(r"(?=(%s))" % re.escape("TAG"), transcript_seq): |
|
end_post.append(match.start()) |
|
for match in re.finditer(r"(?=(%s))" % re.escape("TAA"), transcript_seq): |
|
end_post.append(match.start()) |
|
for match in re.finditer(r"(?=(%s))" % re.escape("TGA"), transcript_seq): |
|
end_post.append(match.start()) |
|
|
|
end_post.sort() |
|
len_max_orf = 0 |
|
for value in start_post: |
|
for value2 in end_post: |
|
if value < value2: |
|
if value % 3 == value2 % 3: |
|
len_orf = value2 - value |
|
if len_orf > len_max_orf: |
|
cds_start = value |
|
cds_end = value2 + 3 |
|
len_max_orf = len_orf |
|
break |
|
if cds_start == -1: |
|
|
|
continue |
|
|
|
elongation_region_all = seq_dict[transcript][cds_start:cds_end] |
|
elongation_region_part = elongation_region_all[ |
|
120:-60 |
|
] |
|
|
|
|
|
if len(elongation_region_part) % 3 != 0: |
|
|
|
continue |
|
|
|
profile_list = [ |
|
0.0 for n in range(cds_start + 120, cds_end - 60) |
|
] |
|
if len(profile_list) < 50: |
|
|
|
continue |
|
all_reads = aligments_A1.fetch(transcript) |
|
|
|
len_elongation_region = len(profile_list) |
|
for read in all_reads: |
|
readlen = read.qlen |
|
if readlen not in accepted_read_lengths: |
|
continue |
|
A_site = read.pos + offset - cds_start - 120 |
|
if len_elongation_region > A_site > -1: |
|
profile_list[A_site] += 1 |
|
|
|
average_gene_density = float(sum(profile_list)) / len( |
|
profile_list |
|
) |
|
if average_gene_density != 0: |
|
num_codon = len( |
|
[ |
|
1 |
|
for number88 in range(0, len(profile_list), 3) |
|
if ( |
|
( |
|
profile_list[number88] |
|
+ profile_list[number88 + 1] |
|
+ profile_list[number88 + 2] |
|
) |
|
/ 3 |
|
) |
|
> average_gene_density |
|
] |
|
) |
|
|
|
expected_codon_density = float(num_codon) / ( |
|
len(profile_list) / 3 |
|
) |
|
|
|
codon_start = 0 |
|
for sliding_w_n in range( |
|
0, len(elongation_region_part), 3 |
|
): |
|
codon_window = str( |
|
elongation_region_all[codon_start : codon_start + 180] |
|
) |
|
if len(set(codon_window) - set(["A", "T", "G", "C"])) != 0: |
|
codon_start += 3 |
|
continue |
|
|
|
if ( |
|
profile_list[sliding_w_n] |
|
+ profile_list[sliding_w_n + 1] |
|
+ profile_list[sliding_w_n + 2] |
|
) / 3 > average_gene_density: |
|
for number in range(0, 60): |
|
codon = codon_window[number * 3 : (number + 1) * 3] |
|
codon_enrichment_dict[codon][number][0] += 1 |
|
codon_enrichment_dict[codon][number][1] += 1 |
|
else: |
|
for number in range(0, 60): |
|
codon = codon_window[number * 3 : (number + 1) * 3] |
|
codon_enrichment_dict[codon][number][0] += 1 |
|
|
|
codon = codon_window[120:123] |
|
codon_enrichment_expected_dict[codon].append(expected_codon_density) |
|
codon_start += 3 |
|
|
|
if not os.path.exists(args.Path): |
|
os.mkdir(args.Path) |
|
alignment_filename = args.alignment.split("/")[-1] |
|
outfile = open( |
|
"%s/RUST_codon_file_%s_%s_%s" |
|
% (args.Path, alignment_filename, args.offset, length_values), |
|
"w", |
|
) |
|
outfile.write("codon, expected value") |
|
for number106 in range(-40, 20): |
|
outfile.write(", %s" % number106) |
|
outfile.write("\n") |
|
|
|
list_codons = [] |
|
codons = list(codon_enrichment_dict) |
|
codons.sort() |
|
rust_expected = [] |
|
rust_observed_metafootprint = [] |
|
for codon in codons: |
|
if codon in list_codons: |
|
continue |
|
if codon in ["TAA", "TGA", "TAG"]: |
|
continue |
|
list_codons.append(codon) |
|
outfile.write("%s" % codon) |
|
if codon_enrichment_expected_dict[codon] != []: |
|
outfile.write(", %s" % mean_value(codon_enrichment_expected_dict[codon])) |
|
list_data = [] |
|
for number in range(0, 60): |
|
if codon_enrichment_dict[codon][number][0] != 0: |
|
outfile.write( |
|
", %s" |
|
% ( |
|
codon_enrichment_dict[codon][number][1] |
|
/ codon_enrichment_dict[codon][number][0] |
|
) |
|
) |
|
list_data.append( |
|
codon_enrichment_dict[codon][number][1] |
|
/ codon_enrichment_dict[codon][number][0] |
|
) |
|
else: |
|
outfile.write(", 0") |
|
list_data.append(0) |
|
outfile.write("\n") |
|
rust_expected.append(mean_value(codon_enrichment_expected_dict[codon])) |
|
rust_observed_metafootprint.append(list_data) |
|
|
|
rust_expected_sum = sum(rust_expected) |
|
q_values = [n / rust_expected_sum for n in rust_expected] |
|
|
|
shannon_values = [] |
|
for loc_i in range(60): |
|
rust_observed = [n[loc_i] for n in rust_observed_metafootprint] |
|
rust_observed_sum = sum(rust_observed) |
|
rust_observed_min = min(rust_observed) |
|
if rust_observed_min == 0: |
|
shannon_values.append("NA") |
|
else: |
|
p_values = [n / rust_observed_sum for n in rust_observed] |
|
shannon = [] |
|
list_normalised = [] |
|
for p_value, q_value in zip(p_values, q_values): |
|
shannon.append(abs(p_value * math.log((p_value / q_value), 2))) |
|
list_normalised.append(p_value / q_value) |
|
shannon_values.append(sum(shannon)) |
|
|
|
outfile.write("\nKullback Leibler divergence,") |
|
for value in shannon_values: |
|
outfile.write(", %s" % value) |
|
outfile.close() |
|
|
|
try: |
|
mpl.rcParams["xtick.direction"] = "out" |
|
mpl.rcParams["ytick.direction"] = "out" |
|
mpl.rcParams["legend.fontsize"] = 10 |
|
mpl.rcParams["ytick.labelsize"] = 10 |
|
mpl.rcParams["xtick.labelsize"] = 10 |
|
mpl.rcParams["font.size"] = 10 |
|
mpl.rcParams["axes.titlesize"] = 10 |
|
mpl.rcParams["legend.frameon"] = 0 |
|
mpl.rcParams["axes.axisbelow"] = False |
|
mpl.rcParams["xtick.major.pad"] = 2.0 |
|
mpl.rcParams["ytick.major.pad"] = 2 |
|
mpl.rcParams["xtick.major.size"] = 2.0 |
|
mpl.rcParams["ytick.major.size"] = 2 |
|
mpl.rcParams["axes.linewidth"] = 0.5 |
|
mpl.rcParams["ytick.major.width"] = 0.25 |
|
mpl.rcParams["xtick.major.width"] = 0.25 |
|
mpl.rcParams["lines.linewidth"] = 1 |
|
mpl.rcParams["legend.borderpad"] = 0.01 |
|
mpl.rcParams["legend.labelspacing"] = 0.05 |
|
mpl.rcParams["legend.columnspacing"] = 0.5 |
|
mpl.rcParams["legend.borderaxespad"] = 0.15 |
|
mpl.rcParams["legend.handlelength"] = 1 |
|
|
|
fig = plt.figure(figsize=(6.69, 6.0)) |
|
infileopen = open( |
|
"%s/RUST_codon_file_%s_%s_%s" |
|
% (args.Path, alignment_filename, args.offset, length_values) |
|
) |
|
ax1_metafootprint = fig.add_subplot(111) |
|
RUST_metagene_plot(infileopen, ax1_metafootprint) |
|
plt.savefig( |
|
"%s/RUST_codon_metafootprint_%s_%s_%s.png" |
|
% (args.Path, alignment_filename, args.offset, length_values) |
|
) |
|
plt.clf() |
|
|
|
infileopen = open( |
|
"%s/RUST_codon_file_%s_%s_%s" |
|
% (args.Path, alignment_filename, args.offset, length_values) |
|
) |
|
ax1codon_Asite = fig.add_subplot(111) |
|
A_site_plot(infileopen, universal_code, ax1codon_Asite, 40) |
|
plt.savefig( |
|
"%s/A_site_%s_%s_%s.png" |
|
% (args.Path, alignment_filename, args.offset, length_values) |
|
) |
|
|
|
except: |
|
sys.stdout.write("Error producing images\n") |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser( |
|
description="Produces RUST metagene profile of codons" |
|
) |
|
parser.add_argument("--version", action="version", version="%(prog)s 1.2") |
|
|
|
parser.add_argument( |
|
"-t", |
|
"--transcriptome", |
|
help="fasta file of transcripts, CDS start and end may be provided on description line using tab separation e.g. >NM_0001 10 5000, otherwise it searches for longest ORF" |
|
", required=True", |
|
) |
|
parser.add_argument( |
|
"-a", |
|
"--alignment", |
|
help="sorted bam file of transcriptome alignments", |
|
required=True, |
|
) |
|
parser.add_argument("-o", "--offset", help="nucleotide offset to A-site", type=int) |
|
parser.add_argument( |
|
"-l", |
|
"--lengths", |
|
help="lengths of footprints included, for example 28:32 is 28,29,30,31,32", |
|
) |
|
parser.add_argument( |
|
"-P", "--Path", help='path to outputfile, default is "codon"', default="codon" |
|
) |
|
args = parser.parse_args(None) |
|
|
|
main(args) |
|
|