|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os, re, pysam, sys, math, argparse |
|
from RUST.methods import * |
|
|
|
try: |
|
import matplotlib as mpl |
|
|
|
mpl.use("Agg") |
|
import matplotlib.pyplot as plt |
|
from pylab import MaxNLocator |
|
except: |
|
pass |
|
|
|
|
|
def RUST_metagene_plot(infileopen36, ax36): |
|
infileopen36.seek(0) |
|
infileopen36.readline() |
|
log_style = 1 |
|
while 1: |
|
line = infileopen36.readline() |
|
linesplit = line.split(",") |
|
if len(linesplit) == 1: |
|
break |
|
nucleotide_type = linesplit[0] |
|
coverage = list(map(float, linesplit[1:])) |
|
coverage_a = coverage[0] |
|
if coverage_a == 0: |
|
continue |
|
coverage_n = [n / coverage_a for n in coverage[1:]] |
|
if min(coverage_n[:-1]) == 0: |
|
log_style = 0 |
|
continue |
|
|
|
infileopen36.seek(0) |
|
infileopen36.readline() |
|
while 1: |
|
line = infileopen36.readline() |
|
linesplit = line.split(",") |
|
if len(linesplit) == 1: |
|
break |
|
nucleotide_type = linesplit[0] |
|
coverage = list(map(float, linesplit[1:])) |
|
coverage_a = coverage[0] |
|
if coverage_a == 0: |
|
continue |
|
coverage_n = [n / coverage_a for n in coverage[1:]] |
|
if log_style: |
|
log2_values = [math.log(n, 2) for n in coverage_n[:-1]] |
|
ax36.plot(log2_values, color="gray") |
|
else: |
|
ax36.plot(coverage_n[:-1], color="gray") |
|
|
|
line = infileopen36.readline() |
|
linesplit = line.split(",") |
|
if "NA" not in line[:-3]: |
|
coverage = list(map(float, linesplit[2:-1])) |
|
ax2 = ax36.twinx() |
|
ax2.plot(coverage, color="blue") |
|
for tl in ax2.get_yticklabels(): |
|
tl.set_color("blue") |
|
tl.set_rotation(0) |
|
|
|
ax2.yaxis.set_major_locator(MaxNLocator(3)) |
|
ax2.set_ylim(0, 1.0) |
|
ax2.set_ylim(-2, 1.0) |
|
ax2.set_yticks([0, 1], minor=False) |
|
ax2.set_yticklabels(["0", "1"]) |
|
ax2.set_ylabel("Kullback-Leibler divergence", color="blue") |
|
|
|
ax36.set_xticks([5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55]) |
|
ax36.set_xticklabels([-35, -30, -25, -20, -15, -10, -5, 0, 5, 10, 15]) |
|
ax36.set_xlabel("distance from A-site [codon]") |
|
if log_style: |
|
ax36.set_ylabel("Dipeptide RUST ratio (observed/expected), log2") |
|
else: |
|
ax36.set_ylabel("Dipeptide RUST ratio (observed/expected)") |
|
ax36.axvline(40, color="red") |
|
|
|
|
|
|
|
def main(args): |
|
|
|
mRNA_sequences = args.transcriptome |
|
in_seq_handle = open(mRNA_sequences) |
|
cds_start_dict = {} |
|
cds_end_dict = {} |
|
seq_dict = {} |
|
for line in in_seq_handle: |
|
if line[0] != ">": |
|
seq_dict.setdefault(transcript, "") |
|
seq_dict[transcript] += line[:-1] |
|
continue |
|
try: |
|
transcript_split = line[:-1].split("\t") |
|
transcript = transcript_split[0][1:] |
|
cds_start_dict[transcript] = int(transcript_split[1]) |
|
cds_end_dict[transcript] = int(transcript_split[2]) |
|
except: |
|
pass |
|
in_seq_handle.close() |
|
|
|
offset = args.offset |
|
readlen_range = args.lengths |
|
readlen_rangesplit = readlen_range.split(":") |
|
if len(readlen_rangesplit) == 1: |
|
accepted_read_lengths = [int(readlen_rangesplit[0])] |
|
length_values = "%s" % int(readlen_rangesplit[0]) |
|
elif len(readlen_rangesplit) == 2: |
|
accepted_read_lengths = [ |
|
readlen |
|
for readlen in range( |
|
int(readlen_rangesplit[0]), int(readlen_rangesplit[1]) + 1 |
|
) |
|
] |
|
length_values = "%s_%s" % ( |
|
int(readlen_rangesplit[0]), |
|
int(readlen_rangesplit[1]), |
|
) |
|
else: |
|
stop_err( |
|
"Lengths of footprints parameter not in correct format, it should be either colon seperated with the second value greater or equal to the first, (28:32) or a single interger (31)" |
|
) |
|
if len(accepted_read_lengths) == 0: |
|
stop_err( |
|
"Lengths of footprints parameter not in correct format, it should be either colon seperated with the second value greater or equal to the first, (28:32) or a single interger (31)" |
|
) |
|
|
|
amino_acids = [ |
|
"A", |
|
"C", |
|
"E", |
|
"D", |
|
"G", |
|
"F", |
|
"I", |
|
"H", |
|
"K", |
|
"M", |
|
"L", |
|
"N", |
|
"Q", |
|
"P", |
|
"S", |
|
"R", |
|
"T", |
|
"W", |
|
"V", |
|
"Y", |
|
] |
|
aligments_A1 = pysam.Samfile( |
|
args.alignment, "rb" |
|
) |
|
|
|
dipeptide_enrichment_dict = {} |
|
dipeptide_enrichment_expected_dict = {} |
|
for amino_acid in amino_acids: |
|
for amino_acid2 in amino_acids: |
|
|
|
dipeptide = "%s%s" % (amino_acid, amino_acid2) |
|
dipeptide_enrichment_dict[dipeptide] = {} |
|
dipeptide_enrichment_expected_dict[dipeptide] = [] |
|
for number in range(0, 60, 1): |
|
dipeptide_enrichment_dict[dipeptide][number] = [0.0, 0.0] |
|
|
|
list_transcripts = seq_dict.keys() |
|
number_transcripts = 0 |
|
list_10_percentile = [] |
|
for value in range(1, 10): |
|
list_10_percentile.append((len(list_transcripts) * value) / 10) |
|
for transcript in list_transcripts: |
|
number_transcripts += 1 |
|
if number_transcripts in list_10_percentile: |
|
sys.stdout.write( |
|
"%s percent\n" |
|
% ((list_10_percentile.index(number_transcripts) + 1) * 10) |
|
) |
|
|
|
try: |
|
cds_start = cds_start_dict[transcript] |
|
cds_end = cds_end_dict[transcript] |
|
if cds_end < cds_start: |
|
raise Exception |
|
except Exception: |
|
transcript_seq = seq_dict[transcript] |
|
cds_start = -1 |
|
start_post = [] |
|
end_post = [] |
|
for match in re.finditer(r"(?=(%s))" % re.escape("ATG"), transcript_seq): |
|
start_post.append(match.start()) |
|
for match in re.finditer(r"(?=(%s))" % re.escape("TAG"), transcript_seq): |
|
end_post.append(match.start()) |
|
for match in re.finditer(r"(?=(%s))" % re.escape("TAA"), transcript_seq): |
|
end_post.append(match.start()) |
|
for match in re.finditer(r"(?=(%s))" % re.escape("TGA"), transcript_seq): |
|
end_post.append(match.start()) |
|
|
|
end_post.sort() |
|
len_max_orf = 0 |
|
for value in start_post: |
|
for value2 in end_post: |
|
if value < value2: |
|
if value % 3 == value2 % 3: |
|
len_orf = value2 - value |
|
if len_orf > len_max_orf: |
|
cds_start = value |
|
cds_end = value2 + 3 |
|
len_max_orf = len_orf |
|
break |
|
if cds_start == -1: |
|
|
|
continue |
|
|
|
elongation_region_all = seq_dict[transcript][cds_start:cds_end] |
|
elongation_region_part = elongation_region_all[ |
|
120:-60 |
|
] |
|
if len(elongation_region_part) % 3 != 0: |
|
|
|
continue |
|
peptide_sequence = translate_dna(elongation_region_all) |
|
|
|
profile_list = [ |
|
0.0 for n in range(cds_start + 120, cds_end - 60) |
|
] |
|
if len(profile_list) < 50: |
|
|
|
continue |
|
all_reads = aligments_A1.fetch(transcript) |
|
|
|
len_elongation_region = len(profile_list) |
|
for read in all_reads: |
|
readlen = read.qlen |
|
if readlen not in accepted_read_lengths: |
|
continue |
|
A_site = read.pos + offset - cds_start - 120 |
|
if len_elongation_region > A_site > -1: |
|
profile_list[A_site] += 1 |
|
average_gene_density = float(sum(profile_list)) / len( |
|
profile_list |
|
) |
|
|
|
if average_gene_density != 0: |
|
num_codon = len( |
|
[ |
|
1 |
|
for number88 in range(0, len(profile_list), 3) |
|
if ( |
|
( |
|
profile_list[number88] |
|
+ profile_list[number88 + 1] |
|
+ profile_list[number88 + 2] |
|
) |
|
/ 3 |
|
) |
|
> average_gene_density |
|
] |
|
) |
|
|
|
expected_codon_density = float(num_codon) / ( |
|
len(profile_list) / 3 |
|
) |
|
|
|
peptide_start = 0 |
|
for sliding_w_n in range( |
|
0, len(elongation_region_part), 3 |
|
): |
|
amino_window = str(peptide_sequence[peptide_start : peptide_start + 60]) |
|
if len(set(amino_window) - set(amino_acids)) != 0: |
|
peptide_start += 1 |
|
continue |
|
|
|
if ( |
|
profile_list[sliding_w_n] |
|
+ profile_list[sliding_w_n + 1] |
|
+ profile_list[sliding_w_n + 2] |
|
) / 3 > average_gene_density: |
|
for number in range(0, 59): |
|
amino_acid_2 = amino_window[number : number + 2] |
|
dipeptide_enrichment_dict[amino_acid_2][number][0] += 1 |
|
dipeptide_enrichment_dict[amino_acid_2][number][1] += 1 |
|
else: |
|
for number in range(0, 59): |
|
amino_acid_2 = amino_window[number : number + 2] |
|
dipeptide_enrichment_dict[amino_acid_2][number][0] += 1 |
|
|
|
amino_acid_2 = amino_window[40:42] |
|
dipeptide_enrichment_expected_dict[amino_acid_2].append( |
|
expected_codon_density |
|
) |
|
peptide_start += 1 |
|
|
|
alignment_filename = args.alignment.split("/")[-1] |
|
if not os.path.exists(args.Path): |
|
os.mkdir(args.Path) |
|
|
|
outfile = open( |
|
"%s/RUST_dipeptide_file_%s_%s_%s" |
|
% (args.Path, alignment_filename, args.offset, length_values), |
|
"w", |
|
) |
|
outfile.write("dipeptide, expected value") |
|
for number106 in range(-40, 20): |
|
outfile.write(", %s" % number106) |
|
outfile.write("\n") |
|
|
|
list_codons = [] |
|
list_amino_acids = list(dipeptide_enrichment_dict.keys()) |
|
list_amino_acids.sort() |
|
rust_expected = [] |
|
rust_observed_metafootprint = [] |
|
for amino2 in list_amino_acids: |
|
if amino2 in list_codons: |
|
continue |
|
list_codons.append(amino2) |
|
outfile.write("%s" % amino2) |
|
if dipeptide_enrichment_expected_dict[amino2] != []: |
|
outfile.write( |
|
", %s" % mean_value(dipeptide_enrichment_expected_dict[amino2]) |
|
) |
|
list_data = [] |
|
for number in range(0, 60): |
|
if dipeptide_enrichment_dict[amino2][number][0] != 0: |
|
outfile.write( |
|
", %s" |
|
% ( |
|
dipeptide_enrichment_dict[amino2][number][1] |
|
/ dipeptide_enrichment_dict[amino2][number][0] |
|
) |
|
) |
|
list_data.append( |
|
dipeptide_enrichment_dict[amino2][number][1] |
|
/ dipeptide_enrichment_dict[amino2][number][0] |
|
) |
|
else: |
|
outfile.write(", 0") |
|
list_data.append(0) |
|
outfile.write("\n") |
|
rust_expected.append(mean_value(dipeptide_enrichment_expected_dict[amino2])) |
|
rust_observed_metafootprint.append(list_data) |
|
|
|
rust_expected_sum = sum(rust_expected) |
|
q_values = [n / rust_expected_sum for n in rust_expected] |
|
|
|
shannon_values = [] |
|
for loc_i in range(60): |
|
rust_observed = [n[loc_i] for n in rust_observed_metafootprint] |
|
rust_observed_sum = sum(rust_observed) |
|
rust_observed_min = min(rust_observed) |
|
if rust_observed_min == 0: |
|
shannon_values.append("NA") |
|
else: |
|
p_values = [n / rust_observed_sum for n in rust_observed] |
|
shannon = [] |
|
list_normalised = [] |
|
for p_value, q_value in zip(p_values, q_values): |
|
shannon.append(abs(p_value * math.log((p_value / q_value), 2))) |
|
list_normalised.append(p_value / q_value) |
|
shannon_values.append(sum(shannon)) |
|
|
|
outfile.write("\nKullback Leibler divergence,") |
|
for value in shannon_values: |
|
outfile.write(", %s" % value) |
|
outfile.close() |
|
|
|
try: |
|
mpl.rcParams["xtick.direction"] = "out" |
|
mpl.rcParams["ytick.direction"] = "out" |
|
mpl.rcParams["legend.fontsize"] = 10 |
|
mpl.rcParams["ytick.labelsize"] = 10 |
|
mpl.rcParams["xtick.labelsize"] = 10 |
|
mpl.rcParams["font.size"] = 10 |
|
mpl.rcParams["axes.titlesize"] = 10 |
|
mpl.rcParams["legend.frameon"] = 0 |
|
mpl.rcParams["axes.axisbelow"] = False |
|
mpl.rcParams["xtick.major.pad"] = 2.0 |
|
mpl.rcParams["ytick.major.pad"] = 2 |
|
mpl.rcParams["xtick.major.size"] = 2.0 |
|
mpl.rcParams["ytick.major.size"] = 2 |
|
mpl.rcParams["axes.linewidth"] = 0.5 |
|
mpl.rcParams["ytick.major.width"] = 0.25 |
|
mpl.rcParams["xtick.major.width"] = 0.25 |
|
mpl.rcParams["lines.linewidth"] = 1 |
|
mpl.rcParams["legend.borderpad"] = 0.01 |
|
mpl.rcParams["legend.labelspacing"] = 0.05 |
|
mpl.rcParams["legend.columnspacing"] = 0.5 |
|
mpl.rcParams["legend.borderaxespad"] = 0.15 |
|
mpl.rcParams["legend.handlelength"] = 1 |
|
|
|
fig = plt.figure(figsize=(6.69, 6.0)) |
|
infileopen = open( |
|
"%s/RUST_dipeptide_file_%s_%s_%s" |
|
% (args.Path, alignment_filename, args.offset, length_values) |
|
) |
|
ax1_metafootprint = fig.add_subplot(111) |
|
RUST_metagene_plot(infileopen, ax1_metafootprint) |
|
plt.savefig( |
|
"%s/RUST_dipeptide_metafootprint_%s_%s_%s.png" |
|
% (args.Path, alignment_filename, args.offset, length_values) |
|
) |
|
|
|
except: |
|
sys.stdout.write("Error producing images\n") |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser( |
|
description="Produces RUST metagene profile of dipeptides" |
|
) |
|
parser.add_argument("--version", action="version", version="%(prog)s 1.2") |
|
|
|
parser.add_argument( |
|
"-t", |
|
"--transcriptome", |
|
help="fasta file of transcripts, CDS start and end may be provided on description line using tab separation e.g. >NM_0001 10 5000, otherwise it searches for longest ORF" |
|
", required=True", |
|
) |
|
parser.add_argument( |
|
"-a", |
|
"--alignment", |
|
help="sorted bam file of transcriptome alignments", |
|
required=True, |
|
) |
|
parser.add_argument("-o", "--offset", help="nucleotide offset to A-site", type=int) |
|
parser.add_argument( |
|
"-l", |
|
"--lengths", |
|
help="lengths of footprints included, for example 28:32 is 28,29,30,31,32", |
|
) |
|
parser.add_argument( |
|
"-P", |
|
"--Path", |
|
help='path to outputfile, default is "dipeptide"', |
|
default="dipeptide", |
|
) |
|
args = parser.parse_args(None) |
|
main(args) |
|
|