Source code for jade.basic.sequence.SequenceStats

#Author: Jared Adolf-Bryfogle

import sys

from jade.basic.RestypeDefinitions import RestypeDefinitions


[docs]class SequenceStats: """ Class for getting data from an array of strings of sequences (one letter code) of equal length. """ def __init__(self, sequence_list,): self.sequence_list = sequence_list if len(self.sequence_list)==0: print "No sequences found." return self.stats = []; #Vector of a map of each amino acid self.freq = []; #Same as stats self.aas = RestypeDefinitions().get_all_one_letter_codes() self.init_data_map() self.compute_stats()
[docs] def set_sequences(self, sequence_list): """ Set a sequence list """ self.sequence_list = sequence_list
self.compute_stats()
[docs] def get_probability(self, position, aa): ''' Get probability of the current position (starting from 0) and aa ''' try: if not self.stats[position].has_key(aa): return 0 except IndexError: sys.exit("Position not found in stats. This is bad. We have a problem")
return self.stats[position][aa]
[docs] def get_consensus_sequence(self): consensus = "" for position in range(0, len(self.stats)): pos = position aa_letter = "" for aa in self.stats[pos]: prob = self.stats[pos][aa] if prob > .9: aa_letter = aa.upper() break elif prob > .2: aa_letter = aa.lower() break else: aa_letter = '-' continue consensus = consensus+aa_letter
return consensus
[docs] def get_frequency(self, position, aa):
return self.freq[position][aa] ##Private Methods##
[docs] def compute_stats(self): """ Compute frequency and probability (0-1) for each position for each amino acid """ if len(self.sequence_list)==0:return #print "Total Unique CDR Sequences: "+repr(len(self.sequence_list)) for position in range(0, len(self.sequence_list[0])): totals = self.return_initialized_total_map() for seq in self.sequence_list: totals[seq[position]]+=1 for aa in totals: self.stats[position][aa] = totals[aa]/float(len(self.sequence_list))
self.freq[position][aa] = totals[aa]
[docs] def init_data_map(self): """ Sets all probabilities 0 and appends each map to the stats vector """ for i in range(1, len(self.sequence_list[0])+1): prob = dict() freq = dict() for aa in self.aas: prob[aa] = 0 freq[aa] = 0 self.stats.append(prob)
self.freq.append(freq)
[docs] def return_initialized_total_map(self): totals = dict() for aa in self.aas: totals[aa] = 0
return totals