import sys, os, numpy
from collections import defaultdict
from copy import deepcopy
import pandas
from jade.nnk import NNKAbMaturation
class NNKEnrichments(object):
    """
    Simple class that holds all the enrichment data for a particular class, antibody, and antigen.

    Attributes set by the constructor:

    - ``df``: 2D ``pandas.DataFrame`` of natural-log enrichments. The methods of this class
      look columns up by ``str(position)`` and rows by the amino-acid code.
    - ``data_1D``: the 1D data returned by the loader's ``get_1d_data_tuple_freq_nnk_data``,
      stored unchanged.
    """

    def __init__(self, data_dir, zeros = -2.0, class_type = 'VRC01', antibody = 'glCHA31', antigen = 'GT81', sort = 'S1'):
        """
        Load the sort data and convert the 2D enrichments to natural-log space.

        :param data_dir: the directory with sort data. Each set of data + antibody should be in a separate directory (Ex: glCHA31, etc.)
        :param zeros: The value substituted where the enrichment is zero (log(0) = -inf). This is the
            log(enrichment); -2.0 corresponds to an enrichment of about 0.135 (e**-2, natural log).
        :param class_type: Not used by this constructor; kept so existing callers keep working.
        :param antibody: Antibody name handed to the data loader.
        :param antigen: Antigen name handed to the loader's data accessors.
        :param sort: Sort name handed to the loader's data accessors.
        """
        data_loader = NNKAbMaturation.GetNNKData(data_dir, antibody)
        raw = data_loader.get_2D_data_freq_nnk_data(antigen=antigen, sort=sort)

        # Validate before touching the data: the previous check ran only after DataFrame
        # methods had already been called on the object, so it could never trigger.
        if not isinstance(raw, pandas.DataFrame):
            sys.exit("NNKEnrichments: expected a pandas.DataFrame of 2D enrichment data, got "
                     + type(raw).__name__)

        # Zero enrichments are expected and are replaced right below, so silence only the
        # divide-by-zero warning that log(0) would otherwise emit.
        with numpy.errstate(divide='ignore'):
            logged = numpy.log(raw.astype(float))

        self.df = logged.replace(-numpy.inf, float(zeros))
        self.data_1D = data_loader.get_1d_data_tuple_freq_nnk_data(antigen=antigen, sort=sort)

    def max(self, position):
        """
        Get the maximum enrichment at a particular position, and the amino acid.
        Note: if several amino acids share the maximum, only the first one is reported.

        :param position: Position; converted with str() to select the column.
        :return: tuple of (log enrichment, amino acid row label)
        """
        max_index = self.df[str(position)].idxmax()
        return self.value(position, max_index), max_index

    def min(self, position):
        """
        Get the minimum enrichment at a particular position, and the amino acid.
        Note: There may be multiple minimum amino acids - only the first one is reported.

        :param position: Position; converted with str() to select the column.
        :return: tuple of (log enrichment, amino acid row label)
        """
        min_index = self.df[str(position)].idxmin()
        return self.value(position, min_index), min_index

    def value(self, position, three_letter_code):
        """
        Get the enrichment value of a particular position and code.

        :param position: Position; converted with str() to select the column.
        :param three_letter_code: Row label of the amino acid.
        :return: the log enrichment stored in that cell
        """
        # DataFrame.get_value was removed in pandas 1.0; .at is the scalar accessor.
        return self.df.at[three_letter_code, str(position)]

    def mean(self, position):
        """
        Get the mean log enrichment over all amino acids at a position.

        :param position: Position; converted with str() to select the column.
        :return: mean of that column
        """
        return numpy.mean(self.df[str(position)])

    def calculate_factors(self):
        """
        Return a dataframe of calculated factors; self.df is left untouched.

        Factor is Sergeys definition:
        (P-M)/MAD = scaling factor; where
        P - total propensity for amino acid at this position,
        M - mean total propensity for all amino acids at this position
        MAD - mean absolute deviation for propensities at this position.

        Positions whose MAD is zero (every amino acid has the same value) get a factor of 0.

        :rtype: pandas.DataFrame
        """
        factors = self.df.copy()
        for pos in self.df.columns:
            column = self.df[pos]
            m = column.mean()
            mad = numpy.absolute(column - m).mean()
            if mad == 0:
                # No spread at this position; avoid dividing by zero.
                factors[pos] = 0.0
            else:
                factors[pos] = (column - m) / mad
        return factors
def combine_enrichments( list_of_nnk_enrichments, additive_combine = False):
    """
    Combine a list of nnk_enrichments into a single one.

    A single-element list returns that element itself (not a copy). Otherwise a deep copy of
    the first element is returned with ``df`` and ``data_1D`` replaced by the combined data;
    the inputs are not modified.

    The 2D frames are stacked and grouped by row label, so rows with the same label are
    combined column by column. The 1D data are stacked into one array and combined
    element-wise along the first axis.

    @type list_of_nnk_enrichments: [NNKEnrichments]
    :param additive_combine: If True the data are summed; otherwise they are averaged.
    :raises IndexError: if list_of_nnk_enrichments is empty.
    :rtype: NNKEnrichments
    """
    # Single-argument print(...) emits the same text under both Python 2 and Python 3.
    print("Len: "+repr(len(list_of_nnk_enrichments)))

    if not list_of_nnk_enrichments:
        # Same exception type the bare [0] lookup used to raise, with a usable message.
        raise IndexError("combine_enrichments needs at least one NNKEnrichments to combine")

    if len(list_of_nnk_enrichments) == 1:
        return list_of_nnk_enrichments[0]

    new_enrich = deepcopy(list_of_nnk_enrichments[0])

    grouped_2D = pandas.concat([enrich.df for enrich in list_of_nnk_enrichments]).groupby(level=0)
    stacked_1D = numpy.array([enrich.data_1D for enrich in list_of_nnk_enrichments])

    if additive_combine:
        new_enrich.df = grouped_2D.sum()
        new_enrich.data_1D = numpy.sum(stacked_1D, axis=0)
    else:
        new_enrich.df = grouped_2D.mean()
        new_enrich.data_1D = numpy.mean(stacked_1D, axis=0)

    return new_enrich