#!/usr/bin/python
#Author: Jared Adolf-Bryfogle (jadolfbr@gmail.com)
#This Script parses the log of MPI-run antibody_design benchmarks to get the frequency of clusters and lengths against a native database.
#It also gets DB frequences from our main databases and will soon parse the resultant length and cluster recoveries to determine the
# Risk Ratio of each experiment for conclusive statistics.
import gzip,os,sys,re,glob,pandas
try:
import sqlite3
import numpy
except ImportError:
print "Cannot import SQLITE3 or NUMPY. Pleas install to use pool data"
from collections import defaultdict
from optparse import OptionParser
import jade.RAbD_BM.tools_ab_db as pyig_tools
import jade.RAbD_BM.tools_features_db as feat_tools
from jade.rosetta_jade.BenchmarkInfo import BenchmarkInfo
from jade.RAbD_BM.AnalysisInfo import AnalysisInfo
from jade.RAbD_BM.AnalysisInfo import NativeInfo
from jade.basic.structure.Structure import AntibodyStructure
from jade.basic.structure.Structure import cdr_names
from jade.basic.structure.Structure import heavy_cdr_names
from jade.basic.numeric import *
[docs]class AnalyzeRecovery:
"""
Pools Recovery and RR data, outputs to DB
"""
def __init__(self, pyig_design_db_path, analysis_info, native_info, cdrs = None):
"""
Class for anlyzing length and cluster recovery and risk ratios.
Optionally pass in CDRs to control which CDRs are analyzed.
Otherwise, we read the info from RUN_SETTINGS.txt (which is used for benchmarking), which either has ALL or a specific CDR.
:param pyig_design_dbpath: str
:param analysis_info: AnalysisInfo
:param native_info: NativeInfo
:param cdrs: [str]
"""
cdrs = ["L1", "L2", "L3", "H1", "H2"]
if not isinstance(analysis_info, AnalysisInfo): sys.exit()
if not isinstance(native_info, NativeInfo): sys.exit()
self.pyig_db_path = pyig_design_db_path
self.analysis_info = analysis_info
self.native_info = native_info
self.lambda_kappa_dict = defaultdict()
self.lambda_kappa_dict["lambda"] = native_info.lambda_pdbids
self.lambda_kappa_dict["kappa"] = native_info.kappa_pdbids
## Set the CDRs we will need. (WE NEED TO MAKE SURE WE HAVE H# OR NOT!!!)
if cdrs:
self.cdrs = cdrs
elif self.analysis_info.bm_info.settings["CDR"] == "ALL":
self.cdrs = cdr_names
else:
self.cdrs = [self.analysis_info.bm_info.settings["CDR"]]
self.recovery_types = ["length", "cluster"]
##Create empty result dataframes.
#Final Data
self.recovery_df = pandas.DataFrame()
self.observed_df = pandas.DataFrame()
self.represented_df = pandas.DataFrame()
##Initialize the calculators
self.recovery_calc = TopRecoveryCalculator(self.native_info.get_features_db())
self.observed_calc = ObservedRecoveryCalculator(self.native_info.get_features_db())
self.initialize()
[docs] def initialize(self):
"""
Initialize ALL input data before calculating and outputing everything.
"""
if not os.path.exists(self.native_info.get_features_db()):
sys.exit("native_db path not good: "+self.native_info.get_features_db())
self.recovery_df = self.recovery_calc.apply(
self.analysis_info.get_exp(),
self.native_info.pdbids,
self.cdrs,
self.analysis_info.get_features_db())
self.observed_df = self.observed_calc.apply(
self.analysis_info.get_exp(),
self.native_info.pdbids,
self.cdrs,
self.analysis_info.get_decoy_dir())
[docs] def apply(self, db_path, drop_tables = False):
"""
Calculate and Output all the data to the given database.
:param db_path: str
"""
print "opening "+db_path
db_con = sqlite3.connect(db_path)
table_action = "replace" if drop_tables else "append"
if drop_tables:
for table in ["full_data", "by_cdr_data", "by_exp_data"]:
db_con.execute("DROP TABLE IF EXISTS "+table)
full_df = calculate_recovery_and_risk_ratios(self.recovery_df,self.observed_df)
cdr_df = calculate_per_cdr_rr_and_recovery(self.analysis_info.get_exp(), self.cdrs, full_df)
exp_df = calculate_exp_rr_and_recovery(self.analysis_info.get_exp(), full_df)
#Write them all out to sql
full_df.to_sql("full_data", db_con, if_exists=table_action)
cdr_df.to_sql("by_cdr_data", db_con, if_exists=table_action)
exp_df.to_sql("by_exp_data", db_con, if_exists=table_action)
db_con.close()
######### Recovery Calculators
[docs]class RecoveryCalculator(object):
def __init__(self, native_db_path):
"""
Super simple base class for recovery calculators.
"""
self.all_cdrs = cdr_names
self.heavy_cdrs = heavy_cdr_names
self.Antibody = AntibodyStructure()
print "Opening "+native_db_path
self.native_df = feat_tools.get_cdr_cluster_df(native_db_path)
#print "Native Dataframe: "
#print self.native_df
#IMPLEMENT THIS:
#def apply(self):
# """
# Each calculator implements its own version of apply (which may take different arguments.
# Pythons argument dictionary is horrible, and I hate it, so instead, this is a comment
#
# Each calculator also returns the resultant dataframe.
#
# :return: pandas.DataFrame
# """
# pass
[docs]class TopRecoveryCalculator(RecoveryCalculator):
def __init__(self, native_db_path):
"""
Calculate length and cluster recovery from the features database and native database,
which has the cluster and length info.
"""
RecoveryCalculator.__init__(self, native_db_path)
self.bm_df = pandas.DataFrame
[docs] def apply(self, exp_name, pdbids, cdrs, bm_db_path, output_dir = "data" ):
"""
Calculate length and cluster recoveries. Store them the same way we used to for the recovery parser.
Returns the resulting dataframe of recoveries.
:rtype: pandas.DataFrame
"""
#Reset Data.
self.bm_df = pandas.DataFrame
#Calculate
exp_name = exp_name
print "Getting BM data: "+bm_db_path
if not os.path.exists(bm_db_path):
sys.exit("BM DB does not exist!")
bm_df = feat_tools.get_cdr_cluster_df(bm_db_path)
#print self.native_df
flat_dict = defaultdict(list)
for pdbid in pdbids:
for cdr in cdrs:
print pdbid+" "+cdr
native_length = feat_tools.get_length(self.native_df, pdbid, cdr)
native_cluster = feat_tools.get_cluster(self.native_df, pdbid, cdr)
#print "Native: "+repr(native_length)
#print "Cluster: "+repr(native_cluster)
total_entries = feat_tools.get_total_entries(bm_df, pdbid, cdr)
length_recovery = feat_tools.get_length_recovery(bm_df, pdbid, cdr, native_length)
cluster_recovery = feat_tools.get_cluster_recovery(bm_df, pdbid, cdr, native_cluster)
#print "length_recovery: "+str(length_recovery)
#print "cluster_recovery: "+str(cluster_recovery)
flat_dict['length_recovery_freq'].append(length_recovery)
flat_dict['cluster_recovery_freq'].append(cluster_recovery)
flat_dict['total_entries'].append(total_entries)
flat_dict['pdbid'].append(pdbid)
flat_dict['cdr'].append(cdr)
flat_dict['exp'].append(exp_name)
self.bm_df = bm_df
result_df = pandas.DataFrame.from_dict(flat_dict)
##Convert into something easily converted to pandas and output CSV.
print "Recoveries calculated."
self.native_df.to_csv(output_dir+"/native_clusters.csv")
columns = ['exp', 'pdbid', 'cdr', 'length_recovery_freq', 'cluster_recovery_freq', 'total_entries']
out = output_dir+"/all_recoveries.csv"
if os.path.exists(out):
ftype = 'a'
header = False
else:
ftype = 'w'
header = True
OUTFILE = open(out, ftype)
result_df.to_csv(OUTFILE, columns=columns, header=header)
OUTFILE.close()
return result_df
[docs]class ObservedRecoveryCalculator(RecoveryCalculator):
def __init__(self, native_db_path):
"""
Calculates the number of times the native clusters and lengths were observed during the experiment, for each PDB.
"""
RecoveryCalculator.__init__(self, native_db_path)
[docs] def apply(self, exp_name, pdbids, cdrs, bm_decoy_path, output_dir = "data"):
"""
Calculates the number of times the native clusters and lengths were observed during the experiment, for each PDB.
Returns the resulting dataframe.
:rtype: pandas.DataFrame
"""
flat_dict = defaultdict(list)
data_types = ["total_grafts", "native_lengths_observed", "native_clusters_observed"]
for pdbid in pdbids:
print pdbid
totals = defaultdict()
clusters = defaultdict()
lengths = defaultdict()
##Initialize native lengths and clusters to do the check.
for cdr in cdrs:
clusters[cdr] = feat_tools.get_cluster(self.native_df, pdbid, cdr)
lengths[cdr] = feat_tools.get_length(self.native_df, pdbid, cdr)
##Initialize the dictionary that we will use to do the counts. This dictionary is nessessary to do the counts!
totals[cdr] = defaultdict()
totals[cdr]["total_grafts"] = 0
totals[cdr]["native_lengths_observed"] = 0
totals[cdr]["native_clusters_observed"] = 0
filenames = get_decoys(bm_decoy_path, pdbid)
for filename in filenames:
total_grafts = 0
if os.path.basename(filename).split('.')[-1] == "gz":
INFILE = gzip.open(filename, 'rb')
else:
INFILE = open(filename, 'r')
for line in INFILE:
line = line.strip()
if not line or line.startswith("ATOM"): continue
if re.search(" DATA GRAFT_CLOSURE ", line):
total_grafts+=1
lineSP = line.split()
data_index = 0
for i in range(0, len(lineSP)):
if lineSP[i]=="DATA":
data_index = i
break
input_tag = lineSP[data_index+3]
cluster = lineSP[data_index+5]
cdr = cluster.split("-")[0]
length = int(cluster.split("-")[1])
# Now, Add the data to self.exp_data
#print "Adding data for input_tag "+input_tag
#print "Adding data for cdr " +cdr
totals[cdr]["total_grafts"] += 1
if length == lengths[cdr]:
totals[cdr]["native_lengths_observed"] += 1
if cluster == clusters[cdr]:
totals[cdr]["native_clusters_observed"] += 1
#print "grafts: "+repr(total_grafts)
INFILE.close()
###Add to flattened dic.
for cdr in cdrs:
flat_dict["pdbid"].append(pdbid)
flat_dict["cdr"].append(cdr)
flat_dict["exp"].append(exp_name)
for data_type in data_types :
#print pdbid
#print cdr
#print exp_name
flat_dict[data_type].append(totals[cdr][data_type])
##Now we unflatten dictionary and turn it into a nice dataframe.
columns = ["exp", "pdbid", "cdr"]
columns.extend(data_types)
result_df = pandas.DataFrame.from_dict(flat_dict)
out = output_dir+"/all_observed.csv"
if os.path.exists(out):
ftype = 'a'
header = False
else:
ftype = 'w'
header = True
OUTFILE = open(out, ftype)
result_df.to_csv(OUTFILE, columns=columns, header=header)
return result_df
[docs]class PyIgClassifyDBRepresentationCalculator(RecoveryCalculator):
def __init__(self, native_db_path):
"""
Calculates the number of times lengths and clusters are present in the PyIgClassify database.
:return:
"""
RecoveryCalculator.__init__(self, native_db_path)
[docs] def apply(self, exp_name, cdrs, pyig_db_path, lambda_kappa_dict, output_dir = "data"):
"""
Calculates the number of times lengths and clusters are present in the PyIgClassify database.
:param lambda_kappa_dict : dict-like ['lambda'] = [pdbid,]
:rtype: pandas.DataFrame
"""
flat_dict = defaultdict(list)
cdr_data_df = pyig_tools.get_cdr_data_table_df(pyig_db_path)
for light_gene in lambda_kappa_dict:
for pdbid in lambda_kappa_dict[light_gene]:
for cdr in cdrs:
gene = "heavy" if cdr in self.heavy_cdrs else light_gene
native_length = feat_tools.get_length(self.native_df, pdbid, cdr)
native_cluster = feat_tools.get_cluster(self.native_df, pdbid, cdr)
length_enrichment = pyig_tools.get_length_enrichment(cdr_data_df, gene, cdr, native_length)
cluster_enrichment = pyig_tools.get_cluster_enrichment(cdr_data_df, gene, cdr, native_cluster)
total_entries = pyig_tools.get_total_entries(cdr_data_df, gene, cdr)
flat_dict["exp"].append(exp_name)
flat_dict["pdbid"].append(pdbid)
flat_dict["cdr"].append(cdr)
flat_dict["gene"].append(gene)
flat_dict["length_representation"].append(length_enrichment)
flat_dict["cluster_representation"].append(cluster_enrichment)
flat_dict["total_entries_of_gene_and_cdr"].append(total_entries)
##Now we unflatten dictionary and turn it into a nice dataframe.
columns = ["exp", "pdbid", "gene", "cdr", "length_representation", "cluster_representation", "total_entries_of_gene_and_cdr"]
result_df = pandas.DataFrame.from_dict(flat_dict)
out = output_dir+"/pyigclassify_db_representation.txt"
if os.path.exists(out):
ftype = 'a'
header = False
else:
ftype = 'w'
header = True
OUTFILE = open(out, ftype)
result_df.to_csv(OUTFILE, columns=columns, header=header)
return result_df
####### Combine data and get risk ratios, recovery percentage, etc.
[docs]def calculate_recovery_and_risk_ratios(top_recovery_df, observed_df):
"""
Calculate the Risk Ratio and Recovery Percent for each pdb/cdr given dataframes output by the calculators below.
Return a merged dataframe of the top recovery and observed, with the resulting risk ratio data.
:param top_recovery_df: pandas.DataFrame
:param observed_df: pandas.DataFrame
:rtype: pandas.DataFrame
"""
df = pandas.merge(top_recovery_df, observed_df, how="outer")
for rtype in ["length", "cluster"]:
df[rtype+'_recovery'] = (df[rtype+'_recovery_freq']/df['total_entries'].astype('float'))
df[rtype+'_rr'] = df[rtype+'_recovery']/(df['native_'+rtype+'s_observed']/df['total_grafts'].astype('float'))
return df
[docs]def calculate_per_cdr_rr_and_recovery(exp, cdrs, result_df):
"""
Calculate the recovery and risk-ratios PER CDR.
:rtype: pandas.DataFrame
"""
flat_dict = defaultdict(list)
for cdr in cdrs:
flat_dict['exp'].append(exp)
flat_dict['cdr'].append(cdr)
for rtype in ['length', 'cluster']:
flat_dict[rtype+'_rr'].append(numpy.mean(result_df[(result_df['cdr'] == cdr)][rtype+'_rr']))
flat_dict[rtype+'_recovery'].append(numpy.mean(result_df[(result_df['cdr'] == cdr)][rtype+'_recovery']))
return pandas.DataFrame.from_dict(flat_dict)
[docs]def calculate_exp_rr_and_recovery(exp, result_df):
"""
Calculate the overall recovery and risk ratio.
:param exp:
:param result_df:
:rtype: pandas.DataFrame
"""
flat_dict = defaultdict(list)
flat_dict["exp"].append(exp)
for rtype in ['length', 'cluster']:
flat_dict[rtype+'_rr'].append(numpy.mean(result_df[result_df['exp'] == exp][rtype+'_rr']))
flat_dict[rtype+'_recovery'].append(numpy.mean(result_df[result_df['exp'] == exp][rtype+'_recovery']))
return pandas.DataFrame.from_dict(flat_dict)
#########
[docs]def get_decoys(input_dir, pdbid):
"""
Use GLOB to Match on pdbid for file names in the input dir.
This should skip all the extra PDBs like excn, initial, relax, etc.
:param input_dir: str
:param tag: str
"""
if not os.path.exists(input_dir):
sys.exit("MPI Log Dir does not exist!")
search_name = input_dir+"/*"+pdbid+"*.pdb*"
#print search_name
final_filenames = []
names = glob.glob(search_name)
for pdb in names:
if re.search("initial_benchmark_perturbation", pdb) or re.search("excn", pdb):
continue
else:
final_filenames.append(pdb)
return final_filenames