Source code for jade.rosetta_jade.ScoreFiles

import json

from collections import OrderedDict

import pandas
import ast
import re

from jade.pymol_jade.PyMolScriptWriter import *
from jade.basic.pandas.PandasDataFrame import *
from jade.basic.plotting.MakeFigure import *
from jade.basic.path import *
from jade.utility.string_util import *

##Original Author: Luki Goldschmidt <lugo@uw.edu>
##Forked by Jared Adolf-Bryfogle.
##Has been completely refactored to work with pandas Dataframes

[docs]class ScoreFile: def __init__(self, filename): self.filename = filename if re.search("score_", filename): self.name = ".".join(os.path.basename(self.filename).split('.')[0:-1]).replace('score_', "") else: self.name = ".".join(os.path.basename(self.filename).split('.')[0:-1]).replace('score', "") self.decoys = [] #Main holder of data. Array of dict of scoreterms and 'decoy'] self.decoy_field_name = "decoy" self.decoy_dir = os.path.dirname(filename) #Assume filename is in decoy dir. This is not nessessarily the case... if filename == "-": lines = file(sys.stdin).readlines() else: lines = file(filename).readlines() header = lines[0] headerSP = lines[1].split() #print repr(headerSP) for line in lines: try: o = json.loads(line.replace("nan", "NaN")) # print o[self.decoy_field_name] # print repr(o) self.decoys.append(o) except Exception as e: ##Store as defaultdict instead of JSON. #print "Cannot load as regular JSON file! Parsing as old-school scorefile instead: "+ str(e) d = defaultdict() values = line.split() if len(values) != len(headerSP): if len(values) == 1 and values[0] =="SEQUENCE:": continue print >> sys.stderr, "Failed to parse JSON object or as regular score file; skipping line:\n", line else: for i in range(0, len(values)): k = headerSP[ i ] if k == "description": k = "decoy" if values[i] == "SCORE:": continue d[ k ] = deduce_str_type(values[i]) self.decoys.append(d) #print repr(self.decoys)
[docs] def get_decoy_count(self):
return len(self.decoys)
[docs] def get_decoy_names(self):
return [str(r[self.decoy_field_name]) for r in self.decoys]
[docs] def get_scoreterm_names(self): r = [] for rec in self.decoys: for k in rec.keys(): if k != self.decoy_field_name and not k in r: r.append(str(k)) r.sort()
return r
[docs] def get_scoreterms(self, scoreterms=""): if type(scoreterms) == str: if scoreterms in ["", "*"]: scoreterms = self.get_scoreterm_names() scoreterms.sort() else: scoreterms = scoreterms.split(",") r = OrderedDict() for rec in self.decoys: scores = {} for scoreterm in scoreterms: scores[scoreterm] = rec.get(scoreterm) r[str(rec[self.decoy_field_name])] = scores
return r
[docs] def get_scoreterm(self, scoreterm): r = {} for rec in self.decoys: r[str(rec[self.decoy_field_name])] = rec.get(scoreterm)
return r
[docs] def get_stats(self, scoreterms="", decoy_names = None): print repr(len(decoy_names)) scores = self.get_scoreterms(scoreterms) stats = None if not decoy_names: decoy_names = self.get_decoy_names() #print repr(decoy_names) # Collect data for decoy_name in scores: if not decoy_name in [ os.path.basename(x) for x in decoy_names]: continue decoy = scores[decoy_name] if stats == None: columns = decoy.keys() stats = {k: [] for k in columns} for term in decoy: score = decoy[term] if not score is None and type(score) == float: stats[term].append(score) # Compute Stats calc_stats = OrderedDict() def median(s): s.sort() return s[int(len(s) / 2)] def stddev(s): mean = sum(s) / len(s) r = 0.0 for x in s: r += pow(x - mean, 2) return pow(r / len(s), 0.5) for column in columns: s = stats[column] calc_stats[column] = { 'n': len(s), 'mean': sum(s) / len(s) if len(s) > 0 else None, 'median': median(s) if len(s) > 0 else None, 'stddev': stddev(s) if len(s) > 0 else None, 'min': min(s) if len(s) > 0 else None, 'max': max(s) if len(s) > 0 else None }
return calc_stats
[docs] def get_ordered_decoy_list(self, scoreterm, decoy_names = None, top_n=-1, reverse=False, ): """ Get an ordered tuple of [[score, decoy_name], ...] Will automatically order some known scoreterms (hbonds_int, dSASA_int) :rtype: list[list] """ if not decoy_names: decoy_names = self.get_decoy_names() if scoreterm == "hbonds_int" or scoreterm == "dSASA_int": reverse = True
return sorted([[x[scoreterm], x[self.decoy_field_name]] for x in self.decoys if x[self.decoy_field_name] in decoy_names and scoreterm in x ], reverse=reverse)[:top_n]
[docs] def get_score(self, decoy, scoreterm): """ Get Score of a particular decoy and scoreterm :param decoy: str :param scoreterm: str :rtype: float """ for o in self.decoys: if o[self.decoy_field_name] == decoy:
return o[scoreterm]
[docs] def get_scores(self, scoreterm, decoy_names=None, top_n=-1, reverse=False): ordered_decoy_names = self.get_ordered_decoy_list(scoreterm, decoy_names, top_n, reverse)
return [self.get_score( decoy, scoreterm) for decoy in ordered_decoy_names]
[docs] def get_Dataframe(self, scoreterms=None, order_by="total_score", top_n=-1, reverse=True): """ Get data as a pandas dataframe. Definitely preferred now. :param scoreterms: list :param order_by: str :param top_n: int :param reverse: bool :rtype: pandas.DataFrame """ print self.name #print self.decoys df = pandas.DataFrame.from_dict(self.decoys) df = detect_numeric(df) #df.to_csv("debugging.csv", sep=",") df = df.sort_values(order_by, ascending=reverse)[0:top_n] if scoreterms: df = get_columns(df, scoreterms) df.name = self.name df.scorefile = self.filename df["experiment"] = self.name df["scorefile"] = self.filename if os.path.dirname(self.filename): df["decoy_path"] = os.path.dirname(self.filename)+"/"+df["decoy"] df["decoy_path"] = [get_decoy_path(p) for p in df["decoy_path"]] else: df["decoy_path"] = df["decoy"] df["decoy_path"] = [get_decoy_path(p) for p in df["decoy_path"]]
return df
[docs]def get_scorefiles(indir = os.getcwd()): """ Get Score files from a directory. Walk through all directories in directory. :param indir: str :rtype: list """ matches = [] matches.extend(glob.glob(indir+"/*.sc")) for root, dirnames, filenames in os.walk(indir): for dirname in dirnames: matches.extend( glob.glob(root+"/"+dirname+"/*.sc") )
return matches
[docs]def plot_score_vs_rmsd(df, title, outpath, score="total_score", rmsd="looprms", top_p=.95, reverse=True): """ Plot a typical Score VS RMSD using matplotlib, save it somewhere. Return the axes. By default, plot the top 95% :param df: pandas.DataFrame :param outpath: str :param score: str :param rmsd: str :rtype: matplotlib.Axes """
return plot_x_vs_y_sea_with_regression(df, title, outpath, score, rmsd, top_p=top_p, reverse=reverse)
[docs]def pymol_session_on_top_df(df, outdir, decoy_dir = None, scoreterm="total_score", top_n=10, decoy_column="decoy", native_path = None, out_prefix_override=None, ab_structure = False, superimpose = False, run_pymol = True ): """ Make a PyMol session (or setup a scripter) on top X using a dataframe. Return the scripter for extra control. df should have an attribute of 'name' or out_prefix_override should be set. :param df: pandas.DataFrame :param outdir: str :param decoy_dir: str :param scoreterm: str :param top_n: int :param decoy_column: str :param native_path: str :param out_prefix_override: str :param ab_structure: boolean :param superimpose: boolean :rtype: PyMolScriptWriter """ if not os.path.exists(outdir): os.mkdir(outdir) if out_prefix_override: session_prefix = out_prefix_override elif hasattr(df, "name"): session_prefix = df.name+"_" else: session_prefix = "pml_session_" print "WARNING: out_prefix_override not set and dataframe has no attribute name!" pymol_name = session_prefix+scoreterm print pymol_name ascending = True if scoreterm == "hbonds_int" or scoreterm == "dSASA_int": ascending = False df2 = df.sort(scoreterm, ascending = ascending) df2 = df2[0:top_n] #print df2['total_score'].tail() if decoy_dir: df2['decoy_path'] = decoy_dir+"/"+df2[decoy_column] elif 'decoy_path' in df2.columns: pass else: df2['decoy_path'] = df2[decoy_column] top_decoys = zip(list(df2[scoreterm]), list( df2[ 'decoy_path' ])) #print repr(top_decoys) scripter = make_pymol_session_on_top_scored(top_decoys, outdir, outdir, pymol_name, int(top_n), native_path, ab_structure, parellel=False, super = superimpose, run_pymol = run_pymol)
return scripter