# Source code for jade.RAbD.AnalyzeAntibodyDesigns

import sqlite3

import pandas


# PyIgD
from jade.pymol_jade.PyMolScriptWriter import *
from jade.antibody.cdr_data.CDRDataTypes import *
from jade.antibody.decoy_data.DecoyDataTypes import *
from jade.basic.sequence import fasta
from jade.basic.filters.DataFilters import *
from jade.basic.filters.FilterSettings import *
from jade.basic.threading.Threader import *
from jade.basic.pandas import PandasDataFrame
from jade.RAbD_BM.AnalysisInfo import *

# Rosetta Tools
import jade.rosetta_jade.FeaturesJsonCreator as json_creator


[docs]class CompareAntibodyDesignStrategies:
    """
    Class mainly for comparing different Antibody Design strategies using our Features Databases.
    """

    def __init__(self, db_dir, out_dir_name, strategies=None, jsons=None):
        """
        Store the construction options and initialise all default settings.

        :param db_dir: Directory that is searched for the features databases.
        :param out_dir_name: Name used for the output sub-directories (and as
            the default plot name in run_features).
        :param strategies: Optional list of strategy names.  A list passed in
            is stored as-is (not copied); when omitted, a new empty list is
            created for this instance.
        :param jsons: Optional list of AnalysisInfo objects, see
            set_strategies_from_json_infos.  Same list handling as strategies.
        """

        #Init construction options
        self.db_dir = StringVar(value=db_dir)
        self.out_dir_name = StringVar(value=out_dir_name)
        self.native_path = None

        # The set_strategies_from_* methods append to self.strategies, so a
        # shared default list would leak strategies between instances.
        self.strategies = [] if strategies is None else strategies
        self.jsons = [] if jsons is None else jsons

        #Init Classes and data
        self.db_paths = defaultdict()
        self.filter_settings = FilterSettings()

        #Init Components
        self._init_default_options()
        self._init_paths()
        self._init_scores()
        self._init_default_scores()
        self._init_cdrs()



    def _init_default_options(self):
        """
        Create every adjustable option with its default value.

        Options are held in StringVar/IntVar objects and are read elsewhere in
        this class through their get() method.
        """
        self.scorefxn = "talaris2014"

        self.main_dir = StringVar()
        self.reference_db = StringVar()

        self.clustal_soft_wrap = IntVar(value=100)
        self.reload_scores = IntVar(value=1)
        self.top_n = IntVar(value=15)
        self.top_n_combined = IntVar(value=15)

        # features_hbond_set is an index into features_hbond_sets
        # (see get_full_features_type); 1 selects "_min_hbond_analysis".
        self.features_hbond_set = IntVar(value=1)
        self.features_hbond_sets = ["", "_min_hbond_analysis", "_no_hbond_analysis"]
        self.query_hbonds = IntVar(value=0)

        self.is_camelid = IntVar(value=0)
        self.top_total_percent = IntVar(value=10)
        self.backround_features = IntVar(value=1)

        self.rosetta_extension = StringVar(value="linuxclangrelease")

        self.individual_analysis = IntVar(value=1)
        self.combined_analysis = IntVar(value=0)

        self.load_origin_pdbs = IntVar(value=1)
        self.pyigclassify_dir = StringVar()


    def _init_paths(self):
        self.cdr_rel_path = "DBOUT/cdr_pdbs_redun_by_cdr_overhang_3"
        self.weblogo_rel_path = "DBOUT/weblogos"
        self.ab_db_rel_path = "DBOUT/website"

        self.redun_db_name = "antibody_database_redundant.db"
        self.nr_db_name = "antibody_database_rosetta_design.db"

    def _init_scores(self):
        """
        (Re)create self.scores as a list of newly constructed score data objects.
        """
        # IntHbondDecoyData is intentionally left out of the list for now.
        self.scores = [
            TotalDecoyData(),
            dGDecoyData(),
            dSASADecoyData(),
            dGTotalScoreSubset(),
            SCValueDecoyData(),
            DeltaUnsatsPerAreaDecoyData(),
        ]

    def _init_default_scores(self):
        """
        Record the known score names and create one on/off variable per score.

        All scores start switched off, then the default subset is switched on.
        """
        self.score_names = self._get_score_names()

        self.scores_on = defaultdict()
        for known_name in self.score_names:
            self.scores_on[known_name] = IntVar(value=0)

        # Scores that are on by default.
        for default_on in ("dG", "dG_top_Ptotal", "total", "delta_unsats_per_1000_dSASA"):
            self.scores_on[default_on].set(1)

    def _init_cdrs(self):
        """
        Create one on/off variable per CDR; every CDR starts switched on.
        """
        self.cdrs = defaultdict()
        for cdr_name in ("L1", "L2", "L3", "H1", "H2", "H3"):
            self.cdrs[cdr_name] = IntVar(value=1)

    def _get_score_names(self):

        names = []
        for score in self.scores:
            names.append(score.name)
        return names

    def _get_score_names_on(self):
        names = []
        for name in self._get_score_names():
            if self.scores_on[name]:
                names.append(name)
        return names

    def _setup_outdir(self, subdirs=[], use_out_dir_name=True):
        """
        Sets up the main output dir in the main_analys_dir, and any subdirectories such as 'decoys' or decoys/combined_3
        Returns the final output directory
        """

        filters = self._setup_filters()

        # if self.out_dir_name.get() and use_out_dir_name and filters:
        #    outdir = self.main_dir.get()+"/"+self.out_dir_name.get()+"_"+self.filter_settings.name.get()
        # elif self.out_dir_name.get() and use_out_dir_name:
        #    outdir = self.main_dir.get()+"/"+self.out_dir_name.get()
        # elif filters:
        #    outdir = self.out_dir_name.get()+"/"+self.filter_settings.name.get()
        # else:

        outdir = os.getcwd()

        if not os.path.exists(outdir): os.mkdir(outdir)

        for subdir in subdirs:
            if not subdir: continue
            outdir = outdir + "/" + subdir
            if not os.path.exists(outdir): os.mkdir(outdir)

        return outdir

    def _setup_outdir_individual(self, subdirs=[], use_outdir_name=False):
        """
        Set up analysis_individual/<out_dir_name>/<subdirs...> and return its path.
        """
        nested = ["analysis_individual", self.out_dir_name.get()]
        nested = nested + subdirs
        return self._setup_outdir(nested, use_outdir_name)

    def _setup_outdir_combined(self, subdirs=[]):
        """
        Set up analysis_combined/<out_dir_name>/<subdirs...> and return its path.
        """
        nested = ["analysis_combined", self.out_dir_name.get()]
        nested = nested + subdirs
        return self._setup_outdir(nested, False)

    def _setup_cdrs(self):
        """
        Get a list of CDRs to process for [many] non-features related tasks.
        """

        cdrs = []
        camelid_cdrs = ["L1", "L2", "L3"]

        for cdr_name in self.cdrs:
            if self.is_camelid.get() and cdr_name in camelid_cdrs:
                continue

            if self.cdrs[cdr_name].get():
                cdrs.append(cdr_name)

        return cdrs

    def _setup_scores(self, features_type="antibody", use_all=False):
        """
        Rebuild self.scores, load the data of every strategy into them and
        return deep copies of the selected score objects.

        :param features_type: Passed to get_db_path to pick the database of each strategy.
        :param use_all: If True, return every score; otherwise only those switched on in self.scores_on.
        :rtype: list of DecoyData
        """

        # NOTE(review): self.scores is rebuilt on every call, so the early
        # return below (reload_scores off) hands out copies of score objects
        # that had no add_data() call in this invocation - confirm that this
        # is the intended "quick" path.
        self._init_scores()
        score_subset = []
        query_hbonds = self.query_hbonds.get()

        # Hbond data is also queried when one of the hbond scores is switched on.
        for score in self.scores:
            if score.name == "hbond_count" or score.name == "hbond_energy":

                if self.scores_on[score.name].get():
                    query_hbonds = True

        # Quickly analyze scores with the same settings...
        if not self.reload_scores.get():
            for score_class in self.scores:
                if self.scores_on[score_class.name].get() or use_all:
                    score_subset.append(copy.deepcopy(score_class))
            return score_subset

        hb_loader = InterfaceHBondDecoyDataLoader()

        filters = self._setup_filters()

        if self.is_camelid.get():
            for score in self.scores:
                score.set_interface('H_A')

        # NOTE(review): the combined score is looked up in self.scores_on at
        # the end of this method; confirm its name is a key there, otherwise
        # that lookup fails whenever filters are active.
        if filters:
            self.scores.append(CombinedStrDecoyData(filters, self.filter_settings.name.get()))

        for strategy in self.strategies:
            db_path = self.get_db_path(strategy, features_type)
            if not os.path.exists(db_path):
                sys.exit(
                    "DB path does not exist.  Please Run Features reporter with StructureScores and ScoreTypes for this strategy\n" + db_path)
            print db_path
            # The connection is handed to the loaders and is not closed here.
            con = sqlite3.connect(db_path)

            if query_hbonds:
                if filters:
                    hb_loader.add_filters(filters, self.filter_settings.name.get())
                hb_loader.add_data(strategy, con)

            for score_class in self.scores:

                # Filters are (re-)added once per strategy, before the data is loaded.
                if filters:
                    score_class.add_filters(filters, self.filter_settings.name.get())

                # We set these up later
                if score_class.name == "hbond_count" or score_class.name == "hbond_energy":
                    continue

                # dG_top_Ptotal additionally receives the top_total_percent setting.
                if score_class.name == "dG_top_Ptotal":
                    score_class.add_data(strategy, con, self.top_total_percent.get())
                else:
                    score_class.add_data(strategy, con)

        # Setup each Hbond Data class
        if query_hbonds:
            for score_class in self.scores:
                if score_class.name == "hbond_count" or score_class.name == "hbond_energy":
                    score_class.setup_from_loader(hb_loader)

        # Callers get deep copies; the loaded originals stay in self.scores
        # (see _get_score).
        for score_class in self.scores:
            if self.scores_on[score_class.name].get() or use_all:
                score_subset.append(copy.deepcopy(score_class))

        return score_subset

    def _get_score(self, score_name):
        """
        Get a particular score class.  May or may not be initialized.  See setup_scores.
        :param score_name:
        :rtype: DecoyData
        """
        for score in self.scores:
            if score.name == score_name:
                return score
        return None

    def _setup_filters(self):
        """
        Build the list of data filters currently enabled in self.filter_settings.

        :return: list of filter objects (empty when nothing is enabled)
        """
        settings = self.filter_settings
        active = []

        if settings.h3_filter.get():
            active.append(H3ExtendedFilter())

        if settings.extra_required_where:
            custom = DataFilter("custom_filter", type="unknown")
            custom.required_tables = settings.extra_required_tables
            custom.required_wheres = settings.extra_required_where
            active.append(custom)

        # One cutoff filter per known energy type; the real cutoff is set below.
        by_energy = {
            "dG": dGCutoffFilter(0),
            "dSASA": dSASACutoffFilter(0),
            "total": TotalScoreCutoffFilter(0),
        }

        for energy_type in settings.energy_types:
            if settings.get_energy_enabled(energy_type):
                energy_filter = by_energy[energy_type]
                energy_filter.set_value(settings.get_energy_cutoff(energy_type))
                active.append(energy_filter)

        return active

    def _setup_cdr_types(self, data_type, is_camelid, features_type='antibody'):
        if data_type == "length":
            data_class = CDRLengthData(self.native_path, is_camelid)

        elif data_type == "cluster":
            data_class = CDRClusterData(self.native_path, is_camelid)

        elif data_type == "sequence":
            data_class = CDRSequenceData(self.native_path, is_camelid)

        elif data_type == "aligned_sequence":
            data_class = CDRAlignedSequenceData(self._setup_outdir_individual(['clustal']),
                                                self._setup_outdir_combined(['clustal']), self.native_path, is_camelid)

        else:
            sys.exit("data_type not understood!")

        for strategy in self.strategies:
            db_path = self.get_db_path(strategy, features_type)
            if not os.path.exists(db_path):
                sys.exit(
                    "DB path does not exist.  Please Run Features reporter with StructureScores and ScoreTypes for this strategy\n" + db_path)
            print db_path
            con = sqlite3.connect(db_path)
            data_class.add_data(strategy, con)

        return data_class

[docs]    def set_cdrs_from_list(self, cdr_list):
        for name in self.cdrs:
            if name in cdr_list:
                self.cdrs[name].set(1)
            else:
                self.cdrs[name].set(0)

[docs]    def set_strategies_from_db_dir_top_dir(self):
        dirs = glob.glob(self.db_dir.get() + "/TOP_*")
        non_redun = defaultdict()
        for d in sorted(dirs):
            # print d
            non_redun["_".join(d.split("_")[3:])] = None

        for x in sorted(non_redun.keys()):
            # print x
            self.strategies.append(x)

[docs]    def set_strategies_from_databases(self):
        """
        Set the strategies from the db_dir/databases directory
        :return:
        """
        if not os.path.exists(self.db_dir.get()):
            print "Could not find any databases to use for analysis.  Please make sure databases are in " + self.db_dir.get()
            print "Databases should end with .db or .db3 extension.  Naming format of databases: strategy.features_type.scorefunction.db"
            print "WTF?"
            return

        dbs = glob.glob(self.db_dir.get() + "/*.db*")

        if len(dbs) == 0:
            print "Could not find any databases to use for analysis.  Please make sure databases are in " + self.db_dir.get()
            print "Databases should end with .db or .db3 extension.  Naming format of databases: strategy.features_type.scorefunction.db"
            return

        print repr(dbs)
        nr_dbs = defaultdict()
        for db in sorted(dbs):
            print db
            x = os.path.basename(db)

            ##Example naming convention: 'ch103_5_CDR_prelim.norm_ab_features.db'
            strategy = '.'.join(x.split('.')[:-2])
            print strategy
            if len(x.split('.')[-2].split('_')) > 2:
                strategy = strategy + '.' + '_'.join(x.split('.')[-2].split('_')[:-2])
                print strategy

            if not nr_dbs.has_key(strategy):

                self.strategies.append(strategy)
                nr_dbs[strategy] = " "
                self.db_paths[strategy] = []
                self.db_paths[strategy].append(db)
            else:
                self.db_paths[strategy].append(db)

[docs]    def set_strategies_from_json_infos(self):
        """
        Uses self.json, which are AnalysisInfo classes, to populate.

        :return:
        """

        if not self.jsons:
            print "JSONS NOT SET!"
            return

        nr_dbs = defaultdict()
        for info in self.jsons:
            if not isinstance(info, AnalysisInfo): sys.exit()

            db = info.get_features_db()
            print db

            ##Example naming convention: 'ch103_5_CDR_prelim.norm_ab_features.db'
            strategy = info.get_exp()
            print strategy

            if not nr_dbs.has_key(strategy):

                self.strategies.append(strategy)
                nr_dbs[strategy] = " "
                self.db_paths[strategy] = []
                self.db_paths[strategy].append(db)
            else:
                self.db_paths[strategy].append(db)

[docs]    def set_strategies(self, strategies):
        self.strategies = strategies

[docs]    def get_strategies(self):
        return self.strategies

[docs]    def get_db_path(self, strategy, features_type='antibody'):

        names = {
            'antibody': ['ab', 'antibody'],
            'cluster': ['cl', 'cluster', 'ab', 'antibody']
        }

        for p in self.db_paths[strategy]:
            for match in names[features_type]:
                if re.search(match, p):
                    return p

        print "Matching database name not found for features type: " + features_type
        print "Database must have any of these names in them: " + repr(names[features_type])

[docs]    def get_full_features_type(self, type):
        if type == "cluster":
            return type
        else:
            return type + "_minimal" + self.features_hbond_sets[self.features_hbond_set.get()]

    ################ Main Functions #######################
[docs]    def run_features(self, type, plot_name=""):

        if not plot_name:
            plot_name = self.out_dir_name.get()

        if not plot_name:
            print "No root name set!"
            return

        os.system("rm build")
        outdir = self._setup_outdir(["plots", plot_name], False)
        db_dir = self.db_dir.get()

        if len(self.strategies) == 0:
            return

        if not os.path.exists(db_dir): sys.exit("Please run Features reporter and copy databases to db directory.")

        fulltype = self.get_full_features_type(type)
        creator = json_creator.JsonCreator(outdir, fulltype)

        if self.reference_db.get() and os.path.exists(self.reference_db.get()):
            creator.add_sample_source_info(self.reference_db.get(), "ref", True)

        for strategy in self.strategies:
            id = strategy
            id = id.replace("talaris2013", "talaris")

            # Fix up the name so it is not too long.  Will add options to do this manually later:
            """
            id = id.replace("cluster", "clus")
            id = id.replace("exclude", "excl")
            id = id.replace("include", "incl")

            """

            db_path = self.get_db_path(strategy, type)
            if not os.path.exists(db_path):
                sys.exit(db_path + " does not exist!")

            creator.add_sample_source_info(db_path, id)

        creator.save_json(outdir + "/" + type + "_" + plot_name + ".json")
        print "Creating: "+outdir + "/" + type + "_" + plot_name + ".json"
        creator.run_json(self.backround_features.get())

        # pwd = os.getcwd()
        # os.chdir("build")
        # os.system("cp -r *../"+outdir)
        # os.system("mv build build_old")
        # os.chdir(pwd)

        print "Plots ignore any set filters.  To plot with filters, create new databases through query..."
        print "Complete..."

[docs]    def get_pandas_dataframe(self):
        """
        Gets a pandas Dataframe for all
        :rtype: pandas.DataFrame
        """
        dfs = []
        output_names = ["strategy"] #Controls the order of the output names.

        for score in self._setup_scores(use_all=True):
            if score.name == "dG_top_Ptotal":continue

            if isinstance(score, DecoyData): pass

            output_names.append(score.name)

            df = score.get_pandas_dataframe()
            dfs.append(df)

        cdr_types = ["length", "cluster", "sequence", "aligned_sequence"]

        for t in cdr_types:
            cdr_data = self._setup_cdr_types(t, self.is_camelid.get())

            cdr_names = [cdr for cdr in cdr_data.cdrs if cdr in self._setup_cdrs()]
            for cdr in cdr_names:
                output_names.append("_".join([cdr, cdr_data.name]))
            df = cdr_data.get_pandas_dataframe(cdr_names)
            dfs.append(df)

        df = PandasDataFrame.drop_duplicate_columns(pandas.concat(dfs, axis=1, join="outer"))
        return df

[docs]    def get_top_from_dataframe(self, score_name):
        """
        Gets a pandas Dataframe for top
        :rtype: pandas.DataFrame
        """
        df = self.get_pandas_dataframe()
        dfs = []
        for strategy in self.get_strategies():
            dfs.append(df["strategy" == strategy].sort_values(score_name)[0:self.top_n.get()-1])

        df = PandasDataFrame.drop_duplicate_columns(pandas.concat(dfs))
        return df

[docs]    def get_top_dataframe_by_all_scores(self):
        """
        Get a pandas DataFrame for top, grouped by the type of score that is on.
        :rtype: pandas.DataFrame
        """
        dfs = []
        score_names = self._get_score_names_on()
        for score_name in score_names:
            if score_name == "dG_top_Ptotal":continue
            df = self.get_top_from_dataframe(score_name)
            df["by_score_group"] = score_name
            dfs.append(df)
        df = PandasDataFrame.drop_duplicate_columns(pandas.concat(dfs))
        return df

[docs]    def output_all_data_as_excel_file(self, top = True):
        final_dfs = []
        final_tab_names = []


        #All Data:
        dfs, names = self.get_csv_data(top, summary = False)
        final_dfs.extend(dfs)
        final_tab_names.extend(names)

        #Summary Data:
        dfs, names = self.get_csv_data(top, summary = True)
        final_dfs.extend(dfs)
        final_tab_names.extend(dfs)





[docs]    def output_csv_data(self, top = False, summary = False):
        """
        Output a CSV file of combined or individual data.

        """
        output_dfs, output_names = self.get_csv_data(top, summary)
        for index, df in enumerate(output_dfs):
            name = output_names[index]
            df.to_csv(name+".csv")

            print "Wrote: "+name+".csv"

[docs]    def get_csv_data(self, top = False, summary = False):
        """
        Get data by converting everything to a pandas dataframe first.
        For now, one function pretty much does everything.

        :rtype: [pandas.Dataframe],[str]

        """
        final_dfs = []
        final_names = []

        dfs = []
        venn_dfs = [] #Best decoys/data seen in all score classes of the top n together.
        output_names = ["strategy"] #Controls the order of the output names.

        venn2_cat = ['dG', 'total']
        venn2_dfs = [] #Venn on dG and Total score.
        for score in self._setup_scores(use_all=True):
            if score.name == "dG_top_Ptotal":continue

            if isinstance(score, DecoyData): pass


            output_names.append(score.name)


            venn_dfs.append(score.get_pandas_dataframe(top_n=self.top_n.get()))
            df = score.get_pandas_dataframe()
            print df.tail()
            dfs.append(df)
            if score.name in venn2_cat:
                venn2_dfs.append(df)

        cdr_types = ["length", "cluster", "sequence", "aligned_sequence"]

        for t in cdr_types:
            cdr_data = self._setup_cdr_types(t, self.is_camelid.get())

            cdr_names = [cdr for cdr in cdr_data.cdrs if cdr in self._setup_cdrs()]
            for cdr in cdr_names:
                output_names.append("_".join([cdr, cdr_data.name]))
            df = cdr_data.get_pandas_dataframe(cdr_names)
            #df.to_csv(self._setup_outdir_individual()+"/test_cdr_"+cdr_data.name+".csv")
            dfs.append(df)
            venn_dfs.append(df)
            venn2_dfs.append(df)


        combined_scores = PandasDataFrame.drop_duplicate_columns(pandas.concat(dfs, axis=1, join="outer")) #Drop Duplicates
        combined_scores = combined_scores[output_names]
        combined_scores = combined_scores.apply(pandas.to_numeric, errors='ignore')
        combined_scores.index.name = "decoy"

        if top:
            venn_df = PandasDataFrame.drop_duplicate_columns(pandas.concat(venn_dfs, axis=1, join="inner"))
            venn_df = venn_df[output_names]
            venn_df.sort_values(['strategy', 'dG'])
            #venn_df.to_csv(open(self._setup_outdir_individual()+"/ind_per_model_ven_top_"+str(self.top_n.get())+".csv", "w"))

            venn2_df = PandasDataFrame.drop_duplicate_columns(pandas.concat(venn_dfs, axis=1, join="inner"))
            venn2_df = venn2_df[output_names]
            venn2_df.sort_values(['strategy', 'dG'])
            #venn2_df.to_csv(open(self._setup_outdir_individual()+"/ind_per_model_ven_dG_total_top_"+str(self.top_n.get())+".csv", "w"))

            score_dfs=[]
            score_names = self._get_score_names_on()
            for score_name in score_names:
                if score_name == "dG_top_Ptotal":
                    sort_name = 'dG'
                else:
                    sort_name = score_name

                strat_dfs=[]
                for strategy in self.get_strategies():

                    """
                    if score_name == "dG_top_Ptotal":
                        score = self._get_score(score_name)
                        decoy_list = score.get_ordered_decoy_list(strategy, self.top_n.get())
                        df = combined_scores[combined_scores.index.isin(decoy_list)]
                        df.sort(columns=[sort_name]) #This sort is not working, have no idea why.
                    else:
                        df = combined_scores[combined_scores['strategy'] == strategy].sort(score_name)[0:self.top_n.get()] #Best N
                        df.sort(columns=[sort_name])
                    """
                    #Generally, will use order to get top scores.
                    score = self._get_score(score_name)
                    decoy_list = score.get_ordered_decoy_list(strategy, self.top_n.get())
                    df = combined_scores[combined_scores.index.isin(decoy_list)]
                    df.sort_values([sort_name])

                    strat_dfs.append(df)
                top_df = pandas.concat(strat_dfs)

                top_df["by_score_group"] = score_name
                score_dfs.append(top_df)

            top_df = PandasDataFrame.drop_duplicate_columns(pandas.concat(score_dfs))
            name_order=["by_score_group"]
            name_order.extend(output_names)
            top_df = top_df[name_order]

            if self.individual_analysis.get():

                if summary:
                    top_df = top_df.apply(pandas.to_numeric, errors='ignore')

                    final_dfs.append(top_df.groupby(by=["strategy", "by_score_group"]).describe(exclude=['object']))
                    final_names.append(self._setup_outdir_individual()+"/per_strategy_summary_top")
                else:
                    final_dfs.append(top_df)
                    final_names.append(self._setup_outdir_individual()+"/ind_per_model_top")


            if self.combined_analysis.get():
                dfs = []
                for score_name in self._get_score_names_on():

                    score = self._get_score(score_name)
                    decoy_list = score.get_ordered_decoy_list_all(self.top_n.get())
                    df = combined_scores[combined_scores.index.isin(decoy_list)]
                    df["by_score_group"] = score_name

                    dfs.append(df)
                df = PandasDataFrame.drop_duplicate_columns(pandas.concat(dfs))
                name_order=["by_score_group"]
                name_order.extend(output_names)
                df = df[name_order]
                if summary:
                    #df.groupby(by="strategy").describe().to_csv(self._setup_outdir_combined()+"/com_summary_top_by_"+score.name+".csv")
                    df = df.apply(pandas.to_numeric, errors='ignore')

                    final_dfs.append(df.groupby(by=["by_score_group"]).describe(exclude=['object']))
                    final_names.append(self._setup_outdir_combined()+"/com_summary_top.csv")
                else:
                    final_dfs.append(df)
                    final_names.append(self._setup_outdir_combined()+"/com_per_model_top")


        else:

            if self.individual_analysis.get():
                combined_scores.sort_values(['strategy', 'dG'])
                if summary:

                    final_dfs.append(combined_scores.groupby(by="strategy").describe(exclude=['object']))
                    final_names.append(self._setup_outdir_individual()+"/per_strategy_summary_all.csv")
                else:
                    final_dfs.append(combined_scores)
                    final_names.append(self._setup_outdir_individual()+"/ind_per_model_all")

            if self.combined_analysis.get():
                combined_scores.sort_values(['dG'])
                if summary:
                    #combined_scores.groupby(by="strategy").describe().to_csv(self._setup_outdir_combined()+"/")
                    final_dfs.append(combined_scores.describe(exclude=['object']))
                    final_names.append(self._setup_outdir_combined()+"/com_summary_all")
                else:
                    final_dfs.append(combined_scores)
                    final_names.append(self._setup_outdir_combined()+"/com_per_model_all")

        #How to add Native Line?
        #Maybe a 'print native info' function...

        return final_dfs, final_names

[docs]    def output_stats(self):
        """
        Depracated in favor of dataframe summaries.
        """

        def output_all_stats():
            """
            Write the ordered decoy lists per strategy and score, the combined
            per-decoy score table, and the mean/sd statistics (all decoys and
            top_n decoys) as text files into the combined output directories.
            """

            if len(self.strategies) == 0:
                print "No strategies set..."
                return

            if not self.out_dir_name.get():
                print "No root name set!"
                return

            outdir_stats = self._setup_outdir_combined(["stats"])
            outdir_lists = self._setup_outdir_combined(["ordered_pdb_lists"])

            scores = self._setup_scores("antibody", True)
            top_n = self.top_n.get()
            # NOTE(review): top_n_combined is read but not used in this function.
            top_n_combined = self.top_n_combined.get()

            # Output Strategy data:
            for score_class in scores:
                if isinstance(score_class, DecoyData): pass

                # dSASA is the only score listed in descending order.
                if score_class.name == "dSASA":
                    reverse = True
                else:
                    reverse = False

                for strategy in self.strategies:
                    OUTFILE = open(outdir_lists + "/" + strategy + "_ORDERED_" + score_class.get_outname() + ".txt", 'w')
                    data = score_class.get_strategy_data(strategy, True)

                    # The keys of the strategy data define the output order.
                    for tup in sorted(data.keys(), reverse=reverse):
                        triple = data[tup]
                        OUTFILE.write(
                            get_str(triple.score) + "\t" + triple.strategy + "\t" + os.path.basename(triple.decoy) + "\n")
                    OUTFILE.close()


            # This is broken for some unknown reason. - Ends up skipping pretty much everything.

            # Output Combined Data:
            # NOTE(review): the header names every score, while the rows below
            # leave out combined_str_score, dG_top_Ptotal and any score without
            # an entry for the decoy, so columns may not line up.
            COMBINED = open(outdir_stats + "/combined_selection_data.txt", 'w')
            COMBINED.write("#strategy decoy " + " ".join([score_class.get_outname() for score_class in scores]) + "\n")
            # The decoys of the first score define the rows of the table.
            total_scores = scores[0]
            all_data = total_scores.get_concatonated_map()
            for decoy in all_data:
                strategy = all_data[decoy].strategy
                line = strategy + " " + os.path.basename(decoy)
                # print strategy
                for score_class in scores:
                    if score_class.name == "combined_str_score" or score_class.name == "dG_top_Ptotal": continue

                    if not score_class.all_data[strategy].has_key(decoy):
                        print "Skipping " + strategy + " " + decoy
                        continue

                    line = line + " " + get_str(score_class.all_data[strategy][decoy].score)
                COMBINED.write(line + "\n")
            COMBINED.close()


            # Output Stats:
            STATS = open(outdir_stats + "/combined_selection_data_stats.txt", 'w')
            STATS.write("#strategy " + " ".join(
                [score_class.get_outname() + "_avg" + " " + score_class.get_outname() + "_sd " for score_class in
                 scores]) + "\n")

            for strategy in self.strategies:
                line = strategy
                for score_class in scores:
                    if not score_class.has_real_values() or score_class.name == "combined_str_score": continue
                    raw_scores_tuple = score_class.get_strategy_data(strategy, True)
                    # Element 0 of every entry is used as the raw score value.
                    raw_scores = [s[0] for s in raw_scores_tuple]
                    m = numpy.mean(raw_scores)
                    sd = numpy.std(raw_scores)
                    line = line + " %.3f" % m + " " + "%.3f" % sd
                STATS.write(line + "\n")
            STATS.close()

            # Output Top Stats:
            STATS = open(outdir_stats + "/combined_selection_data_top_" + repr(top_n) + "_stats.txt", 'w')
            STATS.write("#strategy " + " ".join(
                [score_class.get_outname() + "_avg" + " " + score_class.get_outname() + "_sd " for score_class in
                 scores]) + "\n")
            for strategy in self.strategies:
                line = strategy
                for score_class in scores:
                    if not score_class.has_real_values() or score_class.name == "combined_str_score": continue
                    raw_scores_tuple = score_class.get_top_strategy_data(strategy, top_n, True)
                    # Element 0 of every entry is used as the raw score value.
                    raw_scores = [s[0] for s in raw_scores_tuple]
                    m = numpy.mean(raw_scores)
                    sd = numpy.std(raw_scores)
                    line = line + " %.3f" % m + " " + "%.3f" % sd
                STATS.write(line + "\n")
            STATS.close()
            print "Complete"

        def output_score_extra_stats():
            """
            Write tab-delimited score tables for each strategy into the individual
            "raw_score_data" output directory.

            Two kinds of file are produced when individual analysis is enabled:
            ``scores_of_top_<score>_<strategy>.txt`` lists the top-N decoys of each
            main score (``combined_str_score`` is skipped), and
            ``all_scores_<strategy>.txt`` lists every decoy ordered by the first
            main score. Each row holds the decoy basename, the ordering score and
            then the remaining scores from ``_setup_scores("antibody", True)``.
            """
            top_n = self.top_n.get()
            main_scores = self._setup_scores()
            all_scores = self._setup_scores("antibody", True)

            if not self.individual_analysis.get():
                return

            out_dir = self._setup_outdir_individual(["raw_score_data"])

            # Scores that never get a column of their own.
            hidden = ("combined_str_score", "dG_top_Ptotal")

            def write_table(path, strategy, score, decoys):
                """Write one table: decoy name, the ordering score, then the other scores."""
                print("writing " + path)

                # One filtered list drives both the header and the rows so the
                # columns always line up (previously the rows still emitted
                # dG_top_Ptotal while the header left it out).
                others = [s for s in all_scores
                          if s.name not in hidden and s.name != score.name]
                show_primary = score.name != "dG_top_Ptotal"

                header = ["#decoy"]
                if show_primary:
                    header.append(score.name)
                header.extend(s.name for s in others)

                # The context manager closes the file even if a lookup fails.
                with open(path, 'w') as out:
                    out.write("\t".join(header) + "\n")
                    for decoy in decoys:
                        row = [os.path.basename(decoy)]
                        if show_primary:
                            row.append(get_str(score.get_data_for_decoy(strategy, decoy).score))
                        row.extend(get_str(s.get_data_for_decoy(strategy, decoy).score)
                                   for s in others)
                        out.write("\t".join(row) + "\n")

            # Top N of each main score, per strategy.
            for strategy in self.strategies:
                for score in main_scores:
                    if score.name == "combined_str_score":
                        continue

                    decoy_list = score.get_ordered_decoy_list(strategy, top_n)
                    outfile = out_dir + "/scores_of_top_" + score.name + "_" + strategy + ".txt"
                    write_table(outfile, strategy, score, decoy_list)

            # All decoys, ordered by the first main score.
            if not main_scores:
                return
            score = main_scores[0]

            for strategy in self.strategies:
                all_decoys = score.get_ordered_decoy_list(strategy)
                out_file = out_dir + "/all_scores_" + strategy + ".txt"
                write_table(out_file, strategy, score, all_decoys)

        if self.individual_analysis.get():
            print "Outputting individual stats"
            output_score_extra_stats()
        if self.combined_analysis.get():
            print "Outputting combined stats"
            output_all_stats()
        print "Complete"

[docs]    def copy_top(self):
        def copy_top_strategy():

            top_n = self.top_n.get()
            scores = self._setup_scores()

            if not self.out_dir_name.get():
                print "No root name set!"
                return

            print "Copying Top Models.."
            # Each Strategy Top Scoring (Skip total score here for now):
            for strategy in self.strategies:
                for score in scores:
                    if score.name == "combined_str_score":
                        continue

                    out_dir = self._setup_outdir_individual(
                        ["pdbs_sessions", "top_" + repr(top_n) + "_" + score.get_outname() + "_" + strategy])
                    SCORELIST = open(out_dir + "/MODELS.txt", 'w')
                    print "Copying " + strategy + " " + score.get_outname() + " to: " + out_dir
                    if isinstance(score, DecoyData): pass

                    decoys = score.get_top_strategy_data(strategy, top_n)
                    decoy_list = score.get_ordered_decoy_list(strategy, top_n)
                    load_as = []
                    i = 1
                    for decoy in decoy_list:
                        load_as.append("model_" + repr(i) + "_" + score.get_outname() + "_" + get_str(decoys[decoy].score))
                        os.system('cp ' + decoy + " " + out_dir + "/" + "top_" + repr(i) + "_" + os.path.basename(decoy))
                        SCORELIST.write(
                            repr(i) + "\t" + get_str(decoys[decoy].score) + "\t" + os.path.basename(decoy) + "\n")
                        i += 1

                    if self.load_origin_pdbs:
                        if not self.pyigclassify_dir.get() or not os.path.exists(self.pyigclassify_dir.get()):
                            print "Origin PDB not set or does not exist. Disable this feature or set a correct directory."
                            return

                        make_pymol_session_on_top_ab_include_native_cdrs(decoy_list, load_as, out_dir, out_dir, score.get_outname(),
                                                                         self.pyigclassify_dir.get()+"/"+self.cdr_rel_path,
                                                                         top_num=top_n, native_path=self.native_path)
                    else:
                        make_pymol_session_on_top(decoy_list, load_as, out_dir, out_dir, score.get_outname(),
                                                                         top_num=top_n, native_path=self.native_path)
                    SCORELIST.close()

        def copy_top_combined():
            """
            Outputs total_score,
            """
            if not self.out_dir_name.get():
                print "No root name set!"
                return

            top_n = self.top_n_combined.get()
            scores = self._setup_scores()

            # Overall Strategy:
            for score in scores:
                if score.name == "combined_str_score":
                    continue

                if isinstance(score, DecoyData): pass
                outdir_top_pdbs = self._setup_outdir_combined(["top_structures", score.get_outname()])
                outdir_top_sessions = self._setup_outdir_combined(["top_sessions"])

                SCORELIST = open(outdir_top_pdbs + "/MODELS.txt", 'w')
                print "Copying " + score.get_outname() + " to: " + outdir_top_pdbs
                decoys = score.get_top_all_data(top_n)
                decoy_list = score.get_ordered_decoy_list_all(top_n)
                load_as = []
                i = 1
                for decoy in decoy_list:
                    load_as.append("model_" + repr(i) + "_" + score.get_outname() + "_" + get_str(decoys[decoy].score))
                    os.system('cp ' + decoy + " " + outdir_top_pdbs + "/top_" + repr(i) + "_" + os.path.basename(decoy))
                    SCORELIST.write(repr(i) + "\t" + get_str(decoys[decoy].score) + "\t" + os.path.basename(decoy) + "\n")
                    i += 1

                if self.load_origin_pdbs:
                    if not self.pyigclassify_dir.get() or not os.path.exists(self.pyigclassify_dir.get()):
                        print "Origin PDB not set or does not exist. Disable this feature or set a correct directory."
                        return

                    make_pymol_session_on_top_ab_include_native_cdrs(decoy_list, load_as, outdir_top_pdbs, outdir_top_sessions,
                                                                     score.get_outname(), self.pyigclassify_dir.get()+"/"+self.cdr_rel_path,
                                                                     top_num=top_n, native_path=self.native_path)
                else:
                    make_pymol_session_on_top(decoy_list, load_as, outdir_top_pdbs, outdir_top_sessions, score.get_outname(),
                                                                     top_num=top_n, native_path=self.native_path)

                SCORELIST.close()

        if self.individual_analysis.get():
            print "Outputting individual sessions"
            copy_top_strategy()
        if self.combined_analysis.get():
            print "Outputting combined sessions"
            copy_top_combined()

[docs]    def copy_all_models(self):

        if not self.out_dir_name.get():
            print "No root name set!"
            return

        if len(self.strategies) == 0:
            print "No strategies set..."
            return


        # outdir = self._setup_outdir()
        scores = self._setup_scores()

        # Each Strategy Top Scoring (Skip total score here for now):
        for strategy in self.strategies:
            for score in scores[1:]:
                out_dir = self._setup_outdir_individual(["all" + "_" + score.get_outname() + "_" + strategy], False)
                print "Copying " + strategy + " " + score.get_outname() + " to: " + out_dir
                if isinstance(score, DecoyData): pass

                decoys = score.get_strategy_data(strategy)
                decoy_list = score.get_ordered_decoy_list(strategy)
                load_as = []
                i = 1
                for decoy in decoy_list:
                    load_as.append("model_" + repr(i) + "_" + score.get_outname() + "_" + get_str(decoys[decoy].score))
                    os.system('cp ' + decoy + " " + out_dir + "/top_" + repr(i) + "_" + os.path.basename(decoy))
                    i += 1

                if self.load_origin_pdbs:
                    if not self.pyigclassify_dir.get() or not os.path.exists(self.pyigclassify_dir.get()):
                        print "Origin PDB not set or does not exist. Disable this feature or set a correct directory."
                        return

                    make_pymol_session_on_top_ab_include_native_cdrs(decoy_list, load_as, out_dir, out_dir, score.get_outname(),
                                                                     self.pyigclassify_dir.get()+"/"+self.cdr_rel_path,
                                                                     top_num=None, native_path=self.native_path)
                else:
                    make_pymol_session_on_top(decoy_list, load_as, out_dir, out_dir, score.get_outname(),
                                                                     top_num=None, native_path=self.native_path)

        # Overall Strategy:
        for score in scores:
            if isinstance(score, DecoyData): pass
            out_dir = self._setup_outdir_combined(["all_structures", score.get_outname()])
            print "Copying " + score.get_outname() + " to: " + out_dir
            decoys = score.get_concatonated_map()
            decoy_list = score.get_ordered_decoy_list_all()
            load_as = []
            i = 1
            for decoy in decoy_list:
                load_as.append("model_" + repr(i) + "_" + score.get_outname() + "_" + get_str(decoys[decoy].score))
                os.system('cp ' + decoy + " " + out_dir + "/top_" + repr(i) + "_" + os.path.basename(decoy))
                i += 1
            out_dir = self._setup_outdir_combined(["all_sessions"])

            if self.load_origin_pdbs:
                if not self.pyigclassify_dir.get() or not os.path.exists(self.pyigclassify_dir.get()):
                    print "Origin PDB not set or does not exist. Disable this feature or set a correct directory."
                    return

                make_pymol_session_on_top_ab_include_native_cdrs(decoy_list, load_as, out_dir, out_dir, score.get_outname(),
                                                                 self.pyigclassify_dir.get()+"/"+self.cdr_rel_path,
                                                                 top_num=None, native_path=self.native_path)
            else:
                make_pymol_session_on_top(decoy_list, load_as, out_dir, out_dir, score.get_outname(),
                                                                 top_num=None, native_path=self.native_path)

[docs]    def run_clustal_omega(self, processors, output_format="fasta", extra_options=""):

        def run_clustal_omega_on_strategies():

            if not self.out_dir_name.get():
                print "No root name set!"
                return

            scores = self._setup_scores()
            top_n = self.top_n.get()

            for strategy in self.strategies:
                print "\nRunning Clustal on: " + strategy
                score_zero = scores[0]
                if isinstance(score_zero, DecoyData): pass

                # Output per strategy ALL
                decoys = score_zero.get_strategy_data(strategy)
                decoy_header_dict = defaultdict()

                for decoy in decoys:
                    decoy_header_dict[decoy] = os.path.basename(decoy)

                fasta_dir = self._setup_outdir_individual(["sequences"])
                fasta_path = fasta_dir + "/" + strategy + "_all.fasta"
                fasta.output_fasta_from_pdbs_biopython(decoy_header_dict, fasta_path, self.native_path, "Native",
                                                       self.is_camelid.get())

                aln_dir = self._setup_outdir_individual(["clustal"])
                aln_name = strategy + "_all.clus"
                clustal_runner = ClustalRunner(fasta_path)
                clustal_runner.set_hard_wrap(self.clustal_soft_wrap.get())
                clustal_runner.set_threads(processors)
                clustal_runner.set_extra_options(extra_options)
                clustal_runner.set_output_format(output_format)
                clustal_runner.output_alignment(aln_dir, aln_name)

                # Output on Top Scoring:

                for score in scores:
                    print score
                    if isinstance(score, DecoyData): pass

                    basename = strategy + "_" + score.get_outname() + "_top_" + str(top_n)
                    fasta_path = fasta_dir + "/" + basename + ".fasta"
                    aln_name = basename + ".clus"

                    decoy_header_dict = defaultdict()
                    decoys = score.get_top_strategy_data(strategy, top_n)
                    for decoy in decoys:
                        header = "v" + get_str(decoys[decoy].score) + "::" + os.path.basename(decoy)
                        decoy_header_dict[decoy] = header
                    fasta.output_fasta_from_pdbs_biopython(decoy_header_dict, fasta_path, self.native_path, "Native",
                                                           self.is_camelid.get())
                    clustal_runner.set_fasta_path(fasta_path)
                    clustal_runner.set_threads(processors)
                    clustal_runner.set_extra_options(extra_options)
                    clustal_runner.set_output_format(output_format)
                    clustal_runner.output_alignment(aln_dir, aln_name)

            print "Complete"

        def run_clustal_omega_on_top_combined():

            if not self.out_dir_name.get():
                print "No root name set!"
                return

            scores = self._setup_scores()
            top_n = self.top_n.get()

            for score in scores:
                print "Running Clustal Omega on " + score.get_outname()
                if isinstance(score, DecoyData): pass
                decoys = score.get_top_all_data(top_n, False)

                decoy_header_dict = defaultdict()
                for decoy in decoys:
                    header = "v" + get_str(decoys[decoy].score) + "::" + os.path.basename(decoy)
                    decoy_header_dict[decoy] = header

                root_name = self.out_dir_name.get()
                basename = root_name + "_" + score.get_outname() + "_top_" + repr(top_n)
                fasta_dir = self._setup_outdir_combined(["sequences"])
                fasta_path = fasta_dir + "/" + basename + ".fasta"

                clustal_dir = self._setup_outdir_combined(["clustal"])
                clustal_name = basename + ".aln"

                fasta.output_fasta_from_pdbs_biopython(decoy_header_dict, fasta_path, self.native_path, "Native",
                                                       self.is_camelid.get())
                clustal_runner = ClustalRunner(fasta_path)
                clustal_runner.set_threads(processors)
                clustal_runner.set_hard_wrap(self.clustal_soft_wrap.get())
                clustal_runner.set_extra_options(extra_options)
                clustal_runner.set_output_format(output_format)
                clustal_runner.output_alignment(clustal_dir, clustal_name)

        if self.individual_analysis.get():
            run_clustal_omega_on_strategies()
        if self.combined_analysis.get():
            run_clustal_omega_on_top_combined()

        print "Complete"

[docs]    def run_clustal_omega_on_all_combined(self, processors, output_format, extra_options=""):

        if not self.out_dir_name.get():
            print "No root name set!"
            return

        print "Running Clustal on All Combined"
        scores = self._setup_scores()
        score = scores[0]
        if isinstance(score, DecoyData): pass

        root_name = self.out_dir_name.get()
        fasta_dir = self._setup_outdir_combined(["sequences"])
        fasta_path = fasta_dir + "/" + root_name + "_all.fasta"

        clustal_dir = self._setup_outdir_combined(["clustal"])
        clustal_name = root_name + "_all.aln"

        all_data = score.get_concatonated_map(False)

        all_data_array = []
        for s in scores:
            if s.name == "combined_str_score":
                continue
            all_data_array.append(s.get_concatonated_map(False))

        decoy_header_dict = defaultdict()
        for decoy in all_data:
            a = all_data_array[0]
            header = "v" + get_str(a[decoy].score)

            for a in all_data_array[1:]:
                header = header + ":" + get_str(a[decoy].score)
            header = header + ":" + os.path.basename(decoy)
            decoy_header_dict[decoy] = header

        fasta.output_fasta_from_pdbs_biopython(decoy_header_dict, fasta_path, self.native_path, "native",
                                               self.is_camelid.get())
        clustal_runner = ClustalRunner(fasta_path)
        #clustal_runner.set_threads(processors)
        clustal_runner.set_hard_wrap(self.clustal_soft_wrap.get())
        clustal_runner.set_extra_options(extra_options)
        clustal_runner.set_output_format(output_format)
        clustal_runner.output_alignment(clustal_dir, clustal_name)

        print "Complete"

[docs]    def output_len_or_clus_alignment(self, alignment_type, features_type='antibody'):

        is_camelid = self.is_camelid.get()

        top_n = self.top_n.get()
        data_class = self._setup_cdr_types(alignment_type, self.is_camelid.get(), features_type)
        len_class = self._setup_cdr_types("length", self.is_camelid.get(), features_type)

        def _output_alignment(self, outdir, top_decoys, score, type_data, strategy=None, extra_name="top"):

            if isinstance(type_data, CDRData): pass
            if isinstance(score, DecoyData): pass

            if not strategy:
                outname = "cdr_type_alignments_" + alignment_type + "_" + score.get_outname() + "_" + extra_name + ".txt"
            else:
                outname = "cdr_type_alignments_" + alignment_type + "_" + score.get_outname() + "_" + strategy + "_" + extra_name + "_.txt"

            print "Outputting cdr type alignment: " + outdir + "/" + outname

            cdr_names = [cdr for cdr in type_data.cdrs if cdr in self._setup_cdrs()]

            OUTFILE = open(outdir + "/" + outname, 'w')
            if self.native_path:
                header = "#score\t\tmatches"
            else:
                header = "#score\t"
            for cdr in cdr_names:
                header = header + "\t\t" + cdr

            header += "\tdecoy"
            OUTFILE.write(header + "\n")

            all_data = score.get_concatonated_map()
            all_type_data = type_data.get_concatonated_map()

            if self.native_path:
                line = "..." + "\t\tNA"

                if all_type_data.has_key((cdr, "native")):
                    native_info = type_data[(cdr, "native")]
                else:
                    native_info = type_data.get_native_data()

                if isinstance(native_info, CDRDataInfo): pass

                for cdr in cdr_names:
                    info = str(native_info.get_value_for_cdr(cdr))
                    if info == "NA":
                        info = cdr + "-" + str(len_class.get_native_data().get_value_for_cdr(cdr)) + "-NA"
                    line = line + "\t\t" + str(info)
                line = line + "\tnative"
                OUTFILE.write(line + "\n")

                for decoy in top_decoys:
                    decoy_info = all_type_data[decoy]
                    score_info = all_data[decoy]
                    counts = count_native_matches(decoy_info, native_info, type_data.cdrs)
                    line = get_str(score_info.score) + "\t\t" + repr(counts)
                    for cdr in cdr_names:
                        line = line + "\t\t" + str(get_star_if_native(decoy_info, native_info, cdr))

                    line = line + "\t" + os.path.basename(decoy)
                    OUTFILE.write(line + "\n")
            else:
                for decoy in top_decoys:
                    print decoy
                    decoy_info = all_type_data[decoy]
                    score_info = all_data[decoy]
                    print repr(score_info)
                    print repr(score_info.score)

                    line = get_str(score_info.score)
                    for cdr in cdr_names:

                        info = str(decoy_info.get_value_for_cdr(cdr))
                        if info == "NA":
                            info = cdr + "-" + str(
                                len_class.get_concatonated_map()[decoy].get_value_for_cdr(cdr)) + "-NA"
                        line = line + "\t\t" + str(info)

                    line = line + "\t" + os.path.basename(decoy)
                    OUTFILE.write(line + "\n")

            OUTFILE.close()

        if not self.out_dir_name.get():
            print "No root name set!"
            return

        scores = self._setup_scores(features_type)
        if isinstance(data_class, CDRData): pass

        ### Top/All  each Strategy
        if self.individual_analysis.get():
            outdir = self._setup_outdir_individual(['cdr_alignments'])
            for strategy in self.strategies:

                for score in scores:
                    if isinstance(score, DecoyData): pass
                    top_decoys = score.get_ordered_decoy_list(strategy, top_n)
                    all_decoys = score.get_ordered_decoy_list(strategy)
                    # print "Top: "+repr(top_decoys)

                    print "Working on : " + strategy + " " + score.name
                    _output_alignment(self, outdir, top_decoys, score, data_class, strategy, "top")
                    _output_alignment(self, outdir, all_decoys, score, data_class, strategy, "all")


        ### Top/All each combined
        if self.combined_analysis.get():
            outdir = self._setup_outdir_combined(['cdr_alignments'])
            for score in scores:
                top_decoys = score.get_ordered_decoy_list_all(top_n)
                all_decoys = score.get_ordered_decoy_list_all()

                print "Top: " + repr(len(top_decoys)) + score.name
                _output_alignment(self, outdir, top_decoys, score, data_class, None, "top")
                _output_alignment(self, outdir, all_decoys, score, data_class, None, "all")

        print "Complete"

[docs]    def output_len_or_clus_recovery(self, alignment_type, features_type='antibody'):
        len_class = self._setup_cdr_types("length", self.is_camelid.get(), features_type)
        if not self.native_path:
            print "Must pass select native path to calculate recoveries"
            return

        def _get_header(self, type_data):
            header = "#avg"
            cdr_names = [cdr for cdr in type_data.cdrs if cdr in self._setup_cdrs()]
            for cdr in cdr_names:
                header = header + "\t\t" + cdr
            header += "\tname"
            return header

        def _add_native_line(self, native_info, type_data, OUTFILE):
            line = "NA"

            if isinstance(native_info, CDRDataInfo): pass

            cdr_names = [cdr for cdr in type_data.cdrs if cdr in self._setup_cdrs()]
            for cdr in cdr_names:
                info = str(native_info.get_value_for_cdr(cdr))
                if info == "NA":
                    info = cdr + "-" + str(len_class.get_native_data().get_value_for_cdr(cdr)) + "-NA"
                line = line + "\t\t" + str(info)
            line += "\tnative"
            OUTFILE.write(line + "\n")

        def _add_recovery_line(self, label, decoys, type_data, OUTFILE, strategy=None):

            if isinstance(type_data, CDRData): pass

            total = 0
            cdr_names = [cdr for cdr in type_data.cdrs if cdr in self._setup_cdrs()]
            for cdr in cdr_names:
                enrichment_data = calculate_recovery(type_data.get_native_data(), type_data, cdr, decoys)
                total = total + enrichment_data.get_perc_decimal()


            native_data = type_data.get_native_data()

            # If native is NA, do not count the recovery against it for averages.  Only average over set CDRs.
            t=0
            for cdr in cdr_names:
                if not native_data.get_value_for_cdr(cdr) =="NA":
                    t+=1

            avg = total / t

            line = "%.3f" % avg
            for cdr in cdr_names:
                enrichment_data = calculate_recovery(type_data.get_native_data(), type_data, cdr, decoys)

                line = line + "\t\t%.3f" % enrichment_data.get_perc_decimal()

            line += "\t"+label
            OUTFILE.write(line + "\n")

        if not self.out_dir_name.get():
            print "No root name set!"
            return

        top_n = self.top_n.get()

        data_class = self._setup_cdr_types(alignment_type, self.is_camelid.get(), features_type)
        scores = self._setup_scores(features_type)
        if isinstance(data_class, CDRData): pass

        # Each strategy + Top Values
        if self.individual_analysis.get():
            outdir = self._setup_outdir_individual(['enrichment'])
            OUTFILE = open(outdir + "/" + "cdr_type_recoveries_" + alignment_type + "_.txt", 'w')
            OUTFILE.write(_get_header(self, data_class) + "\n")
            _add_native_line(self, data_class.get_native_data(), data_class, OUTFILE)
            for strategy in self.strategies:
                outname = strategy
                decoys = scores[0].get_strategy_data(strategy).keys()
                _add_recovery_line(self, outname, decoys, data_class, OUTFILE, strategy)
                for score in scores:
                    if isinstance(score, DecoyData): pass
                    if score.name == "combined_str_score": continue

                    top_decoys = score.get_ordered_decoy_list(strategy, top_n)
                    outname = strategy + "_" + score.get_outname() + "_top_" + repr(top_n)
                    _add_recovery_line(self, outname, top_decoys, data_class, OUTFILE, strategy)
            OUTFILE.close()


        # Combined + Top Values
        if self.combined_analysis.get():
            outdir = self._setup_outdir_combined(['enrichment'])
            OUTFILE = open(outdir + "/" + "cdr_type_recoveries_all_" + alignment_type + "_.txt", 'w')
            OUTFILE.write(_get_header(self, data_class) + "\n")
            _add_native_line(self, data_class.get_native_data(), data_class, OUTFILE)
            outname = "combined_all"
            _add_recovery_line(self, outname, scores[0].get_concatonated_map().keys(), data_class, OUTFILE)
            for score in scores:
                if score.name == "combined_str_score": continue
                top_decoys = score.get_ordered_decoy_list_all(top_n)
                outname = "combined_" + score.get_outname() + "_top_" + repr(top_n)
                # print "Top: "+repr(top_decoys)
                _add_recovery_line(self, outname, top_decoys, data_class, OUTFILE)
            OUTFILE.close()
        print "Complete"

[docs]    def output_len_or_clus_enrichment(self, alignment_type, features_type='antibody'):


        def _add_enrichments(self, label, decoys, type_data):
            if isinstance(data_class, CDRData): pass
            #OUTFILE = open(outdir + "/" + "cdr_type_enrichment_" + alignment_type + "_.txt", 'w')

            cdr_names = [cdr for cdr in type_data.cdrs if cdr in self._setup_cdrs()]
            for cdr in cdr_names:
                enrichments = calculate_enrichments(data_class, cdr, decoys)
                OUTFILE = open(outdir+"/" + "cdr_type_enrichment_"+alignment_type+"_"+label+"_"+cdr+".txt", 'w')
                OUTFILE.write("#group\tcount\tperc\n")


                for c in sorted([[enrichments[c].count, c] for c in enrichments], reverse=True):
                    t = c[1]; #c[0] is the actual counts we are sorting on.
                    perc = enrichments[t]
                    if isinstance(perc, Perc): pass

                    p = perc.get_perc_decimal()
                    OUTFILE.write(str(t)+"\t\t"+str(perc.count)+"\t\t"+perc.get_formated_perc(p)+"\n")

                OUTFILE.close()


        ##Individual Enrichments

        top_n = self.top_n.get()
        data_class = self._setup_cdr_types(alignment_type, self.is_camelid.get(), features_type)

        scores = self._setup_scores(features_type)

        if self.individual_analysis.get():
            outdir = self._setup_outdir_individual(["enrichment"])
            if isinstance(data_class, CDRData): pass



            for strategy in self.strategies:
                outname = strategy
                decoys = scores[0].get_strategy_data(strategy).keys()
                _add_enrichments(self, outname, decoys, data_class)

                for score in scores:
                    if isinstance(score, DecoyData): pass
                    if score.name == "combined_str_score": continue

                    top_decoys = score.get_ordered_decoy_list(strategy, top_n)
                    outname = strategy + "_" + score.get_outname() + "_top_" + repr(top_n)
                    _add_enrichments(self, outname, top_decoys, data_class)


        #Combined Enrichments
        if self.combined_analysis.get():
            outdir = self._setup_outdir_combined(["enrichment"])
            outname = "combined_all"
            _add_enrichments(self, outname, scores[0].get_concatonated_map().keys(), data_class)
            for score in scores:
                if score.name == "combined_str_score": continue
                top_decoys = score.get_ordered_decoy_list_all(top_n)
                outname = "combined_" + score.get_outname() + "_top_" + repr(top_n)
                # print "Top: "+repr(top_decoys)
                _add_enrichments(self, outname, top_decoys, data_class)

        print "Complete"


[docs]    def create_score_subset_database(self, score_name, prefix, features_type='antibody'):
        self._setup_scores()
        score = self._get_score(score_name)
        if not isinstance(score, DecoyData):
            print "Score type not found!"
            return

        if not self.out_dir_name.get():
            print "No root name set!"
            return

        top_n = self.top_n.get()
        fdir = os.path.split(os.path.abspath(__file__))[0] + "/xml_scripts"

        for strategy in self.strategies:
            temp_name = "temp_PDBLIST.txt"
            OUTFILE = open(temp_name, 'w')
            decoys = score.get_ordered_decoy_list(strategy, top_n)
            for decoy in decoys:
                OUTFILE.write(decoy + "\n")
            OUTFILE.close()

            out_db_name = prefix + "_" + strategy
            out_db_batch = "Subset"

            # This should be done manually by getting struct_id and copying all the data in a new database via python sqlite3
            # I do not have time to figure that out and get it working right now, so this will have to do.

            analyze_strat.create_features_db(temp_name, fdir, features_type + "_features", self.rosetta_extension.get(),
                                             self.scorefxn, out_db_name, out_db_batch, self.db_dir.get(), False)

            os.remove(temp_name)

class Perc(object):
    """
    Simple class for holding enrichment/recovery information: a count, the total it was
    counted out of, and the resulting fraction.
    """

    def __init__(self, count, total):
        """
        :param count: number of matching items.
        :param total: number of items examined; may be 0.
        """
        self.count = count
        self.total = total
        self._calc_per()

    def _calc_per(self):
        # An empty sample has no matches: store 0.0 instead of raising ZeroDivisionError.
        total = float(self.total)
        self.perc = self.count / total if total else 0.0

    def __repr__(self):
        return "Perc(count=%r, total=%r)" % (self.count, self.total)

    def get_count(self):
        """Return the number of matching items."""
        return self.count

    def get_total(self):
        """Return the number of items examined."""
        return self.total

    def get_perc_decimal(self):
        """Return the fraction count/total as a float (0.0 when total is 0)."""
        return self.perc

    def get_perc_whole(self):
        """Return the fraction scaled to a percentage (fraction * 100)."""
        return self.perc * 100

    def get_formated_perc(self, perc=None):
        """
        Format a value with three decimal places.

        :param perc: value to format; defaults to this object's own fraction.
        :return: the formatted string, e.g. ``"0.250"``.
        """
        if perc is None:
            perc = self.perc
        return "%.3f" % perc


########################################################################################################################
### Helper Functions
########################################################################################################################
def get_str(value):
    """
    Return a display string for a value.

    Strings are returned unchanged; anything else is formatted as a number with three
    decimal places.

    :param value: a string or a number.
    :return: ``value`` itself if it is a string, otherwise ``"%.3f" % value``.
    """
    try:
        text_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        text_types = str

    # isinstance also accepts string subclasses, which an exact type comparison rejects.
    if isinstance(value, text_types):
        return value
    return "%.3f" % value


def calculate_recovery(native_data, all_decoy_data, cdr, decoy_list=None):
    """
    Calculate how often decoys recover the native value for a CDR.

    :param native_data: supplies the reference value through ``get_value_for_cdr(cdr)``.
    :type native_data: CDRDataInfo
    :param all_decoy_data: supplies the decoy name -> per-decoy data map through
        ``get_concatonated_map()``.
    :type all_decoy_data: CDRData
    :param cdr: CDR identifier passed to ``get_value_for_cdr``.
    :param decoy_list: names of the decoys to examine. When None or empty, every decoy in
        the map is examined.
    :return: the number of decoys matching the native value, out of the number examined.
    :rtype: Perc
    """
    raw_decoy_map = all_decoy_data.get_concatonated_map()
    if not decoy_list:
        decoy_list = raw_decoy_map.keys()

    # The native value is the same for every decoy, so look it up once.
    native_value = native_data.get_value_for_cdr(cdr)
    count = sum(1 for decoy in decoy_list
                if raw_decoy_map[decoy].get_value_for_cdr(cdr) == native_value)

    return Perc(count, len(decoy_list))

def calculate_enrichments(all_decoy_data, cdr, decoy_list = None):
    """
    Tally how often each value of a CDR occurs among a set of decoys.

    :param all_decoy_data: supplies the decoy name -> per-decoy data map through
        ``get_concatonated_map()``.
    :type all_decoy_data: CDRData
    :param cdr: CDR identifier passed to ``get_value_for_cdr``.
    :param decoy_list: names of the decoys to examine. When None or empty, every decoy in
        the map is examined.
    :return: defaultdict of [count_type] : Perc, one entry per observed value.
    """
    decoy_map = all_decoy_data.get_concatonated_map()
    names = decoy_list if decoy_list else decoy_map.keys()

    # First pass: raw occurrence counts per observed value.
    tallies = defaultdict(int)
    for name in names:
        observed = decoy_map[name].get_value_for_cdr(cdr)
        tallies[observed] += 1

    # Second pass: wrap each count together with the number of decoys examined.
    n_decoys = len(names)
    enrichments = defaultdict()
    for observed, tally in tallies.items():
        enrichments[observed] = Perc(tally, n_decoys)

    return enrichments

def calculate_observed_value(value, all_decoy_data, cdr, decoy_list=None):
    """
    Count how many decoys carry a given value for a CDR.

    :param value: the value to look for.
    :param all_decoy_data: supplies the decoy name -> per-decoy data map through
        ``get_concatonated_map()``.
    :type all_decoy_data: CDRData
    :param cdr: CDR identifier passed to ``get_value_for_cdr``.
    :param decoy_list: names of the decoys to examine. When None or empty, every decoy in
        the map is examined.
    :return: the number of decoys whose value equals ``value``, out of the number examined.
    :rtype: Perc
    """
    decoy_map = all_decoy_data.get_concatonated_map()
    names = decoy_list or decoy_map.keys()

    matching = [name for name in names
                if decoy_map[name].get_value_for_cdr(cdr) == value]

    return Perc(len(matching), len(names))


def count_native_matches(decoy_data, native_data, cdrs):
    """
    Count the CDRs for which a decoy has the same value as the native.

    :param decoy_data: decoy data queried through ``get_value_for_cdr``.
    :type decoy_data: CDRDataInfo
    :param native_data: native data queried through ``get_value_for_cdr``.
    :type native_data: CDRDataInfo
    :param cdrs: iterable of CDR identifiers to compare.
    :return: number of CDRs whose native and decoy values are equal.
    """
    return sum(1 for cdr in cdrs
               if native_data.get_value_for_cdr(cdr) == decoy_data.get_value_for_cdr(cdr))


def get_star_if_native(decoy_data, native_data, cdr):
    """
    Return ``"*"`` when the decoy matches the native for a CDR, otherwise the decoy's value.

    :param decoy_data: decoy data queried through ``get_value_for_cdr``.
    :param native_data: native data queried through ``get_value_for_cdr``.
    :param cdr: CDR identifier to compare.
    :return: ``"*"`` on a match, else whatever the decoy reports for ``cdr``.
    """
    native_value = native_data.get_value_for_cdr(cdr)
    decoy_value = decoy_data.get_value_for_cdr(cdr)
    return "*" if native_value == decoy_value else decoy_value