Source code for jade.basic.pandas.PandasDataFrame

import pandas as pd
import os



class GeneralPandasDataFrame(pd.DataFrame):
    """
    A pandas.DataFrame subclass with a few convenience methods for
    de-duplicating columns, selecting rows by value and writing
    tab-separated output.

    The selection helpers return whatever the underlying pandas indexing
    operation returns.
    """

    def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
        """
        Build the frame; every argument is forwarded unchanged to pandas.DataFrame.

        :param data: anything accepted by pandas.DataFrame
        :param index: index labels, or None
        :param columns: column labels, or None
        :param dtype: dtype to force, or None
        :param copy: bool
        """
        # The base initialiser has to run on this instance; calling the unbound
        # pd.DataFrame.__init__ without ``self`` raised a TypeError.
        super().__init__(data=data, index=index, columns=columns, dtype=dtype, copy=copy)

    def drop_duplicate_columns(self):
        """
        Drop duplicate columns from the DataFrame in place.

        Columns sharing a label are collapsed into one by grouping the
        transposed frame on its labels and taking the first entry of each group.

        :return: None
        """
        deduplicated = self.T.groupby(level=0).first().T
        # Rebinding the local name ``self`` never reaches the caller's object.
        # ``_update_inplace`` is the hook pandas' own ``inplace=True`` methods
        # use to swap the underlying data of an existing frame.
        self._update_inplace(deduplicated)

    def detect_numeric(self):
        """
        Run DataFrame.infer_objects() and store the result in place.

        :return: None
        """
        # Same in-place mechanism as drop_duplicate_columns: assigning to
        # ``self`` would only rebind a local variable.
        self._update_inplace(self.infer_objects())

    def get_columns(self, columns):
        """
        Select one or more columns.

        :param columns: a column label or a list of column labels
        :return: the result of indexing the frame with ``columns``
        """
        return self[columns]

    def get_matches(self, column, to_match):
        """
        Get all the rows that match a particular element of a column.

        :param column: str
        :param to_match: str
        :rtype: pandas.DataFrame
        """
        return self[self[column] == to_match]

    def get_row_matches(self, column1, to_match, column2):
        """
        Get the values of column2 for the rows where column1 equals to_match.

        :param column1: str
        :param to_match: str
        :param column2: str
        :rtype: pandas.Series
        """
        return self[self[column1] == to_match][column2]

    def n_matches(self, column, to_match):
        """
        Return the number of rows where column equals to_match.

        :param column: str
        :param to_match: str
        :rtype: int
        """
        # Must go through the method: the bare name ``get_matches`` resolves to
        # the module-level function, which expects the frame as first argument.
        return len(self.get_matches(column, to_match))

    def to_tsv(self, path_or_buf=None, na_rep='', float_format=None,
               columns=None, header=True, index=True, index_label=None,
               mode='w', encoding=None, compression=None, quoting=None,
               quotechar='"', line_terminator='\n', chunksize=None,
               tupleize_cols=False, date_format=None, doublequote=True,
               escapechar=None, decimal='.'):
        """
        Write the frame as tab-separated text via DataFrame.to_csv(sep="\\t").

        All parameters are forwarded to DataFrame.to_csv.  ``line_terminator``
        is passed under whichever spelling the installed pandas accepts, and
        ``tupleize_cols`` is forwarded only if the installed pandas still has
        that parameter (otherwise it is ignored).

        :param path_or_buf: output path or buffer, or None
        :return: whatever DataFrame.to_csv returns (the text itself when
                 path_or_buf is None)
        """
        import inspect

        accepted = inspect.signature(pd.DataFrame.to_csv).parameters

        options = dict(
            sep="\t", path_or_buf=path_or_buf, na_rep=na_rep,
            float_format=float_format, columns=columns, header=header,
            index=index, index_label=index_label, mode=mode,
            encoding=encoding, compression=compression, quoting=quoting,
            quotechar=quotechar, chunksize=chunksize, date_format=date_format,
            doublequote=doublequote, escapechar=escapechar, decimal=decimal,
        )

        # pandas renamed ``line_terminator`` to ``lineterminator``; use the
        # spelling the installed version understands.
        if "lineterminator" in accepted:
            options["lineterminator"] = line_terminator
        else:
            options["line_terminator"] = line_terminator

        # ``tupleize_cols`` no longer exists in current pandas; passing it
        # unconditionally raises a TypeError there.
        if "tupleize_cols" in accepted:
            options["tupleize_cols"] = tupleize_cols

        return self.to_csv(**options)

def multi_tab_excel(df_list, sheet_list, file_name):
    """
    Writes multiple dataframes as separate sheets in an output excel file.

    If the directory of the output does not exist, it will be created
    (including any missing parent directories).

    Dataframes and sheet names are paired with zip(), so extra entries in the
    longer of the two lists are ignored.

    Author: Tom Dobbs
    http://stackoverflow.com/questions/32957441/putting-many-python-pandas-dataframes-to-one-excel-worksheet


    :param df_list: [pd.Dataframe]
    :param sheet_list: [str]
    :param file_name: str

    """
    out_dir = os.path.dirname(file_name)
    # dirname is '' for a bare file name; there is nothing to create in that
    # case, and os.mkdir('') would raise.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The context manager writes and closes the workbook on exit;
    # ExcelWriter.save() is not available in current pandas.
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        for dataframe, sheet in zip(df_list, sheet_list):
            dataframe.to_excel(writer, sheet_name=sheet, startrow=0, startcol=0)


def drop_duplicate_columns(df):
    """
    Collapse columns that share a label into a single column.

    The frame is transposed, grouped on its labels, reduced with ``first()``
    and transposed back.  The input frame is not modified.

    :param df: pandas.DataFrame
    :rtype: pandas.DataFrame
    """
    by_label = df.T.groupby(level=0)
    one_per_label = by_label.first()
    return one_per_label.T

def detect_numeric(df):
    """
    Return the result of ``df.infer_objects()``.

    :param df: pd.DataFrame
    :rtype: pd.DataFrame

    """
    inferred = df.infer_objects()
    return inferred

def get_columns(df, columns):
    """
    Select the given columns from the dataframe.

    :param df: pandas.DataFrame
    :param columns: list
    :rtype: pd.DataFrame
    """
    selection = df[columns]
    return selection

def get_matches(df, column, to_match):
    """
    Get all the rows whose value in a column equals a particular element.

    :param df: pandas.DataFrame
    :param column: str
    :param to_match: str
    :rtype: pd.DataFrame
    """
    is_match = df[column] == to_match
    return df[is_match]

def get_multiple_matches(df, column, to_match_array):
    """
    Get all the rows whose value in a column is one of the values in to_match_array.

    :param df: pandas.DataFrame
    :param column: str
    :param to_match_array: list
    :rtype: pd.DataFrame
    """
    is_member = df[column].isin(to_match_array)
    return df[is_member]

def get_match_by_array(df, column, match_array):
    """
    Get the rows of df whose value in a column appears in match_array.

    Note: This will result in a dataframe, but there may be strange issues when you go to plot the data in seaborn
            No idea why.

    :param df: pd.DataFrame
    :param column: str
    :param match_array: pd.Series
    :rtype: pd.DataFrame
    """
    in_subset = df[column].isin(match_array)
    return df[in_subset]


def get_row_matches(df, column1, to_match, column2):
    """
    Get the values of column2 for the rows where column1 equals to_match.

    :param df: pd.DataFrame
    :param column1: str
    :param to_match: str
    :param column2: str
    :rtype: pd.Series
    """
    is_match = df[column1] == to_match
    matching_rows = df[is_match]
    return matching_rows[column2]

def get_value(df, column):
    """
    Get the value of a column from the first row of df.

    Intended for one-row dataframes, as a readable alternative to spelling
    out the iloc syntax.

    :param df: pd.DataFrame
    :param column: str
    :return: value
    """
    first_row = df.iloc[0]
    return first_row[column]

def get_n_matches(df, column, to_match):
    """
    Count the rows whose value in a column equals to_match.

    :param df: pd.DataFrame
    :param column: str
    :param to_match:
    :rtype: int
    """
    matching_rows = get_matches(df, column, to_match)
    return len(matching_rows)

def sort_on_list(df, column, sort_order):
    """
    Create a new dataframe whose rows follow the order of values in sort_order.

    For each value in sort_order, the rows whose ``column`` equals that value
    are collected (in their original relative order) and the pieces are
    concatenated.  Rows whose value is not listed in sort_order are dropped.
    The previous index is kept as a column by ``reset_index()``.

    :param df: pd.DataFrame
    :param column: str, name of the column to order by
    :param sort_order: iterable of values in the desired order
    :rtype: pd.DataFrame
    """
    # Filter on the requested column; this was previously hard-coded to 'id'
    # and the ``column`` argument was ignored.
    pieces = [df[df[column].isin([value])] for value in sort_order]

    # pd.concat raises on an empty list, so an empty sort order yields an
    # empty frame with the same columns instead.
    if not pieces:
        return df.iloc[0:0].reset_index()

    return pd.concat(pieces).reset_index()