Source code for jade.basic.pandas.PandasDataFrame

import pandas as pd
import os



class GeneralPandasDataFrame(pd.DataFrame):

    def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
        pd.DataFrame.__init__(self, data=data, index=index, columns=columns, dtype=dtype, copy=copy)

    def drop_duplicate_columns(self):
        """
        Drop duplicate columns from the DataFrame in place.

        :return:
        """
        # I'm not sure how to do this in place without reassigning self.
        # If you know, please edit this.
        self = self.T.groupby(level=0).first().T

    def detect_numeric(self):
        self = self.infer_objects()

    def get_columns(self, columns):
        return self[columns]

    def get_matches(self, column, to_match):
        """
        Get all the rows that match a particular element of a column.

        :param column: str
        :param to_match: str
        :rtype: pandas.DataFrame
        """
        return self[self[column] == to_match]

    def get_row_matches(self, column1, to_match, column2):
        """
        Get the elements of the rows that match a particular column.
        If one element, this can be converted easily enough.

        :param column1: str
        :param to_match: str
        :param column2: str
        :rtype: pandas.Series
        """
        return self[self[column1] == to_match][column2]

    def n_matches(self, column, to_match):
        """
        Return the number of matches.

        :param column: str
        :param to_match: str
        :rtype: int
        """
        return len(self.get_matches(column, to_match))

    def to_tsv(self, path_or_buf=None, na_rep='', float_format=None, columns=None,
               header=True, index=True, index_label=None, mode='w', encoding=None,
               compression=None, quoting=None, quotechar='"', line_terminator='\n',
               chunksize=None, tupleize_cols=False, date_format=None, doublequote=True,
               escapechar=None, decimal='.'):
        self.to_csv(sep="\t", path_or_buf=path_or_buf, na_rep=na_rep, float_format=float_format,
                    columns=columns, header=header, index=index, index_label=index_label,
                    mode=mode, encoding=encoding, compression=compression, quoting=quoting,
                    quotechar=quotechar, line_terminator=line_terminator, chunksize=chunksize,
                    tupleize_cols=tupleize_cols, date_format=date_format, doublequote=doublequote,
                    escapechar=escapechar, decimal=decimal)
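
# A minimal usage sketch for the class above. The data and column names here are
# made up for illustration and are not part of the module:
#
#   frame = GeneralPandasDataFrame({"name": ["a", "b", "b"], "score": [1, 2, 3]})
#   b_rows = frame.get_matches("name", "b")                  # rows where name == "b"
#   b_scores = frame.get_row_matches("name", "b", "score")   # the scores of those rows
#   frame.n_matches("name", "b")                             # 2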

def multi_tab_excel(df_list, sheet_list, file_name):
    """
    Writes multiple dataframes as separate sheets in an output Excel file.
    If the directory of the output does not exist, it will create it.

    Author: Tom Dobbs
    http://stackoverflow.com/questions/32957441/putting-many-python-pandas-dataframes-to-one-excel-worksheet

    :param df_list: [pd.DataFrame]
    :param sheet_list: [str]
    :param file_name: str
    """
    if not os.path.exists(os.path.dirname(file_name)):
        os.mkdir(os.path.dirname(file_name))

    writer = pd.ExcelWriter(file_name, engine='xlsxwriter')
    for dataframe, sheet in zip(df_list, sheet_list):
        dataframe.to_excel(writer, sheet_name=sheet, startrow=0, startcol=0)
    writer.save()
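
# Example call for multi_tab_excel. This is a sketch: the frames, sheet names, and
# output path below are hypothetical, and the xlsxwriter engine must be installed:
#
#   scores = pd.DataFrame({"id": [1, 2], "score": [0.4, 0.9]})
#   notes = pd.DataFrame({"id": [1, 2], "note": ["ok", "check"]})
#   multi_tab_excel([scores, notes], ["scores", "notes"], "results/summary.xlsx")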

def drop_duplicate_columns(df):
    """
    Drop duplicate columns from the DataFrame. Return the resulting DataFrame.

    :param df: pandas.DataFrame
    :rtype: pandas.DataFrame
    """
    return df.T.groupby(level=0).first().T
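
# Sketch of drop_duplicate_columns on a frame with a repeated column label
# (example data only). Only the first column for each duplicated label is kept:
#
#   df = pd.DataFrame([[1, 1, 2]], columns=["a", "a", "b"])
#   drop_duplicate_columns(df)   # columns are now ["a", "b"]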

def detect_numeric(df):
    """
    Detect numeric components.

    :param df: pd.DataFrame
    :rtype: pd.DataFrame
    """
    # return df.convert_objects(convert_numeric=True)
    return df.infer_objects()

def get_columns(df, columns):
    """
    Get a new dataframe of only the columns.

    :param df: pandas.DataFrame
    :param columns: list
    :rtype: pd.DataFrame
    """
    return df[columns]

def get_matches(df, column, to_match):
    """
    Get all the rows that match a particular element of a column.

    :param df: pandas.DataFrame
    :param column: str
    :param to_match: str
    :rtype: pd.DataFrame
    """
    return df[df[column] == to_match]

def get_multiple_matches(df, column, to_match_array):
    """
    Get all the rows that match any of the values in to_match_array.

    :param df: pandas.DataFrame
    :param column: str
    :param to_match_array: list
    :rtype: pd.DataFrame
    """
    return df[df[column].isin(to_match_array)]

def get_match_by_array(df, column, match_array):
    """
    Get a new dataframe of all rows whose column value is in the subset series, match_array.

    Note: This will result in a dataframe, but there may be strange issues
    when you go to plot the data in seaborn. No idea why.

    :param df: pd.DataFrame
    :param column: str
    :param match_array: pd.Series
    :rtype: pd.DataFrame
    """
    new_df = df[df[column].isin(match_array)]
    return new_df

def get_row_matches(df, column1, to_match, column2):
    """
    Get the elements of the rows that match a particular column.
    If one element, this can be converted easily enough.

    :param df: pd.DataFrame
    :param column1: str
    :param to_match: str
    :param column2: str
    :rtype: pd.Series
    """
    return df[df[column1] == to_match][column2]

def get_value(df, column):
    """
    Get a single value from a one-row df.
    This is to help with implicit docs, since the iloc syntax is so awkward.

    :param df: pd.DataFrame
    :param column: str
    :return: value
    """
    return df.iloc[0][column]

def get_n_matches(df, column, to_match):
    """
    Get the number of matches.

    :param df: pd.DataFrame
    :param column: str
    :param to_match:
    :rtype: int
    """
    return len(get_matches(df, column, to_match))
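
# The match helpers compose naturally. Column names and values below are
# illustrative only:
#
#   df = pd.DataFrame({"decoy": ["d1", "d2", "d1"], "score": [1.0, 2.0, 3.0]})
#   get_matches(df, "decoy", "d1")               # the two d1 rows
#   get_row_matches(df, "decoy", "d2", "score")  # Series holding the single score 2.0
#   get_n_matches(df, "decoy", "d1")             # 2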

def sort_on_list(df, column, sort_order):
    """
    Given a list of values and a column, create a new dataframe sorted in that order.
    No idea why this is so difficult.

    :param df: pd.DataFrame
    :param column: str
    :param sort_order: list
    :rtype: pd.DataFrame
    """
    # Sort:
    sep = []
    for o in sort_order:
        sep.append(df[df[column].isin([o])])
    return pd.concat(sep).reset_index()
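
# Example of sort_on_list. The frame and sort order below are illustrative;
# any value missing from the frame simply contributes no rows:
#
#   df = pd.DataFrame({"id": ["b", "a", "c"], "score": [2, 1, 3]})
#   sort_on_list(df, "id", ["c", "a", "b"])   # rows reordered c, a, b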