Pandas DL Code Snippet



Label Encoder

from sklearn.preprocessing import LabelEncoder

# col_enc: list of column names to encode
for f in col_enc:
    if total_df[f].dtype == 'object':
        print(f)
        lbl = LabelEncoder()
        # fit on the string-cast values so NaN and mixed types map consistently
        lbl.fit(list(total_df[f].values.astype('str')))
        total_df[f] = lbl.transform(list(total_df[f].values.astype('str')))
        total_df[f] = total_df[f].astype('category')
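One way to build col_enc, assuming every object/category column should be encoded:

col_enc = total_df.select_dtypes(include=[object, 'category']).columns.tolist()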

Count Encoding with Value Counts Map

for col in total_df.columns:
    # map every value to its occurrence count in the column
    ce = total_df[col].value_counts()
    total_df[col + '_Freq'] = total_df[col].map(ce)

Missing Values Table

def missing_values_table(df):# Function to calculate missing values by column# Funct 
    mis_val = df.isnull().sum() # Total missing values
    mis_val_pct = 100 * df.isnull().sum() / len(df)# Percentage of missing values
    mis_val_df = pd.concat([mis_val, mis_val_pct], axis=1)# Make a table with the results
    mis_val_df_cols = mis_val_df.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})# Rename the columns
    mis_val_df_cols = mis_val_df_cols[mis_val_df_cols.iloc[:,1] != 0].sort_values('% of Total Values', ascending=False).round(1)# Sort the table by percentage of missing descending
    print ("Dataframe has " + str(df.shape[1]) + " columns.\n" 
           "There are " + str(mis_val_df_cols.shape[0]) + " cols having missing values.")# Print some summary information
    return mis_val_df_cols # Return the dataframe with missing information
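Example call, assuming the total_df frame from the snippets above:

missing_values_table(total_df)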

Basic Groupby Aggregation

def base_agg(df, cols):
    # base frame: one row per (location, date) pair
    temp_df = df.groupby(['location', 'date']).size().reset_index()[['location', 'date']]
    for i in cols:
        # dict-of-dicts renaming inside .agg() was removed in pandas 1.0,
        # so aggregate first and rename the columns afterwards
        temp = df.groupby(['location', 'date'])[i].agg(['max', 'min', 'mean', 'first', 'last', 'std'])
        temp.columns = ['{}_{}'.format(i, c.upper()) for c in temp.columns]
        temp = temp.reset_index()
        temp['{}_MAX_MIN_diff'.format(i)] = temp['{}_MAX'.format(i)] - temp['{}_MIN'.format(i)]
        temp['{}_FIRST_LAST_diff'.format(i)] = temp['{}_FIRST'.format(i)] - temp['{}_LAST'.format(i)]

        temp_df = temp_df.merge(temp, on=['location', 'date'], how='left')
    return temp_df
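A usage sketch; sensor_df and its temperature/humidity columns are placeholder names:

agg_df = base_agg(sensor_df, ['temperature', 'humidity'])
agg_df[['location', 'date', 'temperature_MAX', 'temperature_MAX_MIN_diff']].head()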

Season-level aggregation, then unstack so each (year, stat) pair becomes one column:

SEASON_S1 = SEASON.groupby(['batter_id','year']).agg({'avg':'mean','G':'sum','AB':'sum','R':'sum','H':'sum','2B':'sum','3B':'sum','HR':'sum','TB':'sum','RBI':'sum','SB':'sum','CS':'sum','BB':'sum','HBP':'sum','SO':'sum','GDP':'sum',
                                                     '1B':'sum','SLG':'mean','OBP':'mean','E':'sum','OPS':'mean'})
SEASON_S1 = SEASON_S1.unstack()
SEASON_S1.columns = SEASON_S1.columns.map('{0[1]}|{0[0]}'.format)
SEASON_S1.head(6)

Renaming during aggregation (dict-of-dicts .agg() is no longer supported; use named aggregation instead):

DF_no = DF.groupby('store_id').agg(last_trans=('transacted_date', 'max'),
                                   END_DATE=('END_DATE', 'max'))
DF_no = DF_no.reset_index()  # order matters: aggregate first, then reset_index
DF_no.head()

Filter

select columns whose names match a regex:

test_df.filter(regex=("station.*")).columns.tolist()

OR operator (regex alternation):

test_df.filter(regex=("bus_route_id|station|in_out.*")).columns.tolist()

str.split

expand=True returns one column per split part; note the double brackets when assigning two columns at once:

aa[['code','name']] = aa['station_id'].str.split("_", expand=True)
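A self-contained sanity check with made-up station values:

import pandas as pd

aa = pd.DataFrame({'station_id': ['101_Gangnam', '102_Seocho']})
aa[['code', 'name']] = aa['station_id'].str.split('_', expand=True)
# code: ['101', '102'], name: ['Gangnam', 'Seocho']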

Isna & Imputation

show rows with any NaN:

df[df.isna().any(axis=1)]

Imputation with loc:

df.loc[df['Rating'].isna(),'Rating'] = 0

Groupby & Transform

transform returns a result aligned to the original rows, so a group statistic can be assigned directly as a new column:

for df in [train_df, test_df]:
    print('Started')
    for col in rideandoff:
        df['uniq_bus_route_{}_sum'.format(col)] = df.groupby('uniq_bus_route')[col].transform('sum')
        df['uniq_bus_route_{}_min'.format(col)] = df.groupby('uniq_bus_route')[col].transform('min')
        df['uniq_bus_route_{}_max'.format(col)] = df.groupby('uniq_bus_route')[col].transform('max')
        df['uniq_bus_route_{}_count'.format(col)] = df.groupby('uniq_bus_route')[col].transform('count')
        df['uniq_bus_route_{}_std'.format(col)] = df.groupby('uniq_bus_route')[col].transform('std')
        print(col)
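A minimal sketch of the agg vs. transform difference on toy data:

import pandas as pd

toy = pd.DataFrame({'route': ['A', 'A', 'B'], 'ride': [1, 2, 10]})
print(toy.groupby('route')['ride'].agg('sum'))        # one row per group: A -> 3, B -> 10
print(toy.groupby('route')['ride'].transform('sum'))  # aligned to rows: [3, 3, 10]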

Shift

rideandoff = test_df.filter(regex=("ride.*")).columns.tolist() + test_df.filter(regex=("takeoff.*")).columns.tolist()

# shift(-1) pulls each group's next row up (a lead); shift(1) would give the previous row (a lag)
shifted_m1 = train_df.groupby("uid_1")[rideandoff].shift(-1)

shifted_m1.columns = 'lag_' + shifted_m1.columns
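To attach the shifted columns back, a concat along axis=1 works since the index is preserved (sketch):

import pandas as pd

train_df = pd.concat([train_df, shifted_m1], axis=1)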

Fill NaN

test1_g[null_list] = test1_g[null_list].fillna(0)

Sum

row-wise with specific columns:

df['row-wise-sum'] = df[col_list].sum(axis=1, skipna=True)

Value Counts unstack

day_list = ['Friday','Monday','Saturday','Sunday','Thursday','Tuesday','Wednesday']
train_df_new12 = train_df.groupby(['store_id','year','month'])['dayofweek'].value_counts(dropna=False, normalize=True).unstack()
# CAUTION: unstacked columns come out in alphabetical order, which is why day_list above is alphabetical
train_df_new12.columns = ['day_ratio' + i for i in day_list]
train_df_new12 = train_df_new12.reset_index()
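To avoid depending on alphabetical ordering at all, reindex the unstacked columns before renaming (sketch):

train_df_new12 = train_df_new12.reindex(columns=day_list)  # enforce the expected order, then rename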

Select_dtypes

total_df.select_dtypes(include=[object,'category']).columns

To Binary

df['flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0})

Frequency Encoding (qcut)

for i in list(train.columns.values[2:202]):
    # bin each feature into 20 quantiles, then encode each bin by its frequency
    binned = pd.qcut(train[i], 20, labels=np.arange(20)).astype(int)
    encoding_i = binned.value_counts() / len(train)
    train[i + '_qcut_FQE'] = binned.map(encoding_i)

Remove Duplicates columns

df = df.loc[:,~df.columns.duplicated()]

Replace inf to zero

df.replace([np.inf, -np.inf], 0, inplace=True)

Get column name of the row-wise minimum

train_df['nearest_obsv'] = train_df[obsv_list].idxmin(axis=1)
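The minimum value itself comes from min(axis=1); nearest_obsv_dist below is a made-up column name:

train_df['nearest_obsv_dist'] = train_df[obsv_list].min(axis=1)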

Consecutive time difference

# per-store gap to the previous transaction; zero gaps are treated as missing and forward-filled
DF_DC['Days'] = (DF_DC.groupby('store_id', group_keys=False)['transacted_date']
                 .apply(lambda s: s.diff().replace(0, np.nan).ffill()))

Select rows with condition

based on value with a specific column:

DF_no[DF_no['store_id'].isin(closed_ids)]

ffill Vs. bfill

import numpy as np
import pandas as pd

df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, 5],
                   [np.nan, 3, np.nan, 4]],
                  columns=list('ABCD'))
print(df)
print(df.ffill())  # fill NaN going forward (fillna(method='ffill') is deprecated)
print(df.bfill())  # fill NaN going backward

Original:

     A    B   C  D
0  NaN  2.0 NaN  0
1  3.0  4.0 NaN  1
2  NaN  NaN NaN  5
3  NaN  3.0 NaN  4

ffill:

     A    B   C  D
0  NaN  2.0 NaN  0
1  3.0  4.0 NaN  1
2  3.0  4.0 NaN  5
3  3.0  3.0 NaN  4

bfill:

     A    B   C  D
0  3.0  2.0 NaN  0
1  3.0  4.0 NaN  1
2  NaN  3.0 NaN  5
3  NaN  3.0 NaN  4
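Both directions accept a limit argument to cap how many consecutive NaNs are filled:

print(df.ffill(limit=1))  # fill at most one NaN below each valid value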

Pandas float_format

pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.float_format', None)  # back to the default

Week number with condition

def conditions(x):
    if x < 8:
        return 1
    elif x < 15:
        return 2
    elif x < 22:
        return 3
    else:
        return 4

func = np.vectorize(conditions)
week_num = func(train_payment["day"])
train_payment["week_num"] = week_num
