Instacart - Feature Engineering

14 minute read

In this notebook we will mostly use Pandas to engineer features from the Instacart Market Basket Analysis Kaggle competition. In the original Kaggle competition, the model had to predict the products that would be reordered in the last order for each customer.

In this notebook we will engineer features in such a way that we can build a simple reorder-prediction model in which the time-series data is flattened, so that we do not have to use computationally expensive recurrent neural network architectures such as LSTMs.

Support Functions

from IPython.display import Markdown, display
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats
#----------------------------------------------------------------------
# Functions to load datasets into memory using space efficient data types.

def load_orders(path):
    def convert_eval_set(value):
        if 'prior' == value:
            return np.uint8(1)
        elif 'train' == value:
            return np.uint8(2)
        else:
            return np.uint8(3) # 'test'

    def convert_days_since_prior_order(value):
        # 30 is the maximum value
        if '' == value:
            return np.int8(-1)
        else:
            return np.int8(np.float(value))

    orders = pd.read_csv(path,
                         dtype={'order_id': np.uint32,
                                'user_id': np.uint32,
                                'order_number': np.uint8,
                                'order_dow': np.uint8,
                                'order_hour_of_day': np.uint8},
                         converters={'eval_set':convert_eval_set,
                                     'days_since_prior_order':convert_days_since_prior_order})

    orders = orders.astype({'eval_set': np.uint8, 
                            'days_since_prior_order': np.int8})
    
    return orders

def load_orders_prods(path):
    """Load the order->product rows using compact unsigned-integer dtypes."""
    column_types = {
        'order_id': np.uint32,
        'product_id': np.uint32,
        'add_to_cart_order': np.uint8,
        'reordered': np.uint8,
    }
    return pd.read_csv(path, dtype=column_types)

def load_products(path):
    """Load the product lookup table with compact dtypes.

    product_id is stored as uint32 to stay consistent with load_orders_prods
    (the previous uint16 would overflow if a product id ever exceeded 65535).
    """
    return pd.read_csv(path, dtype={'product_id': np.uint32,
                                    'aisle_id': np.uint8,
                                    'department_id': np.uint8})

def load_aisles(path):
    """Load the aisle lookup table, keeping aisle_id as a compact uint8."""
    dtypes = {'aisle_id': np.uint8}
    return pd.read_csv(path, dtype=dtypes)

def load_depts(path):
    """Load the department lookup table, keeping department_id as uint8."""
    dtypes = {'department_id': np.uint8}
    return pd.read_csv(path, dtype=dtypes)

#----------------------------------------------------------------------
# Functions to retrieve orders and products for a specific customer

def GetOrdersList(user_id, exclude_first_order=False):
    """Return the order_ids of a user's prior orders, oldest first.

    Set exclude_first_order=True to drop the user's very first order.
    """
    print("Retrieving orders for user ID: {0}".format(user_id))
    # BUG FIX: load_orders() encodes eval_set as uint8 (prior -> 1), so the
    # original comparison against the string 'prior' never matched any row
    # and this function always returned an empty Series. Compare against 1,
    # matching the convention used for prior_orders_only elsewhere.
    mask = (orders.user_id == user_id) & (orders.eval_set == 1)
    if exclude_first_order:
        mask &= orders.order_number > 1
    return orders[mask].sort_values('order_number', ascending=True).order_id

def GetProductsForOrder(order_id, prior_orders=True):
    """Return the product rows for one order.

    Reads from the prior-order table by default, or from the last-order
    table when prior_orders is False.
    """
    source = prior_prods if prior_orders else last_prods
    return source[source.order_id == order_id]
    
def GetProductsForAllOrdersByCust(user_id, unique_products_only=False, exclude_first_order=False):
    """Return all prior-order product rows for one customer.

    With unique_products_only=True, return only the distinct product ids.
    """
    order_ids = GetOrdersList(user_id, exclude_first_order)
    rows = prior_prods[prior_prods.order_id.isin(order_ids)]
    if unique_products_only:
        return rows.product_id.unique()
    return rows

#----------------------------------------------------------------------
# Function to generate markdown output
# Ref: https://stackoverflow.com/a/32035217
def printmd(string):
    """Render *string* as Markdown in the notebook output cell."""
    rendered = Markdown(string)
    display(rendered)

Load Training Data

# Load the split validation-set orders and their prior order->product rows,
# plus the static product/aisle/department lookup tables.
orders = load_orders('data/split/sf_val_set_orders.csv')
orders_prods = load_orders_prods('data/split/sf_val_set_prior_order_products.csv')
products = load_products('data/original/products.csv')
aisles = load_aisles('data/original/aisles.csv')
depts = load_depts('data/original/departments.csv')
# Split the order history: load_orders() encodes eval_set as uint8
# (1 = 'prior', 2 = 'train', 3 = 'test').
prior_orders_only = orders[(1 == orders.eval_set)]
# Each customer's single final ('train'/'test') order.
final_orders_only = orders[(1 != orders.eval_set)]

Meta Features - Mean Order Length and Mean Reorder Ratio Per Customer

# Compute mean order length per customer.

# add_to_cart_order is a running position within the basket, so its max is the
# basket size. NOTE(review): assumes add_to_cart_order is a contiguous 1..n
# sequence per order -- confirm against the raw data.
orders_length = orders_prods.groupby('order_id').add_to_cart_order.max().reset_index()
orders_length.rename(columns={'add_to_cart_order': 'total_items_ordered'}, inplace=True)
# Attach user_id so basket sizes can be averaged per customer.
orders_length_merge = orders_length.merge(prior_orders_only[['order_id','user_id']], on='order_id')
# merge can upcast key columns; restore the compact uint32 order_id.
orders_length_merge['order_id'] = orders_length_merge.order_id.astype(np.uint32)

mean_order_length_per_customer = orders_length_merge.groupby('user_id').total_items_ordered.mean().round().reset_index()
mean_order_length_per_customer['user_id'] = mean_order_length_per_customer.user_id.astype(np.uint32)
mean_order_length_per_customer.rename(columns={'total_items_ordered': 'mean_order_length'}, inplace=True)
mean_order_length_per_customer['mean_order_length'] = mean_order_length_per_customer.mean_order_length.astype(np.uint16)

del orders_length_merge

# Compute mean reorder ratio per customer.

# For each order compute ratio of re-ordered items to total ordered items.
orders_reorder_ratio = orders_prods.groupby('order_id').reordered.sum() / orders_length.set_index('order_id').total_items_ordered
orders_reorder_ratio = orders_reorder_ratio.reset_index()

del orders_length

# Exclude first orders, since none of the products have been ordered yet,
# and so reordered ratio would be zero, thus skewing the mean reorder ratio
# both overall and per user.
orders_reorder_ratio = orders_reorder_ratio.merge(prior_orders_only[prior_orders_only.order_number > 1], on='order_id')
# Dividing two differently named Series yields an unnamed result, so the
# ratio arrives as column 0; give it a proper name.
orders_reorder_ratio.rename(columns={0: 'reorder_ratio'}, inplace=True)
orders_reorder_ratio['order_id'] = orders_reorder_ratio.order_id.astype(np.uint32)

mean_reorder_ratio_per_customer = orders_reorder_ratio.groupby('user_id').reorder_ratio.mean().reset_index()
mean_reorder_ratio_per_customer['user_id'] = mean_reorder_ratio_per_customer.user_id.astype(np.uint32)
mean_reorder_ratio_per_customer.rename(columns={'reorder_ratio': 'mean_reorder_ratio'}, inplace=True)
# float16 keeps the feature table small; the precision loss is acceptable here.
mean_reorder_ratio_per_customer['mean_reorder_ratio'] = mean_reorder_ratio_per_customer.mean_reorder_ratio.astype(np.float16)

del orders_reorder_ratio
# Preview the per-customer mean basket size.
mean_order_length_per_customer.head()
user_id mean_order_length
0 1 6
1 2 14
2 5 9
3 7 10
4 8 16
# Preview the per-customer mean reorder ratio feature.
mean_reorder_ratio_per_customer.head()
user_id mean_reorder_ratio
0 1 0.784180
1 2 0.482422
2 5 0.503906
3 7 0.793945
4 8 0.458984

Feature Engineering

Merge Prior Orders, Ordered Products, Aisles and Departments

# Flatten the history: one row per (order, product) with the order's metadata
# plus the product's aisle and department ids attached.
flat_order_prods = orders_prods.merge(prior_orders_only, on='order_id')
flat_order_prods = flat_order_prods.merge(products[['product_id','aisle_id','department_id']], on='product_id')
flat_order_prods.head()
order_id product_id add_to_cart_order reordered user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order aisle_id department_id
0 6 40462 1 0 22352 1 4 1 12 30 31 7
1 21553 40462 1 1 31136 1 13 1 13 25 31 7
2 208931 40462 1 1 7137 1 4 2 9 6 31 7
3 424301 40462 1 1 31136 1 7 2 11 12 31 7
4 455382 40462 20 0 33665 1 2 0 19 7 31 7

Product Probabilities per Customer

# Per-customer product distribution with Laplace (add-one) smoothing:
# P(product | user) ~ (reorder count + 1) / per-user total of smoothed counts.
product_probs_per_cust = flat_order_prods.groupby(['user_id','product_id']).reordered.sum() + 1
# The (user_id, product_id)-indexed Series divided by the user_id-indexed
# totals aligns on the shared user_id index level.
product_probs_per_cust = product_probs_per_cust / product_probs_per_cust.reset_index().groupby(['user_id']).reordered.sum()
product_probs_per_cust = product_probs_per_cust.reset_index()
product_probs_per_cust['user_id'] = product_probs_per_cust.user_id.astype(np.uint32)
product_probs_per_cust['product_id'] = product_probs_per_cust.product_id.astype(np.uint32)
product_probs_per_cust.rename(columns={'reordered': 'prob'}, inplace=True)
product_probs_per_cust.head()
user_id product_id prob
0 1 196 0.169492
1 1 10258 0.152542
2 1 10326 0.016949
3 1 12427 0.169492
4 1 13032 0.050847

Aisle Probabilities per Customer

# Per-customer aisle distribution with Laplace (add-one) smoothing, built the
# same way as the per-product probabilities.
aisle_probs_per_cust = flat_order_prods.groupby(['user_id','aisle_id']).reordered.sum() + 1
# Division aligns the (user_id, aisle_id) Series with the per-user totals on
# the shared user_id index level.
aisle_probs_per_cust = aisle_probs_per_cust / aisle_probs_per_cust.reset_index().groupby(['user_id']).reordered.sum()
aisle_probs_per_cust = aisle_probs_per_cust.reset_index()
aisle_probs_per_cust['user_id'] = aisle_probs_per_cust.user_id.astype(np.uint32)
aisle_probs_per_cust['aisle_id'] = aisle_probs_per_cust.aisle_id.astype(np.uint8)
aisle_probs_per_cust.rename(columns={'reordered': 'prob'}, inplace=True)
aisle_probs_per_cust.head()
user_id aisle_id prob
0 1 21 0.150943
1 1 23 0.207547
2 1 24 0.037736
3 1 45 0.018868
4 1 53 0.037736

Department Probabilities per Customer

# Per-customer department distribution with Laplace (add-one) smoothing.
dept_reorder_counts = flat_order_prods.groupby(['user_id','department_id']).reordered.sum() + 1
per_user_totals = dept_reorder_counts.reset_index().groupby(['user_id']).reordered.sum()
# Division aligns the (user_id, department_id) counts with the per-user
# totals on the shared user_id index level.
dept_probs_per_cust = (dept_reorder_counts / per_user_totals).reset_index()
dept_probs_per_cust['user_id'] = dept_probs_per_cust.user_id.astype(np.uint32)
dept_probs_per_cust['department_id'] = dept_probs_per_cust.department_id.astype(np.uint8)
dept_probs_per_cust.rename(columns={'reordered': 'prob'}, inplace=True)
dept_probs_per_cust.head()
user_id department_id prob
0 1 4 0.041667
1 1 7 0.250000
2 1 13 0.020833
3 1 14 0.062500
4 1 16 0.187500

Days Since First Order (DSFO) per Order per Customer

# Days since first order (DSFO) for every prior order of every customer.
DSFO_popc = prior_orders_only.copy()
# Running total of the day gaps per user. Each user's first order carries the
# -1 sentinel (from load_orders), and that -1 is included in every cumulative
# sum, so adding 1 cancels it: first order -> 0, later orders -> true days
# since the first order.
# NOTE(review): assumes rows are ordered by order_number within each user,
# as in the original Instacart CSV -- confirm for the split files.
DSFO_popc['DSFO'] = DSFO_popc.groupby(['user_id']).days_since_prior_order.cumsum() + 1
DSFO_popc['DSFO'] = DSFO_popc.DSFO.astype(np.uint16)
# Keep only order_id, user_id and the new DSFO feature.
del DSFO_popc['eval_set']
del DSFO_popc['order_number']
del DSFO_popc['order_dow']
del DSFO_popc['order_hour_of_day']
del DSFO_popc['days_since_prior_order']
DSFO_popc.head()
order_id user_id DSFO
0 2539329 1 0
1 2398795 1 15
2 473747 1 36
3 2254736 1 65
4 431534 1 93

Max Days Since First Order (DSFO) per Customer

# Total span (in days) of each customer's prior-order history.
max_DSFO_pc = (DSFO_popc.groupby('user_id')
                        .DSFO.max()
                        .reset_index()
                        .rename(columns={'DSFO': 'max_DSFO'}))
max_DSFO_pc.head()
user_id max_DSFO
0 1 176
1 2 198
2 5 40
3 7 203
4 8 60

Number of Orders per Customer

# One row per customer: the highest prior order_number is the order count.
orders_pc = (prior_orders_only.groupby('user_id')
                              .order_number.max()
                              .reset_index()
                              .rename(columns={'order_number': 'number_of_orders'}))
orders_pc['user_id'] = orders_pc.user_id.astype(np.uint32)
orders_pc.head()
user_id number_of_orders
0 1 10
1 2 14
2 5 4
3 7 20
4 8 3

Final Summary for Products Ordered per Customer - Training Data

# days since first order per product per order per customer: attach each
# order's DSFO to every product line of that order.
props_pppc = flat_order_prods[['order_id','product_id','aisle_id','department_id','reordered']].merge(DSFO_popc, on="order_id")

# aggregate to get properties for each product ordered for each customer:
# DSFO_min/DSFO_max are the first/last day (relative to the user's first
# order) the product was bought; reordered_sum counts reorders; aisle_id and
# department_id are constant per product, so max() simply carries them along.
props_pppc = props_pppc.groupby(['user_id','product_id']).agg({'DSFO': [min, max],
                                                               'reordered': sum,
                                                               'aisle_id': max,
                                                               'department_id': max})

# flatten hierarchical column index, e.g. ('DSFO', 'max') -> 'DSFO_max'
props_pppc = props_pppc.reset_index()
props_pppc.columns = ['_'.join(col).strip('_') for col in props_pppc.columns.values]

# add max_DSFO and total orders per customer
props_pppc = props_pppc.merge(max_DSFO_pc, on='user_id')
props_pppc = props_pppc.merge(orders_pc, on='user_id')

# change data types for space efficiency (merges upcast the key columns)
props_pppc['user_id'] = props_pppc.user_id.astype(np.uint32)
props_pppc['product_id'] = props_pppc.product_id.astype(np.uint32)

# rename columns to more proper name following aggregation
props_pppc.rename(columns={'aisle_id_max': 'aisle_id'}, inplace=True)
props_pppc.rename(columns={'department_id_max': 'department_id'}, inplace=True)

# add product probabilities per customer
props_pppc = props_pppc.merge(product_probs_per_cust, on=['user_id','product_id'])
props_pppc.rename(columns={'prob': 'prod_prob'}, inplace=True)

# add aisle probabilities per customer
props_pppc = props_pppc.merge(aisle_probs_per_cust, on=['user_id','aisle_id'])
props_pppc.rename(columns={'prob': 'aisle_prob'}, inplace=True)

# add department probabilities per customer
props_pppc = props_pppc.merge(dept_probs_per_cust, on=['user_id','department_id'])
props_pppc.rename(columns={'prob': 'department_prob'}, inplace=True)

# add days since last order for the customer's final order (one final order
# per customer, so this merge stays one row per (user, product))
props_pppc = props_pppc.merge(final_orders_only[['user_id','days_since_prior_order']], on="user_id")
props_pppc.rename(columns={'days_since_prior_order': 'last_order_DSLO'}, inplace=True)

# compute reorder and recency probability along with the mean days to order each product.
# reorder_prob: Laplace-smoothed fraction of the user's orders containing
# this product.
props_pppc['reorder_prob'] = (props_pppc['reordered_sum'] + 1) / props_pppc['number_of_orders']
# recency_prob: day of last purchase over the total history span (including
# the gap to the final order). The guard is on last_order_DSLO > 0: when the
# final order is same-day (DSLO == 0) the denominator could be 0 for users
# whose whole history is on one day, so the value is conservatively forced
# to 0. (The original comment claimed the guard was on DSFO_max.)
props_pppc['recency_prob'] = (props_pppc['DSFO_max'] / (props_pppc['max_DSFO'] + props_pppc['last_order_DSLO'])).where(props_pppc['last_order_DSLO'] > 0, 0)

# DTOP - days to order product: mean days between repeat purchases. When the
# product was never bought on two different days (DSFO_max == DSFO_min) fall
# back to DSFO_max to avoid a 0/0 NaN.
props_pppc['mean_DTOP'] = ((props_pppc['DSFO_max'] - props_pppc['DSFO_min']) / props_pppc['reordered_sum']).where(props_pppc['DSFO_max'] > props_pppc['DSFO_min'], props_pppc['DSFO_max'])
props_pppc['mean_DTOP'] = props_pppc.mean_DTOP.astype(np.float16)

# compute DTOP probability: days elapsed since the product's last purchase,
# measured in units of its mean repurchase interval.
props_pppc['DTOP_prob'] = ((props_pppc['max_DSFO'] + props_pppc['last_order_DSLO']) - props_pppc['DSFO_max']) / props_pppc['mean_DTOP']
# replace NaNs with zero probability
props_pppc['DTOP_prob'].fillna(0, inplace=True)
# limit maximum value to 1 since it is a probability (this also caps the +inf
# produced when mean_DTOP is 0)
props_pppc['DTOP_prob'] = props_pppc['DTOP_prob'].clip(lower=0, upper=0.9999999)

# change all float64 fields to float16
props_pppc['prod_prob'] = props_pppc.prod_prob.astype(np.float16)
props_pppc['aisle_prob'] = props_pppc.aisle_prob.astype(np.float16)
props_pppc['department_prob'] = props_pppc.department_prob.astype(np.float16)
props_pppc['reorder_prob'] = props_pppc.reorder_prob.astype(np.float16)
props_pppc['recency_prob'] = props_pppc.recency_prob.astype(np.float16)
props_pppc['DTOP_prob'] = props_pppc.DTOP_prob.astype(np.float16)

# add reordered column - setting all to zero - binary field; this is the
# training label, flipped to 1 later for products actually reordered in the
# customer's final order.
props_pppc['reordered'] = 0
props_pppc['reordered'] = props_pppc.reordered.astype(np.uint8)
# drop the columns we no longer need
del props_pppc['DSFO_min']
del props_pppc['DSFO_max']
del props_pppc['reordered_sum']
del props_pppc['aisle_id']
del props_pppc['department_id']
del props_pppc['max_DSFO']
del props_pppc['number_of_orders']
del props_pppc['last_order_DSLO']
del props_pppc['mean_DTOP']

props_pppc.head()
user_id product_id prod_prob aisle_prob department_prob reorder_prob recency_prob DTOP_prob reordered
0 1 196 0.169434 0.22644 0.250000 1.000000 0.926270 0.715820 0
1 1 46149 0.050842 0.22644 0.250000 0.300049 0.926270 0.933105 0
2 1 10258 0.152588 0.16980 0.395752 0.899902 0.926270 0.695801 0
3 1 12427 0.169434 0.20752 0.395752 1.000000 0.926270 0.715820 0
4 1 26088 0.033905 0.20752 0.395752 0.199951 0.078918 1.000000 0

Load Last Order Products to Mark Reordered Products per Customer

# Ground-truth labels: products reordered in each customer's final order.
last_reordered_prods = load_orders_prods('data/split/sf_val_set_last_order_products.csv')
# Keep only the rows flagged as reorders, with just the columns we need.
last_reordered_prods = last_reordered_prods[last_reordered_prods.reordered == 1][['order_id','product_id','reordered']]
# Map each final order back to its customer.
last_reordered_prods = final_orders_only[['order_id','user_id']].merge(last_reordered_prods, on="order_id")
last_reordered_prods.head()
order_id user_id product_id reordered
0 1187899 1 196 1
1 1187899 1 25133 1
2 1187899 1 38928 1
3 1187899 1 26405 1
4 1187899 1 39657 1
# Flip the label to 1 for every (user, product) pair that shows up among the
# reorders of that user's final order.
join_cols = ['user_id', 'product_id']
feature_keys = pd.MultiIndex.from_frame(props_pppc[join_cols])
label_keys = pd.MultiIndex.from_frame(last_reordered_prods[join_cols])
props_pppc.loc[feature_keys.isin(label_keys), 'reordered'] = 1
props_pppc.head()
user_id product_id prod_prob aisle_prob department_prob reorder_prob recency_prob DTOP_prob reordered
0 1 196 0.169434 0.22644 0.250000 1.000000 0.926270 0.715820 1
1 1 46149 0.050842 0.22644 0.250000 0.300049 0.926270 0.933105 1
2 1 10258 0.152588 0.16980 0.395752 0.899902 0.926270 0.695801 1
3 1 12427 0.169434 0.20752 0.395752 1.000000 0.926270 0.715820 0
4 1 26088 0.033905 0.20752 0.395752 0.199951 0.078918 1.000000 1
# Persist the flattened feature table.
# NOTE(review): the default integer index is written as an extra unnamed
# column; pass index=False if that column is unwanted on reload -- confirm
# what the downstream loader expects.
props_pppc.to_csv('validation.csv')