Stefan Fiott

Instacart - Feature Engineering

First published: 28 Aug 2017
Last updated: 28 Aug 2017

Introduction

In this notebook we will mostly use Pandas to engineer features from the Instacart Market Basket Analysis Kaggle competition. In the original Kaggle competition, the model had to predict the products that would be reordered in the last order for each customer.

In this notebook we will engineer features in such a way to try and build a simple reordered prediction model in which the time series data is flattened, so to speak, so we do not have to use computationally expensive recurrent neural network architectures such as LSTMs.

Support Functions

In [1]:
from IPython.display import Markdown, display
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats
In [2]:
#----------------------------------------------------------------------
# Functions to load datasets into memory using space efficient data types.

def load_orders(path):
    """Load the orders CSV into a memory-efficient DataFrame.

    The ``eval_set`` strings are encoded as uint8 codes
    (1 = 'prior', 2 = 'train', 3 = 'test'), and a missing
    ``days_since_prior_order`` (blank on each user's first order) is
    encoded as -1 so the column fits in an int8.

    :param path: path or file-like object accepted by ``pd.read_csv``.
    :returns: orders DataFrame with compact integer dtypes.
    """
    def convert_eval_set(value):
        # Map the categorical strings to small integer codes.
        if 'prior' == value:
            return np.uint8(1)
        elif 'train' == value:
            return np.uint8(2)
        else:
            return np.uint8(3) # 'test'

    def convert_days_since_prior_order(value):
        # 30 is the maximum value in the data, so int8 is sufficient.
        # A blank field means "first order for this user"; encode as -1.
        if '' == value:
            return np.int8(-1)
        else:
            # `np.float` was deprecated in NumPy 1.20 and removed in 1.24;
            # the builtin float is the correct replacement.
            return np.int8(float(value))

    orders = pd.read_csv(path,
                         dtype={'order_id': np.uint32,
                                'user_id': np.uint32,
                                'order_number': np.uint8,
                                'order_dow': np.uint8,
                                'order_hour_of_day': np.uint8},
                         converters={'eval_set':convert_eval_set,
                                     'days_since_prior_order':convert_days_since_prior_order})

    # Converter-produced columns come back as object dtype;
    # downcast them explicitly to the intended integer types.
    orders = orders.astype({'eval_set': np.uint8, 
                            'days_since_prior_order': np.int8})
    
    return orders

def load_orders_prods(path):
    """Load an order-products CSV using compact unsigned integer dtypes."""
    dtypes = {'order_id': np.uint32,
              'product_id': np.uint32,
              'add_to_cart_order': np.uint8,
              'reordered': np.uint8}
    return pd.read_csv(path, dtype=dtypes)

def load_products(path):
    """Load the products lookup CSV using compact unsigned integer dtypes.

    NOTE(review): product_id is uint16 here but uint32 in
    load_orders_prods — presumably both fit the ID range; confirm.
    """
    dtypes = {'product_id': np.uint16,
              'aisle_id': np.uint8,
              'department_id': np.uint8}
    return pd.read_csv(path, dtype=dtypes)

def load_aisles(path):
    """Load the aisles lookup CSV, storing aisle_id as uint8."""
    return pd.read_csv(path, dtype=dict(aisle_id=np.uint8))

def load_depts(path):
    """Load the departments lookup CSV, storing department_id as uint8."""
    return pd.read_csv(path, dtype=dict(department_id=np.uint8))

#----------------------------------------------------------------------
# Functions to retrieve orders and products for a specific customer

def GetOrdersList(user_id, exclude_first_order=False):
    """Return a user's prior-order IDs, sorted by order_number ascending.

    Bug fix: ``orders.eval_set`` holds uint8 codes (1 = 'prior', see
    load_orders), so comparing it against the string 'prior' matched
    nothing. Compare against the numeric code instead, consistent with
    the ``1 == orders.eval_set`` filter used elsewhere in this notebook.

    :param user_id: customer ID to look up in the global ``orders`` frame.
    :param exclude_first_order: drop the user's first order when True.
    :returns: Series of order_id values in chronological order.
    """
    print("Retrieving orders for user ID: {0}".format(user_id))
    # Base filter: this user's prior (history) orders.
    mask = (orders.user_id == user_id) & (orders.eval_set == 1)
    if exclude_first_order:
        mask &= (orders.order_number > 1)
    return orders[mask].sort_values('order_number', ascending=True).order_id

def GetProductsForOrder(order_id, prior_orders=True):
    """Return the product rows for a single order.

    Reads from the module-level ``prior_prods`` table by default, or from
    ``last_prods`` when ``prior_orders`` is False.
    NOTE(review): neither table is defined in this notebook's visible
    cells — presumably loaded elsewhere; verify before use.
    """
    source = prior_prods if prior_orders else last_prods
    return source[source.order_id == order_id]
    
def GetProductsForAllOrdersByCust(user_id, unique_products_only=False, exclude_first_order=False):
    """Return all product rows across a customer's prior orders.

    :param user_id: customer whose order history to gather.
    :param unique_products_only: when True, return only the distinct
        product IDs (ndarray) instead of the full rows.
    :param exclude_first_order: forwarded to GetOrdersList.
    """
    order_ids = GetOrdersList(user_id, exclude_first_order)
    ordered = prior_prods[prior_prods.order_id.isin(order_ids)]
    if unique_products_only:
        return ordered.product_id.unique()
    return ordered

#----------------------------------------------------------------------
# Function to generate markdown output
# Ref: https://stackoverflow.com/a/32035217
def printmd(string):
    """Render a markdown-formatted string as rich notebook output."""
    rendered = Markdown(string)
    display(rendered)

Load Training Data

In [3]:
# Load the pre-split training data (orders plus their prior-order
# products) and the original product/aisle/department lookup tables.
# NOTE(review): paths are relative to the notebook's working directory.
orders = load_orders('data/split/sf_train_set_orders.csv')
orders_prods = load_orders_prods('data/split/sf_train_set_prior_order_products.csv')
products = load_products('data/original/products.csv')
aisles = load_aisles('data/original/aisles.csv')
depts = load_depts('data/original/departments.csv')
In [4]:
# List of orders in history.
# eval_set was encoded by load_orders as uint8 codes; code 1 marks the
# 'prior' (history) orders, anything else is the customer's final order.
prior_orders_only = orders[(1 == orders.eval_set)]
final_orders_only = orders[(1 != orders.eval_set)]

Meta Features - Mean Order Length and Mean Reorder Ratio Per Customer

In [5]:
# Compute mean order length per customer.

# The highest add_to_cart_order position in an order equals the number
# of items in that order.
orders_length = orders_prods.groupby('order_id').add_to_cart_order.max().reset_index()
orders_length.rename(columns={'add_to_cart_order': 'total_items_ordered'}, inplace=True)
# Attach user_id so order lengths can be aggregated per customer.
orders_length_merge = orders_length.merge(prior_orders_only[['order_id','user_id']], on='order_id')
orders_length_merge['order_id'] = orders_length_merge.order_id.astype(np.uint32)

# Mean items per order, rounded to the nearest whole item.
mean_order_length_per_customer = orders_length_merge.groupby('user_id').total_items_ordered.mean().round().reset_index()
mean_order_length_per_customer['user_id'] = mean_order_length_per_customer.user_id.astype(np.uint32)
mean_order_length_per_customer.rename(columns={'total_items_ordered': 'mean_order_length'}, inplace=True)
mean_order_length_per_customer['mean_order_length'] = mean_order_length_per_customer.mean_order_length.astype(np.uint16)

del orders_length_merge

# Compute mean reorder ratio per customer.

# For each order compute ratio of re-ordered items to total ordered items.
# (Both operands are indexed by order_id, so the division aligns per order.)
orders_reorder_ratio = orders_prods.groupby('order_id').reordered.sum() / orders_length.set_index('order_id').total_items_ordered
orders_reorder_ratio = orders_reorder_ratio.reset_index()

del orders_length

# Exclude first orders, since none of the products have been ordered yet, 
# and so reordered ratio would be zero, thus skewing the mean reorder ratio
# both overall and per user.
orders_reorder_ratio = orders_reorder_ratio.merge(prior_orders_only[prior_orders_only.order_number > 1], on='order_id')
# The unnamed ratio series came out of reset_index() as integer column 0.
orders_reorder_ratio.rename(columns={0: 'reorder_ratio'}, inplace=True)
orders_reorder_ratio['order_id'] = orders_reorder_ratio.order_id.astype(np.uint32)

mean_reorder_ratio_per_customer = orders_reorder_ratio.groupby('user_id').reorder_ratio.mean().reset_index()
mean_reorder_ratio_per_customer['user_id'] = mean_reorder_ratio_per_customer.user_id.astype(np.uint32)
mean_reorder_ratio_per_customer.rename(columns={'reorder_ratio': 'mean_reorder_ratio'}, inplace=True)
# float16 keeps roughly 3 decimal digits — adequate for a ratio feature.
mean_reorder_ratio_per_customer['mean_reorder_ratio'] = mean_reorder_ratio_per_customer.mean_reorder_ratio.astype(np.float16)

del orders_reorder_ratio
In [6]:
mean_order_length_per_customer.head()
Out[6]:
user_id mean_order_length
0 41217 8
1 41219 11
2 41220 4
3 41221 3
4 41222 12
In [7]:
mean_reorder_ratio_per_customer.head()
Out[7]:
user_id mean_reorder_ratio
0 41217 0.701172
1 41219 0.602539
2 41220 0.819336
3 41221 0.250000
4 41222 0.672363

Feature Engineering

Merge Prior Orders, Ordered Products, Aisles and Departments

In [8]:
# Flatten the history: one row per (order, product) carrying the order's
# metadata plus the product's aisle and department.
flat_order_prods = orders_prods.merge(prior_orders_only, on='order_id')
flat_order_prods = flat_order_prods.merge(products[['product_id','aisle_id','department_id']], on='product_id')
In [9]:
flat_order_prods.head()
Out[9]:
order_id product_id add_to_cart_order reordered user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order aisle_id department_id
0 2 33120 1 1 202279 1 3 5 9 8 86 16
1 26 33120 5 0 153404 1 2 0 16 7 86 16
2 327 33120 5 1 58707 1 21 6 9 8 86 16
3 537 33120 2 1 180135 1 15 2 8 3 86 16
4 582 33120 7 1 193223 1 6 2 19 10 86 16

Product Probabilities per Customer

In [10]:
# Per-(user, product) reorder counts, Laplace-smoothed with +1 so products
# that were never reordered still get a small non-zero probability.
product_probs_per_cust = flat_order_prods.groupby(['user_id','product_id']).reordered.sum() + 1
# Dividing the (user_id, product_id)-indexed series by the per-user totals
# aligns on user_id, giving each product's share of the user's smoothed
# reorder mass (shares sum to 1 per user).
product_probs_per_cust = product_probs_per_cust / product_probs_per_cust.reset_index().groupby(['user_id']).reordered.sum()
product_probs_per_cust = product_probs_per_cust.reset_index()
product_probs_per_cust['user_id'] = product_probs_per_cust.user_id.astype(np.uint32)
product_probs_per_cust['product_id'] = product_probs_per_cust.product_id.astype(np.uint32)
product_probs_per_cust.rename(columns={'reordered': 'prob'}, inplace=True)
product_probs_per_cust.head()
Out[10]:
user_id product_id prob
0 41217 1747 0.033333
1 41217 2338 0.008333
2 41217 3177 0.008333
3 41217 3919 0.008333
4 41217 3990 0.008333

Aisle Probabilities per Customer

In [11]:
# Same Laplace-smoothed probability construction as for products, but at
# aisle granularity: each aisle's share of the user's smoothed reorders.
aisle_probs_per_cust = flat_order_prods.groupby(['user_id','aisle_id']).reordered.sum() + 1
aisle_probs_per_cust = aisle_probs_per_cust / aisle_probs_per_cust.reset_index().groupby(['user_id']).reordered.sum()
aisle_probs_per_cust = aisle_probs_per_cust.reset_index()
aisle_probs_per_cust['user_id'] = aisle_probs_per_cust.user_id.astype(np.uint32)
aisle_probs_per_cust['aisle_id'] = aisle_probs_per_cust.aisle_id.astype(np.uint8)
aisle_probs_per_cust.rename(columns={'reordered': 'prob'}, inplace=True)
aisle_probs_per_cust.head()
Out[11]:
user_id aisle_id prob
0 41217 5 0.009346
1 41217 9 0.018692
2 41217 21 0.009346
3 41217 23 0.037383
4 41217 24 0.056075

Department Probabilities per Customer

In [12]:
# Same Laplace-smoothed probability construction again, at department
# granularity: each department's share of the user's smoothed reorders.
dept_probs_per_cust = flat_order_prods.groupby(['user_id','department_id']).reordered.sum() + 1
dept_probs_per_cust = dept_probs_per_cust / dept_probs_per_cust.reset_index().groupby(['user_id']).reordered.sum()
dept_probs_per_cust = dept_probs_per_cust.reset_index()
dept_probs_per_cust['user_id'] = dept_probs_per_cust.user_id.astype(np.uint32)
dept_probs_per_cust['department_id'] = dept_probs_per_cust.department_id.astype(np.uint8)
dept_probs_per_cust.rename(columns={'reordered': 'prob'}, inplace=True)
dept_probs_per_cust.head()
Out[12]:
user_id department_id prob
0 41217 1 0.044444
1 41217 3 0.144444
2 41217 4 0.066667
3 41217 7 0.177778
4 41217 9 0.044444

Days Since First Order (DSFO) per Order per Customer

In [13]:
DSFO_popc = prior_orders_only.copy()
# days since first order per order per customer
# add one since each users' first order has days_since_prior_order set to -1;
# that -1 sentinel is folded into every cumulative sum for the user, so the
# +1 cancels it on all rows (the first order ends up with DSFO 0).
DSFO_popc['DSFO'] = DSFO_popc.groupby(['user_id']).days_since_prior_order.cumsum() + 1
DSFO_popc['DSFO'] = DSFO_popc.DSFO.astype(np.uint16)
# Keep only order_id, user_id and the new DSFO column.
del DSFO_popc['eval_set']
del DSFO_popc['order_number']
del DSFO_popc['order_dow']
del DSFO_popc['order_hour_of_day']
del DSFO_popc['days_since_prior_order']
DSFO_popc.head()
Out[13]:
order_id user_id DSFO
0 59108 41217 0
1 1661069 41217 12
2 1543837 41217 41
3 1288419 41217 45
4 1031559 41217 53

Max Days Since First Order (DSFO) per Customer

In [14]:
# Span in days of each customer's prior-order history (day of the most
# recent prior order, counted from the first order).
max_DSFO_pc = DSFO_popc.groupby(['user_id']).DSFO.max().reset_index()
max_DSFO_pc.rename(columns={'DSFO': 'max_DSFO'}, inplace=True)
max_DSFO_pc.head()
Out[14]:
user_id max_DSFO
0 41217 182
1 41219 353
2 41220 185
3 41221 60
4 41222 109

Number of Orders per Customer

In [15]:
# The highest order_number among a customer's prior orders equals the
# customer's total number of prior orders.
orders_pc = prior_orders_only.groupby('user_id').order_number.max().reset_index()
orders_pc['user_id'] = orders_pc.user_id.astype(np.uint32)
orders_pc.rename(columns={'order_number': 'number_of_orders'}, inplace=True)
orders_pc.head()
Out[15]:
user_id number_of_orders
0 41217 16
1 41219 48
2 41220 21
3 41221 3
4 41222 12

Final Summary for Products Ordered per Customer - Training Data

In [16]:
# days since first order per product per order per customer
props_pppc = flat_order_prods[['order_id','product_id','aisle_id','department_id','reordered']].merge(DSFO_popc, on="order_id")

# aggregate to get properties for each product ordered for each customer:
# first/last day the product appeared, total reorders, and its
# aisle/department (max is a no-op there since both are constant per product)
props_pppc = props_pppc.groupby(['user_id','product_id']).agg({'DSFO': [min, max],
                                                               'reordered': sum,
                                                               'aisle_id': max,
                                                               'department_id': max})

# flatten hierarchical column index, e.g. ('DSFO', 'min') -> 'DSFO_min'
props_pppc = props_pppc.reset_index()
props_pppc.columns = ['_'.join(col).strip('_') for col in props_pppc.columns.values]

# add max_DSFO and total orders per customer
props_pppc = props_pppc.merge(max_DSFO_pc, on='user_id')
props_pppc = props_pppc.merge(orders_pc, on='user_id')

# change data types for space efficiency (merges widen keys to int64)
props_pppc['user_id'] = props_pppc.user_id.astype(np.uint32)
props_pppc['product_id'] = props_pppc.product_id.astype(np.uint32)

# rename columns to more proper name following aggregation
props_pppc.rename(columns={'aisle_id_max': 'aisle_id'}, inplace=True)
props_pppc.rename(columns={'department_id_max': 'department_id'}, inplace=True)

# add product probabilities per customer
props_pppc = props_pppc.merge(product_probs_per_cust, on=['user_id','product_id'])
props_pppc.rename(columns={'prob': 'prod_prob'}, inplace=True)

# add aisle probabilities per customer
props_pppc = props_pppc.merge(aisle_probs_per_cust, on=['user_id','aisle_id'])
props_pppc.rename(columns={'prob': 'aisle_prob'}, inplace=True)

# add department probabilities per customer
props_pppc = props_pppc.merge(dept_probs_per_cust, on=['user_id','department_id'])
props_pppc.rename(columns={'prob': 'department_prob'}, inplace=True)

# add days since last order for the customer's final order
props_pppc = props_pppc.merge(final_orders_only[['user_id','days_since_prior_order']], on="user_id")
props_pppc.rename(columns={'days_since_prior_order': 'last_order_DSLO'}, inplace=True)

# compute reorder and recency probability along with the mean days to order each product.
# reorder_prob: Laplace-smoothed (+1) fraction of the customer's orders
# containing this product.
props_pppc['reorder_prob'] = (props_pppc['reordered_sum'] + 1) / props_pppc['number_of_orders']
# Guard on last_order_DSLO > 0: when the final order's days_since_prior_order
# is the -1 missing sentinel (or 0), fall back to a recency of 0 instead of
# computing a skewed ratio.
# NOTE(review): an earlier comment here described a DSFO_max > 0 check, but
# the code actually tests last_order_DSLO > 0.
props_pppc['recency_prob'] = (props_pppc['DSFO_max'] / (props_pppc['max_DSFO'] + props_pppc['last_order_DSLO'])).where(props_pppc['last_order_DSLO'] > 0, 0)

# DTOP - days to order product: mean gap in days between orders of this
# product; when only ever seen on a single day, fall back to that day's DSFO.
props_pppc['mean_DTOP'] = ((props_pppc['DSFO_max'] - props_pppc['DSFO_min']) / props_pppc['reordered_sum']).where(props_pppc['DSFO_max'] > props_pppc['DSFO_min'], props_pppc['DSFO_max'])
props_pppc['mean_DTOP'] = props_pppc.mean_DTOP.astype(np.float16)

# compute DTOP probability: days elapsed since the product was last ordered,
# relative to its mean reorder interval
props_pppc['DTOP_prob'] = ((props_pppc['max_DSFO'] + props_pppc['last_order_DSLO']) - props_pppc['DSFO_max']) / props_pppc['mean_DTOP']
# replace NaNs with zero probability (a mean_DTOP of 0 produces NaN above)
props_pppc['DTOP_prob'].fillna(0, inplace=True)
# limit maximum value to just under 1 since it is treated as a probability
props_pppc['DTOP_prob'] = props_pppc['DTOP_prob'].clip(lower=0, upper=0.9999999)

# change all float64 fields to float16
props_pppc['prod_prob'] = props_pppc.prod_prob.astype(np.float16)
props_pppc['aisle_prob'] = props_pppc.aisle_prob.astype(np.float16)
props_pppc['department_prob'] = props_pppc.department_prob.astype(np.float16)
props_pppc['reorder_prob'] = props_pppc.reorder_prob.astype(np.float16)
props_pppc['recency_prob'] = props_pppc.recency_prob.astype(np.float16)
props_pppc['DTOP_prob'] = props_pppc.DTOP_prob.astype(np.float16)

# add reordered column - setting all to zero - binary field
# (positive labels are filled in later from the final-order products)
props_pppc['reordered'] = 0
props_pppc['reordered'] = props_pppc.reordered.astype(np.uint8)
In [17]:
# drop the intermediate columns we no longer need — only the user/product
# keys, the engineered probability features, and the label remain
del props_pppc['DSFO_min']
del props_pppc['DSFO_max']
del props_pppc['reordered_sum']
del props_pppc['aisle_id']
del props_pppc['department_id']
del props_pppc['max_DSFO']
del props_pppc['number_of_orders']
del props_pppc['last_order_DSLO']
del props_pppc['mean_DTOP']

props_pppc.head()
Out[17]:
user_id product_id prod_prob aisle_prob department_prob reorder_prob recency_prob DTOP_prob reordered
0 41217 1747 0.033325 0.037384 0.077759 0.2500 0.918945 0.360107 0
1 41217 40885 0.008331 0.037384 0.077759 0.0625 0.243286 1.000000 0
2 41217 42416 0.008331 0.037384 0.077759 0.0625 0.243286 1.000000 0
3 41217 47357 0.033325 0.037384 0.077759 0.2500 0.605469 1.000000 0
4 41217 2338 0.008331 0.009346 0.077759 0.0625 0.243286 1.000000 0

Load Last Order Products to Mark Reordered Products per Customer

In [18]:
# Load the held-out final-order products, keep only the rows that were
# actual reorders, and attach user_id via the final-orders table.
last_reordered_prods = load_orders_prods('data/split/sf_train_set_last_order_products.csv')
last_reordered_prods = last_reordered_prods[last_reordered_prods.reordered == 1][['order_id','product_id','reordered']]
last_reordered_prods = final_orders_only[['order_id','user_id']].merge(last_reordered_prods, on="order_id")
last_reordered_prods.head()
Out[18]:
order_id user_id product_id reordered
0 3333539 41217 39108 1
1 3333539 41217 5077 1
2 3333539 41217 42252 1
3 3333539 41217 44475 1
4 3333539 41217 24852 1
In [19]:
# Label the positives: set reordered = 1 for the (user, product) pairs that
# actually appear among the final-order reorders, via index membership.
keys = ['user_id','product_id']
i1 = props_pppc.set_index(keys).index
i2 = last_reordered_prods.set_index(keys).index
props_pppc.loc[i1.isin(i2), 'reordered'] = 1
props_pppc.head()
Out[19]:
user_id product_id prod_prob aisle_prob department_prob reorder_prob recency_prob DTOP_prob reordered
0 41217 1747 0.033325 0.037384 0.077759 0.2500 0.918945 0.360107 0
1 41217 40885 0.008331 0.037384 0.077759 0.0625 0.243286 1.000000 0
2 41217 42416 0.008331 0.037384 0.077759 0.0625 0.243286 1.000000 0
3 41217 47357 0.033325 0.037384 0.077759 0.2500 0.605469 1.000000 0
4 41217 2338 0.008331 0.009346 0.077759 0.0625 0.243286 1.000000 0
In [20]:
props_pppc.to_csv('training.csv')