本章内容包括:用户消费行为RFM分层模型
//input1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Columns of the order log, in file order:
#   user_id        - user ID
#   order_dt       - purchase date
#   order_products - number of products purchased
#   order_amount   - purchase amount
columns = ['user_id', 'order_dt', 'order_products', 'order_amount']

# Column names are supplied explicitly and fields are split on runs of
# whitespace.  The separator is a raw string so that `\s` reaches the regex
# engine unchanged instead of being treated as an (invalid) string escape.
df = pd.read_table('CDNOW_master.txt', names=columns, sep=r'\s+')

# Dates are stored as YYYYMMDD integers; parse them into real datetimes.
df['order_dt'] = pd.to_datetime(df.order_dt, format='%Y%m%d')
# Truncate each order date to the first day of its month.
df['month'] = df.order_dt.values.astype('datetime64[M]')
df.info()
//output1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69659 entries, 0 to 69658
Data columns (total 5 columns):
user_id 69659 non-null int64
order_dt 69659 non-null datetime64[ns]
order_products 69659 non-null int64
order_amount 69659 non-null float64
month 69659 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(2)
memory usage: 2.7 MB
//input2
# Show the first five order rows as a quick sanity check of the parsed data.
df.head(n=5)
//output2
user_id order_dt order_products order_amount month
0 1 1997-01-01 1 11.77 1997-01-01
1 2 1997-01-12 1 12.00 1997-01-01
2 2 1997-01-12 5 77.00 1997-01-01
3 3 1997-01-02 2 20.76 1997-01-01
4 3 1997-03-30 2 20.76 1997-03-01
//input3
# Group the order log by user so per-user aggregates can be computed.
grouped_user = df.groupby('user_id')
# Summarise each user's total products and total spend.  Only the numeric
# columns are summed: the frame also holds datetime columns (order_dt, month),
# which cannot be summed and raise a TypeError on recent pandas versions.
grouped_user[['order_products', 'order_amount']].sum().describe()
//output3
order_products order_amount
count 23570.000000 23570.000000
mean 7.122656 106.080426
std 16.983531 240.925195
min 1.000000 0.000000
25% 1.000000 19.970000
50% 3.000000 43.395000
75% 7.000000 106.475000
max 1033.000000 13990.930000
//input4
# Collapse the order log to one row per user: the most recent order date,
# the total amount spent and the total number of products bought.
per_user_aggs = {
    'order_products': 'sum',
    'order_amount': 'sum',
    'order_dt': 'max',
}
rfm = df.pivot_table(
    index='user_id',
    values=list(per_user_aggs),
    aggfunc=per_user_aggs,
)
rfm.head()
//output4
order_amount order_dt order_products
user_id
1 11.77 1997-01-01 1
2 89.00 1997-01-12 6
3 156.46 1998-05-28 16
4 100.50 1997-12-12 7
5 385.61 1998-01-03 29
//input5
# Preview R, F and M after subtracting each column's mean.
# NOTE: the R, F and M columns are created by the statements that assign
# rfm['R'] and rename order_products/order_amount; those must have run first.
rfm_values = rfm[['R', 'F', 'M']]
(rfm_values - rfm_values.mean()).head()
//output5
R F M
user_id
1 177.778362 -6.122656 -94.310426
2 166.778362 -1.122656 -17.080426
3 -334.221638 8.877344 50.379574
4 -167.221638 -0.122656 -5.580426
5 -189.221638 21.877344 279.529574
//input6
# R: number of days between each user's last order and the latest order date
# found anywhere in the data set.
latest_order = rfm.order_dt.max()
rfm['R'] = (latest_order - rfm.order_dt) / np.timedelta64(1, 'D')
# F: total products bought; M: total amount spent.
rfm.rename(columns={'order_products': 'F', 'order_amount': 'M'}, inplace=True)
# Maps the three-character R/F/M flag string (in that order) to a segment name.
# Built once at module level so it is not recreated for every row.
_RFM_SEGMENTS = {
    '111': '重要价值客户',
    '011': '重要保持客户',
    '101': '重要发展客户',
    '001': '重要挽留客户',
    '110': '一般价值客户',
    '010': '一般保持客户',
    '100': '一般发展客户',
    '000': '一般挽留客户',
}


def rfm_func(x):
    """Return the customer-segment label for one row of centred RFM values.

    Args:
        x: A pandas Series (or any mapping-like row) with entries ``R``, ``F``
            and ``M`` holding each metric minus that metric's mean.

    Returns:
        The segment name looked up from the three flags, where a flag is
        ``'1'`` when the centred value is at or above zero (i.e. the raw value
        is at or above the mean) and ``'0'`` otherwise.
    """
    # The inputs are already mean-centred, so the "at or above average" cut-off
    # is 0.  Comparing against 1 would misclassify values that sit between the
    # mean and mean + 1 as below average.
    label = ''.join('1' if x[metric] >= 0 else '0' for metric in ('R', 'F', 'M'))
    return _RFM_SEGMENTS[label]
# Centre each metric on its mean, then classify every user row by row.
centered = rfm[['R', 'F', 'M']].apply(lambda col: col - col.mean())
rfm['label'] = centered.apply(rfm_func, axis=1)
# Highlight the '重要价值客户' segment in green and everyone else in red.
is_top_segment = rfm.label == '重要价值客户'
rfm['color'] = is_top_segment.map({True: 'g', False: 'r'})
rfm.plot.scatter('F', 'R', c=rfm.color)
//output6

//input7
# Total M, F and R per segment.  The metric columns are selected explicitly
# because rfm also contains order_dt (datetime) and color (string), which are
# not meaningful to sum and make a blanket sum() fail on recent pandas versions.
rfm.groupby('label')[['M', 'F', 'R']].sum()
//output7
M F R
label
一般价值客户 1767.11 182 8512.0
一般保持客户 5100.77 492 7782.0
一般发展客户 445233.28 29915 6983699.0
一般挽留客户 215075.77 15428 621894.0
重要价值客户 147180.09 9849 286676.0
重要保持客户 1555586.51 105509 476502.0
重要发展客户 49905.80 2322 174340.0
重要挽留客户 80466.30 4184 96009.0
//input8
# Count the non-null entries of every column within each segment, which gives
# the number of users per segment.
label_groups = rfm.groupby('label')
label_groups.count()
//output8
M order_dt F R color
label
一般价值客户 18 18 18 18 18
一般保持客户 53 53 53 53 53
一般发展客户 14138 14138 14138 14138 14138
一般挽留客户 3493 3493 3493 3493 3493
重要价值客户 631 631 631 631 631
重要保持客户 4267 4267 4267 4267 4267
重要发展客户 371 371 371 371 371
重要挽留客户 599 599 599 599 599
网友评论