数据挖掘实战
零售分析实战
1. 客户分群分析
使用RFM模型对客户进行分群,实现精准营销。
# RFM客户分群
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Load customer order data from disk.
def load_customer_data(file_path):
    """Read the customer CSV at `file_path` into a DataFrame."""
    return pd.read_csv(file_path)
# Compute RFM (Recency, Frequency, Monetary) metrics per customer.
def calculate_rfm(df):
    """Return a per-customer RFM DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Order-level rows with columns 'customer_id', 'order_id',
        'amount' and 'last_purchase' (parseable as datetimes).

    Returns
    -------
    pandas.DataFrame
        Indexed by customer_id with columns 'recency' (days since the
        customer's most recent purchase, relative to "today"),
        'frequency' (order count) and 'monetary' (total spend).
    """
    # Days elapsed since each row's purchase date.
    # FIX: computed as a local Series instead of assigning a 'recency'
    # column into `df`, so the caller's DataFrame is no longer mutated.
    recency_days = (
        pd.to_datetime('today') - pd.to_datetime(df['last_purchase'])
    ).dt.days
    grouped = df.groupby('customer_id')
    rfm = pd.DataFrame({
        # min over per-row day counts == days since the latest purchase
        'recency': recency_days.groupby(df['customer_id']).min(),
        'frequency': grouped['order_id'].count(),
        'monetary': grouped['amount'].sum(),
    })
    return rfm
# Customer segmentation via K-means on standardized RFM features.
def customer_segmentation(rfm, n_clusters=4):
    """Cluster customers into `n_clusters` groups on their RFM values.

    Parameters
    ----------
    rfm : pandas.DataFrame
        Per-customer 'recency', 'frequency' and 'monetary' columns.
    n_clusters : int, default 4
        Number of K-means clusters.

    Returns
    -------
    pandas.DataFrame
        Copy of `rfm` with an added integer 'cluster' label column.
    """
    # Standardize so each feature contributes on a comparable scale.
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm)
    # FIX: n_init pinned explicitly — the sklearn default changed from
    # 10 to 'auto' in 1.4, which would silently alter cluster labels
    # across library versions despite the fixed random_state.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    # FIX: label a copy so the caller's frame is not mutated in place
    # (callers already rebind the return value, so this is compatible).
    labeled = rfm.copy()
    labeled['cluster'] = kmeans.fit_predict(rfm_scaled)
    return labeled
# Summarize the characteristics of each customer segment.
def analyze_segments(rfm):
    """Return (per-cluster mean RFM values, per-cluster customer counts)."""
    # Average recency/frequency/monetary within every cluster.
    segment_analysis = (
        rfm.groupby('cluster')[['recency', 'frequency', 'monetary']].mean()
    )
    # Number of customers assigned to each cluster.
    segment_size = rfm['cluster'].value_counts()
    return segment_analysis, segment_size
# Visualize the clustering result as a bubble scatter plot.
def plot_segments(rfm):
    """Plot recency vs. frequency, colored by cluster, sized by spend.

    Parameters
    ----------
    rfm : pandas.DataFrame
        Must contain 'recency', 'frequency', 'monetary' and 'cluster'.
    """
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        rfm['recency'],
        rfm['frequency'],
        c=rfm['cluster'],
        cmap='viridis',
        s=rfm['monetary'] / 100,  # bubble area scales with total spend
    )
    plt.xlabel('最近购买时间')
    plt.ylabel('购买频率')
    plt.title('客户分群结果')
    # BUG FIX: the pyplot module has no add_artist(); that method lives
    # on the Axes object. The original `plt.add_artist(legend1)` raised
    # AttributeError at runtime.
    legend1 = plt.legend(*scatter.legend_elements(), title="客户群")
    plt.gca().add_artist(legend1)
    plt.show()
# 应用示例
def retail_analysis(file_path):
# 加载数据
df = load_customer_data(file_path)
# 计算RFM指标
rfm = calculate_rfm(df)
# 客户分群
rfm = customer_segmentation(rfm)
# 分析结果
segment_analysis, segment_size = analyze_segments(rfm)
# 可视化
plot_segments(rfm)
return segment_analysis, segment_size2. 商品关联分析
使用Apriori算法挖掘商品之间的关联规则。
# 商品关联分析
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import matplotlib.pyplot as plt
# Load transaction line-item data from disk.
def load_transaction_data(file_path):
    """Read the transaction CSV at `file_path` into a DataFrame."""
    return pd.read_csv(file_path)
# Build a one-hot basket matrix from transaction rows.
def preprocess_data(df):
    """Pivot transactions into a transaction × product basket matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with 'transaction_id', 'product_id' and 'quantity'.

    Returns
    -------
    pandas.DataFrame
        One row per transaction, one column per product; True where the
        product appears at least once in the transaction.
    """
    # Sum quantities per (transaction, product); missing pairs become 0.
    basket = df.pivot_table(
        index='transaction_id',
        columns='product_id',
        values='quantity',
        aggfunc='sum',
        fill_value=0,
    )
    # FIX: encode presence as booleans rather than 0/1 ints — mlxtend's
    # apriori deprecates (and will reject) non-boolean input frames, and
    # booleans are equivalent in the downstream support computation.
    return basket > 0
# Mine association rules from a one-hot basket matrix.
def mine_association_rules(basket, min_support=0.01, min_confidence=0.5):
    """Run Apriori on `basket`, then derive confidence-filtered rules.

    Returns the mlxtend association-rules DataFrame (antecedents,
    consequents, support, confidence, lift, ...).
    """
    # Frequent itemsets above the support floor, labeled by column name.
    itemsets = apriori(basket, min_support=min_support, use_colnames=True)
    # Keep only rules whose confidence clears the threshold.
    return association_rules(
        itemsets, metric='confidence', min_threshold=min_confidence
    )
# Visualize mined rules in support/confidence space.
def plot_rules(rules):
    """Scatter rules by support (x) and confidence (y); size ~ lift."""
    plt.figure(figsize=(10, 6))
    plt.scatter(
        rules['support'],
        rules['confidence'],
        alpha=0.5,
        s=rules['lift'] * 100,
    )
    plt.xlabel('支持度')
    plt.ylabel('置信度')
    plt.title('关联规则分布')
    # Label every point with its "antecedent -> consequent" itemsets.
    for _, rule in rules.iterrows():
        plt.annotate(
            f"{rule['antecedents']} -> {rule['consequents']}",
            (rule['support'], rule['confidence']),
            xytext=(5, 5),
            textcoords='offset points',
        )
    plt.show()
# Turn positively-correlated rules into marketing recommendations.
def generate_recommendations(rules):
    """Return a list of recommendation dicts for rules with lift > 1.

    A lift above 1 means the antecedent and consequent co-occur more
    often than independence would predict.
    """
    return [
        {
            'antecedent': rule['antecedents'],
            'consequent': rule['consequents'],
            'confidence': rule['confidence'],
            'lift': rule['lift'],
        }
        for _, rule in rules.iterrows()
        if rule['lift'] > 1
    ]
# Example application: end-to-end product association pipeline.
def product_association_analysis(file_path):
    """Mine, plot and summarize product association rules for a CSV.

    Returns the recommendation list from generate_recommendations().
    """
    # Load raw transaction rows
    transactions = load_transaction_data(file_path)
    # Pivot into a one-hot basket matrix
    basket = preprocess_data(transactions)
    # Mine confidence-filtered association rules
    rules = mine_association_rules(basket)
    # Visualize rule distribution
    plot_rules(rules)
    # Convert high-lift rules into actionable suggestions
    return generate_recommendations(rules)