Python教程(四十五):推荐系统-个性化推荐算法

360影视 欧美动漫 2025-08-01 21:06 1

摘要:推荐系统的主要应用领域包括电商平台(商品推荐)、视频网站(内容推荐)、音乐平台(歌曲推荐)、社交媒体(好友推荐)、新闻网站(文章推荐)和游戏平台(游戏推荐)。环境准备:pip install pandas numpy matplotlib seaborn scikit-learn

协同过滤:基于用户或物品的相似性

基于内容:基于物品特征和用户偏好

混合推荐:结合多种推荐方法

# Main application areas of recommender systems:
# - E-commerce platforms: product recommendation
# - Video sites: content recommendation
# - Music platforms: song recommendation
# - Social media: friend recommendation
# - News sites: article recommendation
# - Gaming platforms: game recommendation
#
# Install the dependencies first (shell command, not Python):
#   pip install pandas numpy matplotlib seaborn scikit-learn scipy surprise

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF, PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings('ignore')

# Apply the Seaborn style FIRST: set_style() rewrites 'font.sans-serif',
# so configuring the Chinese font before it would be silently undone.
sns.set_style("whitegrid")

# Font able to render the Chinese plot labels, and a proper minus sign.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

print("推荐系统环境设置完成")


def create_recommendation_data():
    """Create the synthetic data set used by all examples below.

    Generates 100 users and 50 items with a fixed random seed. Each user
    rates 10-20 distinct items with an integer rating from 1 to 5. Also plots
    four overview charts of the generated data.

    Returns:
        dict with keys:
            'ratings_df': long-format DataFrame (user_id, item_id, rating).
            'items_df': item attributes (item_id, name, category, genre,
                price, description).
            'users_df': user attributes (user_id, age, gender, preference).
            'rating_matrix': users x items DataFrame; 0 means "not rated".
    """
    np.random.seed(42)
    print("=== 创建推荐系统数据 ===")

    # 1. User-item ratings
    n_users = 100
    n_items = 50

    ratings_data = []
    for user_id in range(n_users):
        # Every user rates 10-20 distinct items
        n_ratings = np.random.randint(10, 21)
        item_ids = np.random.choice(n_items, n_ratings, replace=False)
        for item_id in item_ids:
            # Integer rating in 1..5
            rating = np.random.randint(1, 6)
            ratings_data.append({
                'user_id': user_id,
                'item_id': item_id,
                'rating': rating,
            })
    ratings_df = pd.DataFrame(ratings_data)

    # 2. Item attributes
    categories = ['电影', '音乐', '书籍', '游戏', '电子产品']
    genres = ['动作', '喜剧', '科幻', '恐怖', '爱情', '纪录片', '动画']
    items_data = []
    for item_id in range(n_items):
        category = np.random.choice(categories)
        genre = np.random.choice(genres)
        price = np.random.randint(10, 1000)
        items_data.append({
            'item_id': item_id,
            'name': f'物品{item_id}',
            'category': category,
            'genre': genre,
            'price': price,
            'description': f'这是一个{category}类别的{genre}作品,价格{price}元',
        })
    items_df = pd.DataFrame(items_data)

    # 3. User attributes
    ages = np.random.randint(18, 65, n_users)
    genders = np.random.choice(['男', '女'], n_users)
    users_data = []
    for user_id in range(n_users):
        users_data.append({
            'user_id': user_id,
            'age': ages[user_id],
            'gender': genders[user_id],
            'preference': np.random.choice(categories),
        })
    users_df = pd.DataFrame(users_data)

    print(f"用户数量: {n_users}")
    print(f"物品数量: {n_items}")
    print(f"评分数量: {len(ratings_df)}")
    # NOTE: the printed value is the share of FILLED cells of the matrix.
    print(f"稀疏度: {len(ratings_df) / (n_users * n_items) * 100:.2f}%")

    # 4. Rating matrix. Reindex so that every user and item has a row/column
    # even if it happens to have no rating; label == position afterwards.
    rating_matrix = ratings_df.pivot(index='user_id', columns='item_id', values='rating')
    rating_matrix = rating_matrix.reindex(index=range(n_users), columns=range(n_items))
    rating_matrix = rating_matrix.fillna(0)

    # Overview plots
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Rating distribution
    axes[0, 0].hist(ratings_df['rating'], bins=5, alpha=0.7, edgecolor='black')
    axes[0, 0].set_title('评分分布')
    axes[0, 0].set_xlabel('评分')
    axes[0, 0].set_ylabel('频次')
    axes[0, 0].grid(True, alpha=0.3)

    # Number of ratings per user
    user_rating_counts = ratings_df.groupby('user_id').size()
    axes[0, 1].hist(user_rating_counts, bins=20, alpha=0.7, edgecolor='black')
    axes[0, 1].set_title('用户评分数量分布')
    axes[0, 1].set_xlabel('评分数量')
    axes[0, 1].set_ylabel('用户数量')
    axes[0, 1].grid(True, alpha=0.3)

    # Number of ratings per item
    item_rating_counts = ratings_df.groupby('item_id').size()
    axes[1, 0].hist(item_rating_counts, bins=20, alpha=0.7, edgecolor='black')
    axes[1, 0].set_title('物品评分数量分布')
    axes[1, 0].set_xlabel('评分数量')
    axes[1, 0].set_ylabel('物品数量')
    axes[1, 0].grid(True, alpha=0.3)

    # Item category shares
    category_counts = items_df['category'].value_counts()
    axes[1, 1].pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
    axes[1, 1].set_title('物品类别分布')

    plt.tight_layout()
    plt.show()

    return {
        'ratings_df': ratings_df,
        'items_df': items_df,
        'users_df': users_df,
        'rating_matrix': rating_matrix,
    }


# Run the data-creation example
rec_data = create_recommendation_data()


def _user_cf_predict(rating_matrix, user_similarity_df, user_id,
                     n_recommendations=5, held_out=()):
    """Predict ratings for a user's unrated items with user-based CF.

    Args:
        rating_matrix: users x items DataFrame, 0 meaning "not rated".
        user_similarity_df: user x user similarity DataFrame with the same
            labels as ``rating_matrix.index``.
        user_id: label of the target user.
        n_recommendations: maximum number of items to return.
        held_out: item labels to treat as unrated for the target user even
            though the matrix contains a rating (used for evaluation).

    Returns:
        List of ``(item_id, predicted_rating)`` sorted by predicted rating,
        best first. Empty if the user is unknown or nothing can be predicted.
    """
    if user_id not in rating_matrix.index:
        return []

    user_ratings = rating_matrix.loc[user_id]
    held_out = set(held_out)
    unrated_items = [
        item_id for item_id, rating in user_ratings.items()
        if rating == 0 or item_id in held_out
    ]
    if not unrated_items:
        return []

    # Ten most similar OTHER users; only positive similarities contribute.
    similar_users = (
        user_similarity_df.loc[user_id]
        .drop(user_id)
        .sort_values(ascending=False)
        .head(10)
    )
    similar_users = similar_users[similar_users > 0]

    predictions = {}
    for item_id in unrated_items:
        neighbour_ratings = rating_matrix.loc[similar_users.index, item_id]
        has_rating = neighbour_ratings > 0  # ignore neighbours without a rating
        weight_sum = similar_users[has_rating].sum()
        if weight_sum > 0:
            weighted = (similar_users[has_rating] * neighbour_ratings[has_rating]).sum()
            predictions[item_id] = weighted / weight_sum

    ranked = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
    return ranked[:n_recommendations]


def user_based_collaborative_filtering():
    """Demonstrate user-based collaborative filtering on ``rec_data``.

    Computes cosine similarity between users, recommends items for user 0,
    plots a similarity heat map and runs a small hold-out evaluation.

    Returns:
        dict with 'user_similarity' (DataFrame), 'recommendations' (list of
        ``(item_id, predicted_rating)``) and 'evaluation_results' (list of
        ``(precision, recall)`` tuples).
    """
    rating_matrix = rec_data['rating_matrix']
    items_df = rec_data['items_df']
    print("=== 基于用户的协同过滤 ===")

    # 1. User similarity matrix
    user_similarity = cosine_similarity(rating_matrix)
    user_similarity_df = pd.DataFrame(
        user_similarity, index=rating_matrix.index, columns=rating_matrix.index
    )
    print(f"用户相似度矩阵形状: {user_similarity_df.shape}")

    # 2. Recommend items for a user
    def recommend_for_user(user_id, n_recommendations=5, held_out=()):
        """Recommend items for ``user_id``; see ``_user_cf_predict``."""
        return _user_cf_predict(
            rating_matrix, user_similarity_df, user_id, n_recommendations, held_out
        )

    # 3. Recommend for an example user
    example_user = 0
    recommendations = recommend_for_user(example_user, n_recommendations=10)
    print(f"\n为用户{example_user}的推荐结果:")
    for item_id, predicted_rating in recommendations:
        item_name = items_df.loc[item_id, 'name']
        category = items_df.loc[item_id, 'category']
        print(f"物品{item_id} ({item_name}, {category}): 预测评分 {predicted_rating:.2f}")

    # 4. Visualise user similarity
    plt.figure(figsize=(10, 8))
    sns.heatmap(user_similarity_df.iloc[:20, :20], cmap='coolwarm', center=0)
    plt.title('用户相似度热力图 (前20个用户)')
    plt.xlabel('用户ID')
    plt.ylabel('用户ID')
    plt.show()

    # 5. Evaluate recommendation quality
    def evaluate_recommendations(user_id, n_recommendations=5):
        """Return ``(precision, recall)`` for one user, or None if skipped.

        NOTE(review): the middle of this function was lost in the source
        text. The hold-out step below is a reconstruction: ~20% of the
        user's rated items are hidden and counted as hits when recommended.
        The similarity matrix is NOT recomputed without them, so the scores
        are slightly optimistic.
        """
        user_ratings = rating_matrix.loc[user_id]
        rated_items = user_ratings[user_ratings > 0]
        if len(rated_items) < 5:
            return None

        n_test = max(1, len(rated_items) // 5)
        test_items = set(rated_items.sample(n=n_test, random_state=42).index)

        recs = recommend_for_user(user_id, n_recommendations, held_out=test_items)
        recommended_items = {item_id for item_id, _ in recs}

        hits = len(test_items & recommended_items)
        precision = hits / len(recommended_items) if len(recommended_items) > 0 else 0
        recall = hits / len(test_items) if len(test_items) > 0 else 0
        return precision, recall

    # Evaluate several users
    evaluation_results = []
    for user_id in range(min(20, len(rating_matrix))):
        result = evaluate_recommendations(user_id)
        if result is not None:
            evaluation_results.append(result)

    if evaluation_results:
        avg_precision = np.mean([r[0] for r in evaluation_results])
        avg_recall = np.mean([r[1] for r in evaluation_results])
        print(f"\n推荐质量评估 (基于{len(evaluation_results)}个用户):")
        print(f"平均精确率: {avg_precision:.3f}")
        print(f"平均召回率: {avg_recall:.3f}")

    return {
        'user_similarity': user_similarity_df,
        'recommendations': recommendations,
        'evaluation_results': evaluation_results,
    }


# Run the user-based collaborative filtering example
user_cf_results = user_based_collaborative_filtering()


def item_based_collaborative_filtering():
    """Demonstrate item-based collaborative filtering on ``rec_data``.

    Computes cosine similarity between items (columns of the rating matrix),
    recommends items for user 0, plots a heat map and lists the items most
    similar to item 0.

    Returns:
        dict with 'item_similarity' (DataFrame), 'recommendations' (list of
        ``(item_id, predicted_rating)``) and 'similar_items' (Series of
        similarities indexed by item id).
    """
    rating_matrix = rec_data['rating_matrix']
    items_df = rec_data['items_df']
    print("=== 基于物品的协同过滤 ===")

    # 1. Item similarity matrix
    item_similarity = cosine_similarity(rating_matrix.T)
    item_similarity_df = pd.DataFrame(
        item_similarity, index=rating_matrix.columns, columns=rating_matrix.columns
    )
    print(f"物品相似度矩阵形状: {item_similarity_df.shape}")

    # 2. Recommend items for a user
    def recommend_for_user_item_based(user_id, n_recommendations=5):
        """Return ``(item_id, predicted_rating)`` pairs, best first."""
        if user_id not in rating_matrix.index:
            return []

        user_ratings = rating_matrix.loc[user_id]
        unrated_items = user_ratings[user_ratings == 0].index
        if len(unrated_items) == 0:
            return []

        # Ratings the user has given; loop-invariant, so computed once.
        rated_items = user_ratings[user_ratings > 0]

        predictions = {}
        for item_id in unrated_items:
            similarities = item_similarity_df.loc[item_id, rated_items.index]
            positive = similarities > 0  # only positive similarities count
            weight_sum = similarities[positive].sum()
            if weight_sum > 0:
                weighted = (similarities[positive] * rated_items[positive]).sum()
                predictions[item_id] = weighted / weight_sum

        ranked = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
        return ranked[:n_recommendations]

    # 3. Recommend for an example user
    example_user = 0
    recommendations = recommend_for_user_item_based(example_user, n_recommendations=10)
    print(f"\n基于物品的协同过滤为用户{example_user}的推荐结果:")
    for item_id, predicted_rating in recommendations:
        item_name = items_df.loc[item_id, 'name']
        category = items_df.loc[item_id, 'category']
        print(f"物品{item_id} ({item_name}, {category}): 预测评分 {predicted_rating:.2f}")

    # 4. Visualise item similarity
    plt.figure(figsize=(10, 8))
    sns.heatmap(item_similarity_df.iloc[:20, :20], cmap='coolwarm', center=0)
    plt.title('物品相似度热力图 (前20个物品)')
    plt.xlabel('物品ID')
    plt.ylabel('物品ID')
    plt.show()

    # 5. Most similar items
    def find_similar_items(item_id, n_similar=5):
        """Return the ``n_similar`` items most similar to ``item_id``."""
        if item_id not in item_similarity_df.index:
            return pd.Series(dtype=float)
        return (
            item_similarity_df.loc[item_id]
            .drop(item_id)  # exclude the item itself explicitly
            .sort_values(ascending=False)
            .head(n_similar)
        )

    example_item = 0
    similar_items = find_similar_items(example_item, n_similar=5)
    print(f"\n与物品{example_item}最相似的物品:")
    for item_id, similarity in similar_items.items():
        item_name = items_df.loc[item_id, 'name']
        category = items_df.loc[item_id, 'category']
        print(f"物品{item_id} ({item_name}, {category}): 相似度 {similarity:.3f}")

    return {
        'item_similarity': item_similarity_df,
        'recommendations': recommendations,
        'similar_items': similar_items,
    }


# Run the item-based collaborative filtering example
item_cf_results = item_based_collaborative_filtering()


def content_based_recommendation():
    """Demonstrate content-based recommendation on ``rec_data``.

    Builds TF-IDF features from the item descriptions, derives a
    rating-weighted profile per user and scores unrated items by the dot
    product of profile and item features. Also plots a 2-D PCA projection of
    the item features.

    Returns:
        dict with 'item_features' (sparse TF-IDF matrix),
        'item_content_similarity' (DataFrame), 'recommendations' (list of
        ``(item_id, score)``) and 'pca_features' (n_items x 2 array).
    """
    items_df = rec_data['items_df']
    ratings_df = rec_data['ratings_df']
    print("=== 基于内容的推荐 ===")

    # 1. Item features: TF-IDF over the text descriptions
    tfidf = TfidfVectorizer(max_features=100, stop_words=None)
    item_features = tfidf.fit_transform(items_df['description'])
    # Row i belongs to the i-th row of items_df, i.e. to item_id == i.
    dense_features = item_features.toarray()
    print(f"物品特征矩阵形状: {item_features.shape}")

    # 2. Item-item content similarity
    item_content_similarity = cosine_similarity(item_features)
    item_content_similarity_df = pd.DataFrame(
        item_content_similarity, index=items_df.index, columns=items_df.index
    )

    # 3. User preference model
    def build_user_profile(user_id):
        """Return the user's normalised feature profile, or None if no ratings."""
        user_ratings = ratings_df[ratings_df['user_id'] == user_id]
        if len(user_ratings) == 0:
            return None

        # Rating-weighted sum of the feature vectors of the rated items
        rated_ids = user_ratings['item_id'].to_numpy()
        weights = user_ratings['rating'].to_numpy(dtype=float)
        user_profile = weights @ dense_features[rated_ids]

        # Normalise so the profile sums to 1 (when it is non-zero)
        total = user_profile.sum()
        if total > 0:
            user_profile = user_profile / total
        return user_profile

    # 4. Content-based recommendation
    def recommend_content_based(user_id, n_recommendations=5):
        """Return ``(item_id, score)`` pairs for unrated items, best first."""
        user_profile = build_user_profile(user_id)
        if user_profile is None:
            return []

        user_rated_items = set(
            ratings_df[ratings_df['user_id'] == user_id]['item_id'].tolist()
        )

        predictions = {
            item_id: np.dot(user_profile, dense_features[item_id])
            for item_id in range(len(items_df))
            if item_id not in user_rated_items
        }

        ranked = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
        return ranked[:n_recommendations]

    # 5. Recommend for an example user
    example_user = 0
    recommendations = recommend_content_based(example_user, n_recommendations=10)
    print(f"\n基于内容的推荐为用户{example_user}的推荐结果:")
    for item_id, score in recommendations:
        item_name = items_df.loc[item_id, 'name']
        category = items_df.loc[item_id, 'category']
        description = items_df.loc[item_id, 'description']
        print(f"物品{item_id} ({item_name}, {category}): 偏好分数 {score:.3f}")
        print(f" 描述: {description}")

    # 6. Visualise the item features after PCA to two dimensions
    pca = PCA(n_components=2)
    item_features_2d = pca.fit_transform(dense_features)

    plt.figure(figsize=(12, 8))
    # Colour by category
    categories = items_df['category'].unique()
    colors = plt.cm.Set3(np.linspace(0, 1, len(categories)))
    for i, category in enumerate(categories):
        mask = (items_df['category'] == category).to_numpy()
        plt.scatter(item_features_2d[mask, 0], item_features_2d[mask, 1],
                    c=[colors[i]], label=category, alpha=0.7, s=50)
    plt.title('物品特征空间可视化 (PCA降维)')
    plt.xlabel('主成分1')
    plt.ylabel('主成分2')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    return {
        'item_features': item_features,
        'item_content_similarity': item_content_similarity_df,
        'recommendations': recommendations,
        'pca_features': item_features_2d,
    }


# Run the content-based recommendation example
content_based_results = content_based_recommendation()


def matrix_factorization():
    """Demonstrate NMF- and SVD-based recommendation on ``rec_data``.

    Factorises the rating matrix with 10 components using NMF and
    TruncatedSVD, recommends items for user 0 with both and plots the factor
    matrices.

    Returns:
        dict with the NMF/SVD user and item factor arrays and the two
        recommendation lists (``(item_id, score)`` pairs).
    """
    rating_matrix = rec_data['rating_matrix']
    items_df = rec_data['items_df']
    print("=== 矩阵分解方法 ===")

    # 1. Non-negative matrix factorisation (NMF).
    # Ratings are 1-5 and 0 marks "not rated", so the matrix is already
    # non-negative and needs no offset (an offset would turn every missing
    # entry into a rating of 1). Both factorisations below treat the zeros
    # as observed values, not as missing data.
    n_components = 10
    nmf = NMF(n_components=n_components, random_state=42, max_iter=200)
    rating_matrix_sparse = csr_matrix(rating_matrix.values)

    user_factors = nmf.fit_transform(rating_matrix_sparse)
    item_factors = nmf.components_
    print(f"用户因子矩阵形状: {user_factors.shape}")
    print(f"物品因子矩阵形状: {item_factors.shape}")
    print(f"NMF重构误差: {nmf.reconstruction_err_:.4f}")

    # 2. Singular value decomposition via TruncatedSVD
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    user_factors_svd = svd.fit_transform(rating_matrix_sparse)
    item_factors_svd = svd.components_
    print(f"SVD解释方差比例: {svd.explained_variance_ratio_.sum():.4f}")

    # 3. Recommendation from the factors
    def recommend_matrix_factorization(user_id, user_factors, item_factors, n_recommendations=5):
        """Return ``(item_id, score)`` pairs for unrated items, best first.

        ``user_id`` is the row POSITION in the factor matrix / rating matrix.
        """
        if user_id >= user_factors.shape[0]:
            return []

        # Predicted scores for every item column
        predictions = user_factors[user_id] @ item_factors

        user_ratings = rating_matrix.iloc[user_id]
        rated_items = set(user_ratings[user_ratings > 0].index)

        # Map each column position to its item label before filtering, so
        # positions are never compared with labels.
        unrated_predictions = [
            (item_id, predictions[position])
            for position, item_id in enumerate(rating_matrix.columns)
            if item_id not in rated_items
        ]

        ranked = sorted(unrated_predictions, key=lambda x: x[1], reverse=True)
        return ranked[:n_recommendations]

    # 4. Compare both methods for an example user
    example_user = 0
    nmf_recommendations = recommend_matrix_factorization(
        example_user, user_factors, item_factors, 5
    )
    svd_recommendations = recommend_matrix_factorization(
        example_user, user_factors_svd, item_factors_svd, 5
    )

    print(f"\n矩阵分解推荐结果 (用户{example_user}):")
    print("NMF推荐:")
    for item_id, score in nmf_recommendations:
        item_name = items_df.loc[item_id, 'name']
        category = items_df.loc[item_id, 'category']
        print(f" 物品{item_id} ({item_name}, {category}): 预测评分 {score:.3f}")

    print("\nSVD推荐:")
    for item_id, score in svd_recommendations:
        item_name = items_df.loc[item_id, 'name']
        category = items_df.loc[item_id, 'category']
        print(f" 物品{item_id} ({item_name}, {category}): 预测评分 {score:.3f}")

    # 5. Visualise the factor matrices
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # NMF user factors
    sns.heatmap(user_factors[:20, :10], ax=axes[0, 0], cmap='viridis')
    axes[0, 0].set_title('NMF用户因子矩阵 (前20用户, 前10因子)')
    axes[0, 0].set_xlabel('因子')
    axes[0, 0].set_ylabel('用户')

    # NMF item factors
    sns.heatmap(item_factors[:10, :20], ax=axes[0, 1], cmap='viridis')
    axes[0, 1].set_title('NMF物品因子矩阵 (前10因子, 前20物品)')
    axes[0, 1].set_xlabel('物品')
    axes[0, 1].set_ylabel('因子')

    # SVD user factors
    sns.heatmap(user_factors_svd[:20, :10], ax=axes[1, 0], cmap='coolwarm', center=0)
    axes[1, 0].set_title('SVD用户因子矩阵 (前20用户, 前10因子)')
    axes[1, 0].set_xlabel('因子')
    axes[1, 0].set_ylabel('用户')

    # SVD item factors
    sns.heatmap(item_factors_svd[:10, :20], ax=axes[1, 1], cmap='coolwarm', center=0)
    axes[1, 1].set_title('SVD物品因子矩阵 (前10因子, 前20物品)')
    axes[1, 1].set_xlabel('物品')
    axes[1, 1].set_ylabel('因子')

    plt.tight_layout()
    plt.show()

    return {
        'user_factors_nmf': user_factors,
        'item_factors_nmf': item_factors,
        'user_factors_svd': user_factors_svd,
        'item_factors_svd': item_factors_svd,
        'nmf_recommendations': nmf_recommendations,
        'svd_recommendations': svd_recommendations,
    }


# Run the matrix factorisation example
matrix_factorization_results = matrix_factorization()


def recommendation_evaluation():
    """Evaluate user-based CF with a random train/test split of the ratings.

    Splits the ratings 80/20, builds the training matrix, recommends 10
    items for up to 50 test users and reports precision, recall and a
    simplified NDCG, followed by a bar chart of the three metrics.

    Returns:
        dict with 'train_data', 'test_data' (DataFrames) and
        'user_cf_results' (dict with 'precision', 'recall', 'ndcg' and
        'coverage', the number of users evaluated).
    """
    ratings_df = rec_data['ratings_df']
    print("=== 推荐系统评估 ===")

    # 1. Train/test split
    def split_data(ratings_df, test_ratio=0.2):
        """Randomly split the ratings into train and test DataFrames."""
        np.random.seed(42)
        test_positions = np.random.choice(
            len(ratings_df), size=int(len(ratings_df) * test_ratio), replace=False
        )
        test_data = ratings_df.iloc[test_positions].copy()
        # drop() works on labels, so translate the positions first.
        train_data = ratings_df.drop(ratings_df.index[test_positions]).copy()
        return train_data, test_data

    train_data, test_data = split_data(ratings_df)
    print(f"训练集大小: {len(train_data)}")
    print(f"测试集大小: {len(test_data)}")

    # 2. Rebuild the rating matrix from the training data only
    train_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating')
    train_matrix = train_matrix.fillna(0)

    # 3. Evaluation function
    def evaluate_recommendations(train_matrix, test_data, recommend_func, n_recommendations=10):
        """Return mean precision/recall/NDCG of ``recommend_func``."""
        precisions = []
        recalls = []
        ndcgs = []

        test_users = test_data['user_id'].unique()
        for user_id in test_users[:50]:  # cap the user count to keep it fast
            if user_id not in train_matrix.index:
                continue

            user_test_items = test_data[test_data['user_id'] == user_id]
            actual_items = set(user_test_items['item_id'].tolist())
            if not actual_items:
                continue

            try:
                recommendations = recommend_func(user_id, n_recommendations)
            except (KeyError, IndexError, ValueError) as exc:
                # Best effort: skip this user, but do not hide the reason.
                print(f"用户{user_id}推荐失败,已跳过: {exc}")
                continue

            recommended_items = {item_id for item_id, _ in recommendations}

            # Precision and recall
            hits = len(actual_items & recommended_items)
            precision = hits / len(recommended_items) if len(recommended_items) > 0 else 0
            recall = hits / len(actual_items) if len(actual_items) > 0 else 0
            precisions.append(precision)
            recalls.append(recall)

            # NDCG (simplified, binary relevance)
            dcg = sum(
                1 / np.log2(rank + 2)
                for rank, (item_id, _) in enumerate(recommendations)
                if item_id in actual_items
            )
            idcg = sum(
                1 / np.log2(rank + 2)
                for rank in range(min(len(actual_items), len(recommendations)))
            )
            ndcgs.append(dcg / idcg if idcg > 0 else 0)

        def _mean(values):
            # np.mean([]) would yield nan; report 0.0 when nothing was scored.
            return float(np.mean(values)) if values else 0.0

        return {
            'precision': _mean(precisions),
            'recall': _mean(recalls),
            'ndcg': _mean(ndcgs),
            'coverage': len(precisions),
        }

    # 4. Recommendation function under test. The similarity matrix depends
    # only on the training matrix, so it is computed once rather than on
    # every call.
    user_similarity = cosine_similarity(train_matrix)
    user_similarity_df = pd.DataFrame(
        user_similarity, index=train_matrix.index, columns=train_matrix.index
    )

    def user_cf_recommend(user_id, n_recommendations=10):
        """User-based CF recommendation on the training matrix."""
        return _user_cf_predict(
            train_matrix, user_similarity_df, user_id, n_recommendations
        )

    # 5. Evaluate
    print("评估推荐系统性能...")
    user_cf_results = evaluate_recommendations(train_matrix, test_data, user_cf_recommend)
    print(f"\n基于用户的协同过滤评估结果:")
    print(f"精确率: {user_cf_results['precision']:.3f}")
    print(f"召回率: {user_cf_results['recall']:.3f}")
    print(f"NDCG: {user_cf_results['ndcg']:.3f}")
    print(f"覆盖用户数: {user_cf_results['coverage']}")

    # 6. Visualise the metrics
    metrics = ['精确率', '召回率', 'NDCG']
    values = [
        user_cf_results['precision'],
        user_cf_results['recall'],
        user_cf_results['ndcg'],
    ]
    plt.figure(figsize=(10, 6))
    bars = plt.bar(metrics, values, color=['skyblue', 'lightgreen', 'lightcoral'])

    # Value labels above the bars
    for bar, value in zip(bars, values):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                 f'{value:.3f}', ha='center', va='bottom')

    plt.title('推荐系统评估指标')
    plt.ylabel('指标值')
    plt.ylim(0, 1)
    plt.grid(True, alpha=0.3)
    plt.show()

    return {
        'train_data': train_data,
        'test_data': test_data,
        'user_cf_results': user_cf_results,
    }


# Run the recommender evaluation example
evaluation_results = recommendation_evaluation()

推荐系统是现代互联网应用的核心技术,掌握这些算法可以构建个性化推荐服务。

来源:码农牧场

相关推荐