【笔记4】用pandas实现条目数据格式的推荐算法 (基于用户的协同)

简介:
'''
基于用户的协同推荐

条目数据
'''

import pandas as pd
from io import StringIO
import json

# Data format 1: one record per line (user, item, rating).
# Storing records instead of a user x item matrix avoids a huge sparse matrix.
# Every row must have exactly three comma-separated fields and no header row.
csv_txt = '''"Angelica","Blues Traveler",3.5
"Angelica","Broken Bells",2.0
"Angelica","Norah Jones",4.5
"Angelica","Phoenix",5.0
"Angelica","Slightly Stoopid",1.5
"Angelica","The Strokes",2.5
"Angelica","Vampire Weekend",2.0
"Bill","Blues Traveler",2.0
"Bill","Broken Bells",3.5
"Bill","Deadmau5",4.0
"Bill","Phoenix",2.0
"Bill","Slightly Stoopid",3.5
"Bill","Vampire Weekend",3.0
"Chan","Blues Traveler",5.0
"Chan","Broken Bells",1.0
"Chan","Deadmau5",1.0
"Chan","Norah Jones",3.0
"Chan","Phoenix",5.0
"Chan","Slightly Stoopid",1.0
"Dan","Blues Traveler",3.0
"Dan","Broken Bells",4.0
"Dan","Deadmau5",4.5
"Dan","Phoenix",3.0
"Dan","Slightly Stoopid",4.5
"Dan","The Strokes",4.0
"Dan","Vampire Weekend",2.0
"Hailey","Broken Bells",4.0
"Hailey","Deadmau5",1.0
"Hailey","Norah Jones",4.0
"Hailey","The Strokes",4.0
"Hailey","Vampire Weekend",1.0
"Jordyn","Broken Bells",4.5
"Jordyn","Deadmau5",4.0
"Jordyn","Norah Jones",5.0
"Jordyn","Phoenix",5.0
"Jordyn","Slightly Stoopid",4.5
"Jordyn","The Strokes",4.0
"Jordyn","Vampire Weekend",4.0
"Sam","Blues Traveler",5.0
"Sam","Broken Bells",2.0
"Sam","Norah Jones",3.0
"Sam","Phoenix",5.0
"Sam","Slightly Stoopid",4.0
"Sam","The Strokes",5.0
"Veronica","Blues Traveler",3.0
"Veronica","Norah Jones",5.0
"Veronica","Phoenix",4.0
"Veronica","Slightly Stoopid",2.5
"Veronica","The Strokes",3.0'''

# Data format 2: JSON text mapping each user to an object of {item: rating}.
# load_json_txt() below flattens it into (user, goods, rate) rows.
json_txt = '''{"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
                      "Norah Jones": 4.5, "Phoenix": 5.0,
                      "Slightly Stoopid": 1.5,
                      "The Strokes": 2.5, "Vampire Weekend": 2.0},
         
         "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,
                 "Deadmau5": 4.0, "Phoenix": 2.0,
                 "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
                  "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,
                  "Slightly Stoopid": 1.0},
         
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
                 "Deadmau5": 4.5, "Phoenix": 3.0,
                 "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                 "Vampire Weekend": 2.0},
         
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
                    "Norah Jones": 4.0, "The Strokes": 4.0,
                    "Vampire Weekend": 1.0},
         
         "Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0,
                     "Norah Jones": 5.0, "Phoenix": 5.0,
                     "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                     "Vampire Weekend": 4.0},
         
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
                 "Norah Jones": 3.0, "Phoenix": 5.0,
                 "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
                      "Phoenix": 4.0, "Slightly Stoopid": 2.5,
                      "The Strokes": 3.0}
}'''


# Module-level ratings table with columns user / goods / rate.
# It stays None until load_csv_txt() or load_json_txt() assigns it.
df = None

# Option 1: load the CSV text
def load_csv_txt():
    '''
    Parse the module-level ``csv_txt`` string into the global ``df``.

    The text carries no header row, so the three columns are named
    explicitly: user, goods, rate.
    '''
    global df
    column_names = ['user','goods','rate']
    text_buffer = StringIO(csv_txt)
    df = pd.read_csv(text_buffer, header=None, names=column_names)

# Option 2: load the JSON text (flatten the nested mapping into records)
def load_json_txt():
    '''
    Parse the module-level ``json_txt`` string into the global ``df``.

    The JSON maps user -> {goods: rate}. Each (user, goods, rate) triple
    becomes one row of a DataFrame with columns user, goods, rate.

    The frame is built directly from the records rather than by writing
    them to an unquoted CSV string and re-parsing it: that round trip
    corrupts names containing commas or quotes, turns names such as "NA"
    into NaN, and fails outright on an empty JSON object.
    '''
    global df
    ratings_by_user = json.loads(json_txt)

    records = [
        (user, goods, rate)
        for user, ratings in ratings_by_user.items()
        for goods, rate in ratings.items()
    ]

    df = pd.DataFrame(records, columns=['user', 'goods', 'rate'])


# Smoke test: populate the global ``df``. The JSON loader is the one called;
# the CSV loader on the commented-out line is the alternative.
print('测试:读取数据')
#load_csv_txt()
load_json_txt()



def build_xy(user_name1, user_name2):
    '''
    Return the two users' ratings restricted to the goods both have rated.

    user_name1, user_name2: values of the ``user`` column in the global ``df``

    Returns: a pair of pd.Series (ratings of user 1, ratings of user 2),
    aligned row by row on the same goods. Both are empty when the users
    share no rated goods.
    '''
    # ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
    ratings1 = df.loc[df['user'] == user_name1, ['goods', 'rate']]
    ratings2 = df.loc[df['user'] == user_name2, ['goods', 'rate']]

    # Inner join keeps only the goods rated by both users.
    common = pd.merge(ratings1, ratings2, on='goods', how='inner')

    # merge() suffixes the clashing ``rate`` columns as rate_x / rate_y.
    return common['rate_x'], common['rate_y']
    



# Manhattan distance
def manhattan(user_name1, user_name2):
    '''
    Sum of absolute rating differences over the goods both users rated.
    '''
    ratings_a, ratings_b = build_xy(user_name1, user_name2)
    gaps = abs(ratings_a - ratings_b)
    return sum(gaps)
    
# Euclidean distance
def euclidean(user_name1, user_name2):
    '''
    Square root of the summed squared rating differences over the goods
    both users rated.
    '''
    ratings_a, ratings_b = build_xy(user_name1, user_name2)
    delta = ratings_a - ratings_b
    squared_total = sum(delta**2)
    return squared_total**0.5
    
# Minkowski distance
def minkowski(user_name1, user_name2, r):
    '''
    Minkowski distance of order ``r`` over the goods both users rated.

    r: the order of the distance (the exponent applied to each absolute
       difference; the total is then raised to 1/r)
    '''
    ratings_a, ratings_b = build_xy(user_name1, user_name2)
    powered_gaps = abs(ratings_a - ratings_b)**r
    total = sum(powered_gaps)
    return total**(1/r)
    
# Pearson correlation coefficient
def pearson(user_name1, user_name2):
    '''
    Pearson correlation of the two users' ratings over the goods both rated.

    Returns 0 when the correlation is undefined, i.e. when the users share
    no rated goods or either user's shared ratings are all equal.
    '''
    x, y = build_xy(user_name1, user_name2)
    dev_x = x - x.mean()
    dev_y = y - y.mean()

    denominator = (sum(dev_x**2) * sum(dev_y**2))**0.5
    # Check before dividing: with no shared goods both sums are the plain
    # int 0, and 0 / 0.0 raises ZeroDivisionError.
    if denominator == 0:
        return 0
    return sum(dev_x * dev_y) / denominator

# Cosine similarity (copes with sparse data; widely used in text mining)
def cosine(user_name1, user_name2):
    '''
    Cosine similarity of the two users' ratings over the goods both rated.

    Returns 0 when the similarity is undefined, i.e. when the users share
    no rated goods or one of the shared rating vectors is all zeros.
    '''
    x, y = build_xy(user_name1, user_name2)

    denominator = (sum(x * x) * sum(y * y))**0.5
    # Check before dividing: with no shared goods both sums are the plain
    # int 0, and 0 / 0.0 raises ZeroDivisionError.
    if denominator == 0:
        return 0
    return sum(x * y) / denominator
    
# Lookup table: metric name -> function computing it for a pair of users.
metric_funcs = dict(
    manhattan=manhattan,
    euclidean=euclidean,
    minkowski=minkowski,
    pearson=pearson,
    cosine=cosine,
)


# Smoke test: print the Manhattan distance between Angelica and Bill.
print('\n测试:计算Angelica与Bill的曼哈顿距离')
print(manhattan('Angelica','Bill'))


# Find the nearest neighbours of a user (returns pd.Series)
def computeNearestNeighbor(user_name, metric='pearson', k=3, r=2):
    '''
    Rank every other user in the global ``df`` against ``user_name``.

    metric: name of the metric; one of 'manhattan', 'euclidean',
            'minkowski', 'pearson', 'cosine'
    k:      number of neighbours to return
    r:      order of the distance, used only by 'minkowski'

    Returns: pd.Series whose index holds neighbour names and whose values
    hold the metric value. Distances keep the k smallest values,
    similarities (pearson, cosine) keep the k largest.

    Raises: ValueError for an unknown metric name (previously the function
    silently returned None).
    '''
    distance_metrics = ('manhattan', 'euclidean', 'minkowski')
    similarity_metrics = ('pearson', 'cosine')
    if metric not in distance_metrics and metric not in similarity_metrics:
        raise ValueError('unknown metric: {!r}'.format(metric))

    others = df.loc[df['user'] != user_name, 'user'].unique()
    if len(others) == 0:
        # Nobody to compare with; an empty float Series keeps the return type.
        return pd.Series(dtype=float)

    # Series.apply passes each neighbour name as the first argument and
    # ``extra_args`` after it; only minkowski takes the order ``r``.
    extra_args = (user_name, r) if metric == 'minkowski' else (user_name,)
    candidates = pd.Series(others, index=others.tolist())
    scores = candidates.apply(metric_funcs[metric], args=extra_args)

    if metric in similarity_metrics:
        return scores.nlargest(k)
    return scores.nsmallest(k)
    
    
# Smoke test: print Hailey's nearest neighbours using the default arguments.
print('\n测试:计算Hailey的最近邻居')
print(computeNearestNeighbor('Hailey'))


# Recommend goods to a user from the single nearest neighbour (returns pd.DataFrame)
def recommend(user_name):
    '''
    Recommend the goods that the nearest neighbour rated but ``user_name``
    has not.

    Returns: pd.DataFrame with columns goods and rate_y (the neighbour's
    rating), sorted with the highest rating first.
    '''
    # Name of the closest user under computeNearestNeighbor's defaults.
    nearest_username = computeNearestNeighbor(user_name).index[0]

    # ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
    own_ratings = df.loc[df['user'] == user_name, ['goods', 'rate']]
    neighbor_ratings = df.loc[df['user'] == nearest_username, ['goods', 'rate']]

    # Outer join: rate_x is the user's rating, rate_y the neighbour's; a
    # missing rate_x marks goods the user has not rated yet.
    merged = pd.merge(own_ratings, neighbor_ratings, on='goods', how='outer')
    unseen = merged['rate_x'].isnull() & merged['rate_y'].notnull()

    # Best-rated goods first, so the top row is the strongest recommendation.
    return merged.loc[unseen, ['goods', 'rate_y']].sort_values(by='rate_y', ascending=False)

    
# Smoke test: print the single-neighbour recommendations for Hailey.
print('\n测试:为Hailey做推荐')
print(recommend('Hailey'))


# Recommend goods to a user from the k nearest neighbours (returns pd.Series)
def recommend2(user_name, metric='pearson', k=3, n=5, r=2):
    '''
    Recommend goods using the weighted ratings of the k nearest neighbours.

    metric: name of the metric used to pick and weight the neighbours
    k:      number of nearest neighbours that contribute
    n:      number of goods to recommend
    r:      order of the distance, used only by 'minkowski'

    Returns: pd.Series whose index holds goods names and whose values hold
    the weighted rating; at most n entries, largest first. Empty when
    there are no neighbours.
    '''
    # Pick the neighbours with the requested metric (the original always
    # used 'pearson' here, ignoring the argument).
    nearest_neighbors = computeNearestNeighbor(user_name, metric=metric, k=k, r=r)
    if nearest_neighbors.empty:
        return pd.Series(dtype=float)

    # Turn metric values into weights.
    if metric in ['manhattan', 'euclidean', 'minkowski']:
        # Smaller distance means more similar, so use a decreasing function.
        # 1 / (1 + d) stays finite when a neighbour is at distance 0.
        weights = 1 / (1 + nearest_neighbors)
    else:
        # pearson / cosine: a larger value already means more similar.
        weights = nearest_neighbors

    total = weights.sum()
    if total == 0:
        # Normalising by zero would produce NaN weights; weight neighbours equally.
        weights = pd.Series(1.0 / len(weights), index=weights.index)
    else:
        weights = weights / total

    # The user's own ratings are the same for every neighbour: look them up once.
    own_ratings = df.loc[df['user'] == user_name, ['goods', 'rate']]

    # For each neighbour, collect ratings of goods the user has not rated,
    # scaled by that neighbour's weight.
    neighbors_rate_with_weight = []
    for neighbor_name in weights.index:
        neighbor_ratings = df.loc[df['user'] == neighbor_name, ['goods', 'rate']]

        merged = pd.merge(own_ratings, neighbor_ratings, on='goods', how='outer')
        unseen = merged.loc[merged['rate_x'].isnull() & merged['rate_y'].notnull(), ['goods', 'rate_y']]

        # Index by goods name so concat can align neighbours; the explicit
        # float dtype keeps an empty result numeric.
        weighted = pd.Series(unseen['rate_y'].tolist(), index=unseen['goods'], dtype=float)
        neighbors_rate_with_weight.append(weighted * weights[neighbor_name])

    # One column per neighbour aligned on goods; sum across each row
    # (skipping missing ratings) and keep the n largest totals.
    return pd.concat(neighbors_rate_with_weight, axis=1).sum(axis=1, skipna=True).nlargest(n)
    
# Smoke tests: print recommend2() output for Hailey under three settings.
# Call 1: metric='manhattan', k=3 neighbours, up to n=5 goods.
print('\n测试:为Hailey做推荐')
print(recommend2('Hailey', metric='manhattan', k=3, n=5))

# Call 2: metric='euclidean', with r=2 passed explicitly.
print('\n测试:为Hailey做推荐')
print(recommend2('Hailey', metric='euclidean', k=3, n=5, r=2))

# Call 3: metric='pearson' with a single neighbour (k=1).
print('\n测试:为Hailey做推荐')
print(recommend2('Hailey', metric='pearson', k=1, n=5))
本文转自罗兵博客园博客,原文链接:http://www.cnblogs.com/hhh5460/p/6121899.html ,如需转载请自行联系原作者
相关文章
|
3月前
|
算法
【❤️算法笔记❤️】-每日一刷-19、删除链表的倒数第 N个结点
【❤️算法笔记❤️】-每日一刷-19、删除链表的倒数第 N个结点
84 1
|
3月前
|
算法 索引
❤️算法笔记❤️-(每日一刷-141、环形链表)
❤️算法笔记❤️-(每日一刷-141、环形链表)
62 0
|
3月前
|
算法
【❤️算法笔记❤️】-(每日一刷-876、单链表的中点)
【❤️算法笔记❤️】-(每日一刷-876、单链表的中点)
61 0
|
3月前
|
算法
【❤️算法笔记❤️】-每日一刷-23、合并 K 个升序链表
【❤️算法笔记❤️】-每日一刷-23、合并 K 个升序链表
41 0
|
3月前
|
存储 算法
【❤️算法笔记❤️】-每日一刷-21、合并两个有序链表
【❤️算法笔记❤️】-每日一刷-21、合并两个有序链表
130 0
|
3月前
|
数据可视化 数据挖掘 数据处理
模型预测笔记(四):pandas_profiling生成数据报告
本文介绍了pandas_profiling库,它是一个Python工具,用于自动生成包含多种统计指标和可视化的详细HTML数据报告,支持大型数据集并允许自定义配置。安装命令为`pip install pandas_profiling`,使用示例代码`pfr = pandas_profiling.ProfileReport(data_train); pfr.to_file("./example.html")`。
79 1
|
3月前
|
算法 API 计算机视觉
人脸识别笔记(一):通过yuface调包(参数量54K更快更小更准的算法) 来实现人脸识别
本文介绍了YuNet系列人脸检测算法的优化和使用,包括YuNet-s和YuNet-n,以及通过yuface库和onnx在不同场景下实现人脸检测的方法。
111 1
|
3月前
|
JSON 算法 数据可视化
测试专项笔记(一): 通过算法能力接口返回的检测结果完成相关指标的计算(目标检测)
这篇文章是关于如何通过算法接口返回的目标检测结果来计算性能指标的笔记。它涵盖了任务描述、指标分析(包括TP、FP、FN、TN、精准率和召回率),接口处理,数据集处理,以及如何使用实用工具进行文件操作和数据可视化。文章还提供了一些Python代码示例,用于处理图像文件、转换数据格式以及计算目标检测的性能指标。
95 0
测试专项笔记(一): 通过算法能力接口返回的检测结果完成相关指标的计算(目标检测)
|
3月前
|
算法
❤️算法笔记❤️-(每日一刷-160、相交链表)
❤️算法笔记❤️-(每日一刷-160、相交链表)
28 1
|
3月前
|
数据可视化 搜索推荐 Python
LeetCode 刷题笔记之可视化六大排序算法:冒泡、快速、归并、插入、选择、桶排序
这篇文章是关于LeetCode刷题笔记,主要介绍了六大排序算法(冒泡、快速、归并、插入、选择、桶排序)的Python实现及其可视化过程。
32 0