首先使用的数据是 MovieLens(用的是 ml-10M100K 这份,约 1000 万条用户评分数据):
1、关于电影hot排行榜统计
import pandas as pd
# Load the ratings and movie metadata files, then join them into one frame
# with one row per rating, carrying the movie title and genre string.
# Both files are read with "::" as the field separator and no header row,
# so column names are supplied explicitly.
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
# A multi-character separator is only supported by the Python parsing engine;
# requesting it explicitly avoids the ParserWarning about the silent fallback.
links = pd.read_csv(
    '/Users/lonng/Desktop/推荐学习/movie_rec/ml-10M100K/ratings.dat',
    sep="::",
    names=column_names,
    engine="python",
)
column_names1 = ['item_id', 'title', 'movietype']
movies = pd.read_csv(
    '/Users/lonng/Desktop/推荐学习/movie_rec/ml-10M100K/movies.dat',
    sep="::",
    names=column_names1,
    engine="python",
)
movies.head(5)  # notebook-style preview; has no effect when run as a script
# Inner join (the pd.merge default) on the shared movie id column.
df = pd.merge(links, movies, on="item_id")
df.head(5)  # notebook-style preview
# The timestamp column is dropped here; the visible code never uses it.
df = df.drop(columns=['timestamp'])
# dropna() returns a new frame; the original call discarded the result, so
# rows with missing values were never actually removed.
df = df.dropna()
df.shape  # notebook-style preview
# Genres and their counts.
# 'movietype' holds the genres of a movie joined by '|'. First collect the
# set of distinct genre labels, then print, for each label, how many rows of
# df mention it.
genre_labels = set()
# dropna() guards against missing genre strings, which would otherwise make
# the split yield a float NaN that cannot be iterated.
for gen in df['movietype'].dropna().str.split('|'):
    genre_labels.update(gen)
for x in genre_labels:
    # regex=False: labels are literal text, and some contain characters such
    # as parentheses that would be interpreted as regex syntax.
    # na=False: rows without a genre string count as non-matching.
    # Note this is a substring match, same as the original code.
    matches = df['movietype'].str.contains(x, regex=False, na=False)
    # Summing the boolean mask counts matching rows without materialising a
    # filtered copy of the whole frame for every genre.
    print(x, int(matches.sum()))
# Top movies: rank titles by their mean rating and keep the 20 highest.
# NOTE(review): the mean is taken over however many ratings a title has, so
# titles with very few ratings can rank first - confirm this is intended.
mean_rating_by_title = df.groupby(['title'])['rating'].mean()
top = mean_rating_by_title.sort_values(ascending=False).head(20)
mean_rating_by_title  # notebook-style display of the per-title means
2、TF-IDF相似度推荐
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
movie_types = set()
kk = []
a = open('/Users/lonng/Desktop/推荐学习/movie_rec/ml-10M100K/movies.dat')
for i in a:
# if linenum>5:
# break
# linenum +=1
# print (i)
iteam = i.strip().split('::')
movieid,title,movietype = iteam[0],iteam[1],iteam[2]
mm =''
for j in movietype.split('|'):
&nb