import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
% matplotlib inline
# Load the raw ratings. The CSV has no header row: its first line is already
# a rating record (0, 50, 5, 881250949). Letting read_csv infer a header
# would turn that record into column labels and silently drop it, so the
# column names are supplied explicitly and header inference is disabled.
df = pd.read_csv(
    'Recommendation System.csv',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
df.head()
0 50 5 881250949 0 0 172 5 881250949 1 0 133 1 881250949 2 196 242 3 881250949 3 186 302 3 891717742 4 22 377 1 878887116
# Give the four positional columns descriptive labels.
df.columns = ['user_id', 'item_id', "rating", "timestamp"]
df.head(n=5)
user_id item_id rating timestamp 0 0 172 5 881250949 1 0 133 1 881250949 2 196 242 3 881250949 3 186 302 3 891717742 4 22 377 1 878887116
# Lookup table mapping each item_id to its movie title.
movie_titles = pd.read_csv(filepath_or_buffer="Movie_Id_Titles")
movie_titles.head(n=5)
item_id title 0 1 Toy Story (1995) 1 2 GoldenEye (1995) 2 3 Four Rooms (1995) 3 4 Get Shorty (1995) 4 5 Copycat (1995)
# Attach the title to every rating row by joining on the shared item_id
# column (default inner join), then peek at the rows for the lowest item ids.
df = df.merge(movie_titles, on='item_id')
df.sort_values(by='item_id').head()
user_id item_id rating timestamp title 3937 308 1 4 887736532 Toy Story (1995) 4247 893 1 5 874827725 Toy Story (1995) 4246 336 1 3 877759342 Toy Story (1995) 4245 303 1 5 879466966 Toy Story (1995) 4244 886 1 4 876031433 Toy Story (1995)
# Preview the merged ratings/titles frame in its current row order.
df.head(n=5)
user_id item_id rating timestamp title 0 0 172 5 881250949 Empire Strikes Back, The (1980) 1 213 172 5 878955442 Empire Strikes Back, The (1980) 2 92 172 4 875653271 Empire Strikes Back, The (1980) 3 77 172 3 884752562 Empire Strikes Back, The (1980) 4 194 172 3 879521474 Empire Strikes Back, The (1980)
# Five titles with the highest mean rating.
(
    df.groupby("title")['rating']
    .mean()
    .sort_values(ascending=False)
    .head()
)
title
Marlene Dietrich: Shadow and Light (1996) 5.0
Prefontaine (1997) 5.0
Santa with Muscles (1996) 5.0
Star Kid (1997) 5.0
Someone Else's America (1995) 5.0
Name: rating, dtype: float64
# Five titles with the largest number of ratings.
(
    df.groupby("title")['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)
title
Star Wars (1977) 583
Contact (1997) 509
Fargo (1996) 508
Return of the Jedi (1983) 507
Liar Liar (1997) 485
Name: rating, dtype: int64
# Per-title summary frame: indexed by title, with the mean rating in a single
# 'rating' column.
ratings = df.groupby("title")['rating'].mean().to_frame()
ratings.head()
rating title 'Til There Was You (1997) 2.333333 1-900 (1994) 2.600000 101 Dalmatians (1996) 2.908257 12 Angry Men (1957) 4.344000 187 (1997) 3.024390
# Add the number of ratings each title received. The grouped count is already
# a Series indexed by title, so it is assigned directly (pandas aligns it on
# the index) instead of being wrapped in a throwaway one-column DataFrame.
ratings["rating_counts"] = df.groupby("title")['rating'].count()
ratings.head()
rating rating_counts title 'Til There Was You (1997) 2.333333 9 1-900 (1994) 2.600000 5 101 Dalmatians (1996) 2.908257 109 12 Angry Men (1957) 4.344000 125 187 (1997) 3.024390 41
seaborn的distplot()集合了matplotlib的hist()与核密度估计kdeplot的功能,增加了rugplot分布观测条显示与利用scipy库fit拟合参数分布的新颖用途。
核密度估计是在概率论中用来估计未知的密度函数的方法,属于非参数估计方法之一。由于核密度估计方法不利用有关数据分布的先验知识,对数据分布不附加任何假定,是一种从数据样本本身出发研究数据分布特征的方法,因而,在统计学理论和应用领域均受到高度的重视。
# Distribution of the number of ratings per title.
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of
# histplot/displot -- confirm the installed version before upgrading.
sns.distplot(a=ratings["rating_counts"])
<matplotlib.axes._subplots.AxesSubplot at 0x2a6bc5a3710>
# Distribution of the per-title mean rating, using 50 histogram bins.
sns.distplot(a=ratings["rating"], bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x2a6bcda9da0>
# Distribution of (mean rating x number of ratings) per title, i.e. the total
# rating points each title has accumulated.
sns.distplot(a=ratings['rating'] * ratings['rating_counts'])
<matplotlib.axes._subplots.AxesSubplot at 0x2a6bcec49b0>
# Scatter of mean rating against number of ratings, with marginal
# distributions. jointplot builds its own (square) figure, so a preceding
# plt.figure(figsize=...) call would only leave an empty extra figure behind
# and would not size this plot; the size is set through `height` instead.
sns.jointplot(x='rating', y="rating_counts", data=ratings, alpha=.5, height=7)
# Re-check the long-format ratings frame before reshaping it.
df.head(n=5)
user_id item_id rating timestamp title 0 0 172 5 881250949 Empire Strikes Back, The (1980) 1 213 172 5 878955442 Empire Strikes Back, The (1980) 2 92 172 4 875653271 Empire Strikes Back, The (1980) 3 77 172 3 884752562 Empire Strikes Back, The (1980) 4 194 172 3 879521474 Empire Strikes Back, The (1980)
数据透视表:
https://www.cnblogs.com/Yanjy-OnlyOne/p/11195621.html
# Reshape the long ratings table into a user x title matrix: one row per
# user_id, one column per title, each cell holding that user's rating
# (pivot_table's default mean aggregation applies if a pair repeats).
# User/title pairs with no rating come out as NaN.
movie_mat = pd.pivot_table(
    df,
    values='rating',
    index='user_id',
    columns='title',
)
movie_mat.head()
title 'Til There Was You (1997) 1-900 (1994) 101 Dalmatians (1996) 12 Angry Men (1957) 187 (1997) 2 Days in the Valley (1996) 20,000 Leagues Under the Sea (1954) 2001: A Space Odyssey (1968) 3 Ninjas: High Noon At Mega Mountain (1998) 39 Steps, The (1935) ... Yankee Zulu (1994) Year of the Horse (1997) You So Crazy (1994) Young Frankenstein (1974) Young Guns (1988) Young Guns II (1990) Young Poisoner's Handbook, The (1995) Zeus and Roxanne (1997) unknown Á köldum klaka (Cold Fever) (1994) user_id 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1 NaN NaN 2.0 5.0 NaN NaN 3.0 4.0 NaN NaN ... NaN NaN NaN 5.0 3.0 NaN NaN NaN 4.0 NaN 2 NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3 NaN NaN NaN NaN 2.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 rows × 1664 columns