MovieLens 1M数据集

这个数据集是电影评分数据:包括电影评分,电影元数据(风格类型,年代)以及关于用户的人口统计学数据(年龄,邮编,性别,职业等)。

MovieLens 1M数据集包含三张表:评分、用户信息和电影信息。

1
2
3
4
import pandas as pd
import chardet
import matplotlib.pyplot as plt
import numpy as np
1
2
# Cap DataFrame/Series reprs at 10 rows so printed output stays compact.
pd.set_option('display.max_rows', 10)
1
2
3
4
5
6
# Column names for users.dat, which has no header row.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# Fields are separated by '::'. No engine is specified here, so pandas emits
# a ParserWarning and falls back to the python engine (see the output below).
users = pd.read_table('C:/Users/28696/Desktop/人工智能/ml-1m/users.dat', sep='::',
header=None, names=unames)

# Print the first rows of the users DataFrame to verify the load succeeded.
print(users.head())
   user_id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
2        3      M   25          15  55117
3        4      M   45           7  02460
4        5      M   25          20  55455


C:\Users\28696\AppData\Local\Temp\ipykernel_21476\1541965289.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  users = pd.read_table('C:/Users/28696/Desktop/人工智能/ml-1m/users.dat', sep='::',

因为 sep='::' 是多字符分隔符,pandas 会把它当作正则表达式处理,而默认的 C 引擎不支持正则分隔符,于是出现了上面的警告(ParserWarning,并非错误)。解决方法是在调用时显式指定 engine='python'。

1
2
# Reload users.dat, naming the python engine explicitly so the multi-character
# '::' separator no longer triggers the ParserWarning.
users = pd.read_csv(
    'C:/Users/28696/Desktop/人工智能/ml-1m/users.dat',
    sep='::',
    header=None,
    names=unames,
    engine='python',
)
1
2
3
# ratings.dat: one row per (user, movie) rating event; no header row.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    'C:/Users/28696/Desktop/人工智能/ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)
1
# Location of movies.dat; used below for encoding detection and for loading.
file_path = 'C:/Users/28696/Desktop/人工智能/ml-1m/movies.dat'
1
2
3
4
5
# Sniff the text encoding of movies.dat from its raw bytes, so the file can be
# decoded correctly when it is loaded with pandas.
with open(file_path, 'rb') as f:
    raw_data = f.read()

# chardet reports None as the encoding when it cannot decide; fall back to
# ISO-8859-1, the encoding the other cells of this notebook hard-code.
encoding = chardet.detect(raw_data)['encoding'] or 'ISO-8859-1'

print(f'Detected encoding: {encoding}')
Detected encoding: ISO-8859-1
1
2
3
4
5
# Column names for movies.dat (no header row in the file).
mnames = ['movie_id', 'title', 'genres']

# Decode the file with the encoding detected above.
movies = pd.read_csv(
    file_path,
    sep='::',
    header=None,
    names=mnames,
    engine='python',
    encoding=encoding,
)
1
2
# Show the result.
# NOTE(review): genre_counts_for_rating_5 is not defined in any cell shown
# here; presumably a per-genre count of 5-star ratings -- confirm against the
# cell that builds it.
print(genre_counts_for_rating_5)
Action        52187
Adventure     26608
Animation     10795
Children's    13574
Comedy        73530
              ...  
Romance       32059
Sci-Fi        32441
Thriller      41408
War           23423
Western        4965
Length: 18, dtype: int64
1
2
3
4
5
6
7
# Bar chart of the per-genre counts held in genre_counts_for_rating_5.
# Imported here so the cell runs on its own; the traceback recorded below was
# caused by `plt` not being defined when the cell was executed.
import matplotlib.pyplot as plt

# The series name indicates these are counts for rating 5; bind it explicitly
# because the title and axis label interpolate `rating`.
rating = 5

genre_counts_for_rating_5.plot(kind='bar', color='skyblue')
plt.title(f'Count of Movies with Rating {rating}')
plt.xlabel('Genre')
plt.ylabel(f'Count of Rating {rating}')
plt.xticks(rotation=45)  # slant the genre names so they do not overlap
plt.tight_layout()
plt.show()
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

Cell In[64], line 2
      1 genre_counts_for_rating_5.plot(kind='bar', color='skyblue')
----> 2 plt.title(f'Count of Movies with Rating {rating}')
      3 plt.xlabel('Genre')
      4 plt.ylabel(f'Count of Rating {rating}')


NameError: name 'plt' is not defined

output 11 1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Tally how often each rating value occurs across all rating rows.
rating_counts = data['rating'].value_counts()

# Share of each rating value, in percent.
rating_counts_percentage = rating_counts / rating_counts.sum() * 100

# One colour per wedge.
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99','#c2c2f0']

# Wedge captions of the form "<rating>: <count> (<percent>%)".
wedge_labels = [
    f'{rating}: {count} ({round(percentage, 1)}%)'
    for rating, count, percentage in zip(
        rating_counts.index,
        rating_counts.values,
        rating_counts_percentage.values,
    )
]

# Pie chart; autopct additionally prints the percentage inside each wedge.
plt.figure(figsize=(10, 8))
plt.pie(
    rating_counts,
    labels=wedge_labels,
    autopct='%1.1f%%',
    startangle=140,
    colors=colors,
)

# Chart title.
plt.title('Movie Rating Distribution', fontsize=16, fontweight='bold', color='black')

# Equal aspect ratio keeps the pie circular.
plt.axis('equal')
plt.show()


output 12 0

加载前几行验证一下数据加载工作是否顺利

1
# Preview the first five user rows.
users.head(5)
user_id gender age occupation zip
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455
1
# Preview the first five rating rows.
ratings.head(5)
user_id movie_id rating timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
1
# Preview the first five movie rows.
movies.head(5)
movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy

注意,年龄和职业是以编码形式给出的,它们的具体含义请参考该数据集的README文件。分析散布在三个表中的数据不是一件轻松的事情。假设我们想要根据性别和年龄来计算某部电影的平均得分,如果将所有的数据都合并到一个表中的话,问题就简单多了。我们先用pandas的merge函数将ratings和users合并到一起,然后再将movies也合并进去。pandas会根据列名的重叠情况推断出哪些列是合并(或连接)键:

1
# Join the three tables into one wide frame. No keys are given, so pandas
# joins on the shared column names: user_id for ratings/users, then movie_id
# for the result and movies.
data = ratings.merge(users).merge(movies)
1
# Preview the first five rows of the merged table.
data[:5]
user_id movie_id rating timestamp gender age occupation zip title genres
0 1 1193 5 978300760 F 1 10 48067 One Flew Over the Cuckoo's Nest (1975) Drama
1 2 1193 5 978298413 M 56 16 70072 One Flew Over the Cuckoo's Nest (1975) Drama
2 12 1193 4 978220179 M 25 12 32793 One Flew Over the Cuckoo's Nest (1975) Drama
3 15 1193 4 978199279 M 25 7 22903 One Flew Over the Cuckoo's Nest (1975) Drama
4 17 1193 5 978158471 M 50 1 95350 One Flew Over the Cuckoo's Nest (1975) Drama
1
# Show the first merged row as a Series (one entry per column).
data.iloc[0]
user_id                                            1
movie_id                                        1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object

列出评分最高的十部电影

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import pandas as pd

# Where the movie and rating tables live on disk.
movies_file = 'C:/Users/28696/Desktop/人工智能/ml-1m/movies.dat'
ratings_file = 'C:/Users/28696/Desktop/人工智能/ml-1m/ratings.dat'

# Both files are '::'-separated, have no header row and are read as ISO-8859-1.
# NOTE: this cell rebinds `movies` and `ratings` with camelCase key columns
# (movieId / userId) instead of the movie_id / user_id names used earlier.
movies = pd.read_table(
    movies_file,
    sep='::',
    header=None,
    names=['movieId', 'title', 'genres'],
    engine='python',
    encoding='ISO-8859-1',
)
ratings = pd.read_table(
    ratings_file,
    sep='::',
    header=None,
    names=['userId', 'movieId', 'rating', 'timestamp'],
    engine='python',
    encoding='ISO-8859-1',
)

# Attach a title to every rating, then average the ratings per title.
movie_ratings = movies.merge(ratings, on='movieId')
average_ratings = movie_ratings.groupby('title', as_index=False)['rating'].mean()

# Highest averages first; keep ten.
# NOTE(review): there is no minimum-rating-count filter here, so titles with
# very few ratings can top this list -- confirm that is intended.
top_movies = average_ratings.sort_values('rating', ascending=False)[:10]

# Show the ten best-rated titles.
print(top_movies)
                                          title  rating
3477                    Ulysses (Ulisse) (1954)     5.0
2025                               Lured (1947)     5.0
1203                    Follow the Bitch (1998)     5.0
407                    Bittersweet Motel (2000)     5.0
3087                     Song of Freedom (1936)     5.0
2453                   One Little Indian (1973)     5.0
3044                       Smashing Time (1967)     5.0
2903  Schlafes Bruder (Brother of Sleep) (1995)     5.0
1297         Gate of Heavenly Peace, The (1995)     5.0
249                            Baby, The (1973)     5.0

数据合并到一张表之后,就可以根据任意个用户或电影属性对评分数据进行聚合操作了。要按性别计算每部电影的平均得分,我们可以使用pivot_table方法:

1
2
# Average rating per title (rows) split by gender (columns).
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
1
# Preview the first five titles of the gender pivot.
mean_ratings.head(5)
gender F M
title
$1,000,000 Duck (1971) 3.375000 2.761905
'Night Mother (1986) 3.388889 3.352941
'Til There Was You (1997) 2.675676 2.733333
'burbs, The (1989) 2.793478 2.962085
...And Justice for All (1979) 3.828571 3.689024

该操作产生了另一个DataFrame,其内容为电影平均得分,行标为电影名称,列标为性别。现在,我们打算过滤掉评分数据不够250条的电影(这个数字可以自己设定)。为了达到这个目的,我们先对title进行分组,然后利用size()得到一个含有各电影分组大小的Series对象:

1
# Number of rating rows per title (a Series indexed by title).
ratings_by_title = data.groupby('title').size()
1
# Preview the first ten per-title counts.
ratings_by_title.head(10)
title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64
1
# Titles that received at least 250 ratings.
active_titles = ratings_by_title[ratings_by_title >= 250].index
1
# Print the Index of titles that passed the 250-rating threshold.
print(active_titles)
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)

上面的active_titles中的电影,评分数据都不少于250条。我们可以用这些标题作为索引,从mean_ratings中选出这些评分数据不少于250条的电影:

1
2
# Keep only the rows of mean_ratings whose title is in active_titles.
mean_ratings = mean_ratings.loc[active_titles]
# Display the filtered table.
mean_ratings
gender F M
title
'burbs, The (1989) 2.793478 2.962085
10 Things I Hate About You (1999) 3.646552 3.311966
101 Dalmatians (1961) 3.791444 3.500000
101 Dalmatians (1996) 3.240000 2.911215
12 Angry Men (1957) 4.184397 4.328421
... ... ...
Young Guns (1988) 3.371795 3.425620
Young Guns II (1990) 2.934783 2.904025
Young Sherlock Holmes (1985) 3.514706 3.363344
Zero Effect (1998) 3.864407 3.723140
eXistenZ (1999) 3.098592 3.289086

1216 rows × 2 columns

想要查看女性观众喜欢的电影,可以按F列进行降序操作:

1
2
# Order titles by the female average, highest first, and show the top ten.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)
gender F M
title
Close Shave, A (1995) 4.644444 4.473795
Wrong Trousers, The (1993) 4.588235 4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589
Wallace & Gromit: The Best of Aardman Animation (1996) 4.563107 4.385075
Schindler's List (1993) 4.562602 4.491415
Shawshank Redemption, The (1994) 4.539075 4.560625
Grand Day Out, A (1992) 4.537879 4.293255
To Kill a Mockingbird (1962) 4.536667 4.372611
Creature Comforts (1990) 4.513889 4.272277
Usual Suspects, The (1995) 4.513317 4.518248

统计喜剧类型电影的男女人数分布

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pandas as pd
import matplotlib.pyplot as plt

# Locations of the three data files.
movies_file = 'C:/Users/28696/Desktop/人工智能/ml-1m/movies.dat'
ratings_file = 'C:/Users/28696/Desktop/人工智能/ml-1m/ratings.dat'
users_file = 'C:/Users/28696/Desktop/人工智能/ml-1m/users.dat'

# Ratings: '::'-separated, no header row, read as ISO-8859-1.
rnames = ['userId', 'movieId', 'rating', 'timestamp']
ratings = pd.read_csv(ratings_file, sep='::', header=None, names=rnames, engine='python', encoding='ISO-8859-1')

# Users: same file format.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_csv(users_file, sep='::', header=None, names=unames, engine='python', encoding='ISO-8859-1')

# Movies: same file format.
movies = pd.read_csv(movies_file, sep='::', header=None, names=['movieId', 'title', 'genres'], engine='python', encoding='ISO-8859-1')

# Movies whose genre string mentions comedy (case-insensitive).
comedy_movies = movies[movies['genres'].str.contains('comedy', na=False, case=False)]

# Rating rows that refer to one of those comedies.
comedy_ratings = ratings[ratings['movieId'].isin(comedy_movies['movieId'])]

# Attach user attributes; the key is named differently in the two tables.
comedy_ratings_with_users = pd.merge(comedy_ratings, users, left_on='userId', right_on='user_id')

# The chart is titled and labelled as a count of users, so count every user
# once. Counting rows of the merged table would count ratings instead and
# weight each user by how many comedies they rated.
comedy_users = comedy_ratings_with_users.drop_duplicates(subset='userId')
gender_distribution = comedy_users['gender'].value_counts()

# Derive tick labels from the gender codes actually present, so they stay
# correct whatever order value_counts() returns them in.
gender_names = {'M': 'Male', 'F': 'Female'}
gender_labels = [gender_names.get(code, code) for code in gender_distribution.index]

# Bar chart of the gender split.
plt.figure(figsize=(10, 6))  # figure size
bar_colors = ['#1f77b4', '#ff7f0e']  # one colour per bar
plt.bar(gender_distribution.index, gender_distribution.values, color=bar_colors)
plt.title('Gender Distribution of Users Rating Comedy Movies', fontsize=16)
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Number of Users', fontsize=12)
plt.xticks(gender_distribution.index, gender_labels, rotation=0)  # readable names, unrotated
plt.grid(axis='y', linestyle='--', alpha=0.7)  # light horizontal guide lines

# Fit the layout to the figure and render.
plt.tight_layout()
plt.show()


output 36 0

统计观看喜剧电影的用户年龄分布

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Age distribution of the users who rated at least one comedy.
# comedy_ratings_with_users has one row per rating, so reduce it to one row
# per user first; otherwise the bars would show rating counts while the chart
# is labelled 'Number of Users'.
# NOTE(review): the article states that `age` is a coded value explained in the
# dataset README, so each bar is an age code rather than a literal age.
comedy_user_ages = comedy_ratings_with_users.drop_duplicates(subset='userId')['age']
age_distribution = comedy_user_ages.value_counts().sort_index()

# Bar chart of the age split.
plt.figure(figsize=(12, 8))  # figure size
plt.bar(age_distribution.index, age_distribution.values, color='skyblue')
plt.title('Age Distribution of Users Rating Comedy Movies', fontsize=16)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Number of Users', fontsize=12)
plt.xticks(age_distribution.index, age_distribution.index)  # one tick per age value present
plt.grid(axis='y', linestyle='--', alpha=0.7)  # light horizontal guide lines

# Fit the layout to the figure and render.
plt.tight_layout()
plt.show()


output 38 0

统计观看喜剧类型电影的25岁用户的职业分布

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd
import matplotlib.pyplot as plt

# Locations of the three data files.
movies_file = 'C:/Users/28696/Desktop/人工智能/ml-1m/movies.dat'
ratings_file = 'C:/Users/28696/Desktop/人工智能/ml-1m/ratings.dat'
users_file = 'C:/Users/28696/Desktop/人工智能/ml-1m/users.dat'

# Load the tables: '::'-separated, no header row, read as ISO-8859-1.
rnames = ['userId', 'movieId', 'rating', 'timestamp']
ratings = pd.read_csv(ratings_file, sep='::', header=None, names=rnames, engine='python', encoding='ISO-8859-1')

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_csv(users_file, sep='::', header=None, names=unames, engine='python', encoding='ISO-8859-1')

movies = pd.read_csv(movies_file, sep='::', header=None, names=['movieId', 'title', 'genres'], engine='python', encoding='ISO-8859-1')

# Movies whose genre string mentions comedy (case-insensitive).
comedy_movies = movies[movies['genres'].str.contains('comedy', na=False, case=False)]

# Rating rows that refer to one of those comedies.
comedy_ratings = ratings[ratings['movieId'].isin(comedy_movies['movieId'])]

# Attach user attributes; the key is named differently in the two tables.
comedy_ratings_with_users = pd.merge(comedy_ratings, users, left_on='userId', right_on='user_id')

# Comedy ratings made by users whose age value is 25.
# NOTE(review): the article states that `age` is a coded value explained in the
# dataset README, so 25 may denote an age bracket rather than exactly 25.
age_25_comedy_ratings = comedy_ratings_with_users[(comedy_ratings_with_users['age'] == 25)]

# The chart is labelled as a count of users, so count every user once;
# counting the rating rows directly would weight each user by the number of
# comedies they rated.
age_25_comedy_users = age_25_comedy_ratings.drop_duplicates(subset='userId')
occupation_distribution = age_25_comedy_users['occupation'].value_counts()

# Bar chart of the occupation split (x values are the occupation codes).
plt.figure(figsize=(12, 8))  # figure size
plt.bar(occupation_distribution.index, occupation_distribution.values, color='skyblue', edgecolor='black', width=0.8)

plt.title('Occupation Distribution of 25-year-olds Rating Comedy Movies', fontsize=16)
plt.xlabel('Occupation', fontsize=12)
plt.ylabel('Number of Users', fontsize=12)
plt.xticks(occupation_distribution.index, rotation=45)  # one slanted tick per occupation code

# Light horizontal guide lines for readability.
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Fit the layout to the figure.
plt.tight_layout()

# Render the chart.
plt.show()


output 40 0

1、年龄分组:使用 pandas.cut 函数将 users 数据集中的用户年龄分为不同的组。

2、年龄段评分统计:使用 ratings 数据集,按照用户年龄分组统计每个年龄段的评分数量和平均评分。

3、被评价次数最多的电影:使用 ratings 数据集找出被评价次数最多的50部电影。

4、年龄段评分差异:对这些电影在不同年龄段的平均评分进行比较。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Assumes `users` (with user_id and age columns) and `ratings` (with userId,
# movieId and rating columns) were loaded by an earlier cell.

# Bucket users into ten-year age groups.
# right=False makes every bin left-closed -- [0, 10), [10, 20), ... -- which is
# what the labels claim. With the default right-closed bins an age value of 50
# lands in (40, 50] and is reported under '40-49'.
# NOTE: this moves the age-50 ratings from '40-49' to '50-59', so the counts
# and means for those two groups differ from a run with the default bins.
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79']
users['age_group'] = pd.cut(users.age, bins=range(0, 81, 10), right=False, labels=labels)

# Attach the age group to every rating; the key is named differently in the
# two tables.
ratings_with_users = pd.merge(ratings, users[['user_id', 'age_group']], left_on='userId', right_on='user_id')

# Number of ratings and mean rating per age group. observed=False keeps the
# empty groups in the table; string aggregator names avoid the deprecation
# warning pandas raises for np.size / np.mean.
age_rating_stats = ratings_with_users.groupby('age_group', observed=False)['rating'].agg(['size', 'mean'])
print(age_rating_stats)

# The 50 movies with the most ratings.
top_rated_movies = ratings_with_users['movieId'].value_counts().head(50)

# Mean rating of each of those movies within each age group (long format).
top_movie_ratings = ratings_with_users[ratings_with_users['movieId'].isin(top_rated_movies.index)]
rating_diff_by_age = top_movie_ratings.groupby(['age_group', 'movieId'], observed=False)['rating'].mean().reset_index()

# Reshape with pivot_table into one row per movie and one column per age
# group. Missing combinations stay NaN: a filled-in 0 would read as a real
# average, which a rating can never be.
rating_diff_unstacked = rating_diff_by_age.pivot_table(index='movieId', columns='age_group', values='rating', observed=False)
print(rating_diff_unstacked.head(50))

# Plot one example: the first top-50 movie that appears in the merged ratings.
example_movie_id = top_movie_ratings['movieId'].iloc[0]
example_movie_ratings = rating_diff_by_age[rating_diff_by_age['movieId'] == example_movie_id]
plt.bar(example_movie_ratings['age_group'], example_movie_ratings['rating'], color='skyblue')
plt.title(f"Average Rating of Movie {example_movie_id} by Age Group")
plt.ylabel('Average Rating')
plt.xlabel('Age Group')
plt.show()
             size      mean
age_group                  
0-9         27211  3.549520
10-19      183536  3.507573
20-29      395556  3.545235
30-39      199003  3.618162
40-49      156123  3.673559
50-59       38780  3.766632
60-69           0       NaN
70-79           0       NaN
age_group       0-9     10-19     20-29     30-39     40-49     50-59
movieId                                                              
1          3.919643  4.017857  4.201266  4.302600  4.099602  3.886792
34         3.325843  3.609929  3.957813  4.023684  4.007067  3.948052
50         4.424242  4.680798  4.552430  4.390879  4.291866  4.431373
110        4.192982  4.406250  4.254292  4.152318  4.108635  3.969388
260        4.267327  4.427046  4.572695  4.354633  4.421397  4.250000
...             ...       ...       ...       ...       ...       ...
2916       3.441176  3.605978  3.651054  3.737500  3.820069  3.705882
2987       3.888889  3.716578  3.638607  3.685552  3.695817  3.666667
2997       4.090909  4.202083  4.091703  4.156171  4.064407  4.149425
3175       3.739130  3.656357  3.764253  3.830380  3.815094  3.864407
3578       4.203390  4.231441  4.084291  3.981651  4.147186  3.878788

[50 rows x 6 columns]

output 45 1

1 Measuring Rating Disagreement(计算评分分歧)

假设我们想要找出男性和女性观众分歧最大的电影。一个办法是给mean_ratings加上一个用于存放平均得分之差的列,并对其进行排序:

1
# Male average minus female average per title: positive values mean men rated
# the title higher, negative values mean women did.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

按‘diff’排序即可得到分歧最大且女性观众更喜欢的电影:

1
2
# Ascending by diff: the titles women rated furthest above men come first.
sorted_by_diff = mean_ratings.sort_values('diff')
sorted_by_diff.head(15)
gender F M diff
title
Dirty Dancing (1987) 3.790378 2.959596 -0.830782
Jumpin' Jack Flash (1986) 3.254717 2.578358 -0.676359
Grease (1978) 3.975265 3.367041 -0.608224
Little Women (1994) 3.870588 3.321739 -0.548849
Steel Magnolias (1989) 3.901734 3.365957 -0.535777
... ... ... ...
French Kiss (1995) 3.535714 3.056962 -0.478752
Little Shop of Horrors, The (1960) 3.650000 3.179688 -0.470312
Guys and Dolls (1955) 4.051724 3.583333 -0.468391
Mary Poppins (1964) 4.197740 3.730594 -0.467147
Patch Adams (1998) 3.473282 3.008746 -0.464536

15 rows × 3 columns

对行进行反序操作,并取出前10行,得到的则是男性更喜欢,而女性评价较低的电影:

1
2
# Flip the row order so the largest positive diffs lead, then keep ten rows.
sorted_by_diff.iloc[::-1].head(10)
gender F M diff
title
Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351
Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359
Dumb & Dumber (1994) 2.697987 3.336595 0.638608
Longest Day, The (1962) 3.411765 4.031447 0.619682
Cable Guy, The (1996) 2.250000 2.863787 0.613787
Evil Dead II (Dead By Dawn) (1987) 3.297297 3.909283 0.611985
Hidden, The (1987) 3.137931 3.745098 0.607167
Rocky III (1982) 2.361702 2.943503 0.581801
Caddyshack (1980) 3.396135 3.969737 0.573602
For a Few Dollars More (1965) 3.409091 3.953795 0.544704

如果只是想要找出分歧最大的电影(不考虑性别因素),则可以计算得分数据的方差或标准差:

1
2
# 根据电影名称分组的得分数据的标准差
# Standard deviation of the ratings within each title group.
rating_std_by_title = data.groupby('title')['rating'].std()
1
2
# 根据active_titles进行过滤
# Restrict the result to the titles in active_titles.
rating_std_by_title = rating_std_by_title.loc[active_titles]

不同性别间争议最大的电影

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import pandas as pd
import matplotlib.pyplot as plt

# Locations of the three data files.
movies_file = "C:\\Users\\28696\\Desktop\\人工智能\\ml-1m\\movies.dat"
ratings_file = "C:\\Users\\28696\\Desktop\\人工智能\\ml-1m\\ratings.dat"
users_file = "C:\\Users\\28696\\Desktop\\人工智能\\ml-1m\\users.dat"

# The loading and merging code is omitted from this cell.
# NOTE(review): `lens` is not defined in any cell shown here; presumably the
# merged ratings/users/movies table with movieId, title, gender and rating
# columns -- confirm against the cell that builds it.

# Turn the current index back into ordinary columns (mutates `lens` in place).
lens.reset_index(inplace=True)

# Mean rating per movie, split by gender. No fill_value is used: filling a
# missing side with 0 would make a title rated by only one gender look like
# the biggest disagreement of all.
pivoted = lens.pivot_table(index=['movieId', 'title'], columns='gender', values='rating')

# A difference only exists for titles rated by both genders.
pivoted = pivoted.dropna(subset=['M', 'F'])

# Male average minus female average.
pivoted['diff'] = pivoted['M'] - pivoted['F']

# Index entries of the 50 titles with the largest absolute difference.
most_50 = pivoted['diff'].abs().sort_values(ascending=False).index[:50]

# Horizontal bar chart of those differences, in light blue.
disagreements = pivoted.loc[most_50, 'diff']
light_blue_color = '#ADD8E6'  # light blue hex code
disagreements.sort_values().plot(kind='barh', figsize=(12, 15), color=light_blue_color)
plt.title('Male vs. Female Avg. Ratings\n(Difference > 0 = Favored by Men)', fontsize=16, fontweight='bold')
plt.ylabel('Movie', fontsize=14)
plt.xlabel('Average Rating Difference', fontsize=14)

# Fit the layout to the figure.
plt.tight_layout()

# Render the chart.
plt.show()


output 56 0