Python爬虫以及数据可视化分析(最新B站番剧排行榜数据)

原创已于 2023-06-05 20:52:00 修改 · 6.1k 阅读

158 ·

CC 4.0 BY-SA版权

文章标签：

#python #爬虫 #开发语言

于 2021-12-28 17:11:45 首次发布

python 专栏收录该内容

1 篇文章

订阅专栏

该博客介绍了如何爬取B站番剧排行榜的数据，并进行信息解析和数据可视化分析。通过使用Python的requests、BeautifulSoup、pandas和matplotlib库，实现了对动漫名称、播放量和收藏数的抓取，并存储到Excel文件中，最后绘制了播放量的柱状图和收藏数的折线图进行展示。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

本文将爬取B站番剧排行榜的数据并进行可视化分析，代码已适配最新的B站网页结构。
在这里插入图片描述
网页结构：

直接上代码，代码注释有步骤解释！

import requests
from bs4 import BeautifulSoup
import re
import pandas
import matplotlib.pyplot as plt
import matplotlib.font_manager
# 1、获取网页内容
def get_html(url):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text, or the sentinel string '错误' when the
        request fails (connection problem, timeout, or non-2xx status).
    """
    try:
        # A timeout keeps the script from hanging forever on a dead connection.
        response = requests.get(url, timeout=10)
        # Turn 4xx/5xx status codes into an exception handled below.
        response.raise_for_status()
        # Use the encoding guessed from the body rather than the header default.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as exc:
        # Only network/HTTP failures are mapped to the sentinel; programming
        # errors and KeyboardInterrupt are no longer swallowed.
        print('请求失败:', exc)
        return '错误'

#2、信息解析存储
def _to_wan(text):
    """Convert a count label such as '1.2亿', '345.6万' or '9876' to units of 万."""
    match = re.search(r'\d+(?:\.\d+)?', text)
    if match is None:
        raise ValueError('cannot parse a number from %r' % (text,))
    value = float(match.group())
    if '亿' in text:
        return value * 10000   # 1亿 == 10000万
    if '万' in text:
        return value
    return value / 10000       # plain count with no unit suffix


def save(html):
    """Parse the ranking page, write the data to Excel and return it.

    Titles are read from the first link inside every ``div.info``. Play and
    favorite counts are read from every ``div.detail-state`` and normalised
    to units of 万 (10,000).

    Args:
        html: HTML text of the ranking page.

    Returns:
        A tuple ``(name, play, favorite)`` of three lists: titles as strings,
        play counts and favorite counts as floats in units of 万.

    Raises:
        ValueError: If a count label contains no number.

    Side effects:
        Prints the three lists and writes them to 'Data.xlsx'.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Titles
    name = [str(tag.a.string) for tag in soup.find_all('div', class_='info')]
    print(name)

    # Play and favorite counts live in the same container, so walk it once.
    play = []
    favorite = []
    for tag in soup.find_all('div', class_='detail-state'):
        play_box = tag.find('span', class_='data-box')
        play.append(_to_wan(play_box.get_text()))
        # The favorite count is the node two siblings after the play count
        # (presumably skipping a whitespace text node between the two spans).
        favorite_box = play_box.next_sibling.next_sibling
        favorite.append(_to_wan(favorite_box.get_text()))
    print(play)
    print(favorite)

    # Persist to an Excel workbook
    info = {'动漫名': name, '播放量(万)': play, '收藏数(万)': favorite}
    dm_file = pandas.DataFrame(info)
    dm_file.to_excel('Data.xlsx', sheet_name="动漫数据分析")

    return name, play, favorite
#3、数据可视化分析
def view(info):
    """Show play counts as bars and favorite counts as a line on one chart.

    Args:
        info: Sequence ``(names, plays, favorites)`` of three equally long
            lists, as returned by ``save``.

    Side effects:
        Changes the global matplotlib font settings and opens a plot window.
    """
    dm_name, dm_play, dm_favorite = info[0], info[1], info[2]

    # Use a font with CJK glyphs so Chinese labels render, and keep the
    # minus sign displayable with that font.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Left axis: play counts as bars
    fig, ax1 = plt.subplots()
    bars = ax1.bar(dm_name, dm_play, color='red', label='播放量')
    ax1.set_title('番剧播放量与收藏数的数据分析')
    ax1.tick_params(labelsize=6)
    ax1.set_xlabel('番剧名')
    ax1.set_ylabel('播放量')
    ax1.tick_params(axis='x', labelrotation=90, labelcolor='#7b68ee')

    # Right axis shares the x axis; plotting against the same names keeps
    # each point aligned with its bar.
    ax2 = ax1.twinx()
    line, = ax2.plot(dm_name, dm_favorite, color='yellow', label='收藏数')
    ax2.set_ylabel('收藏数（万）')

    # Build the legend from the real artists instead of plotting dummy
    # points, which would add a bogus value to the right-hand axis range.
    ax2.legend(handles=[line, bars])
    plt.show()

def main():
    """Fetch the bangumi ranking page, store the parsed data and plot it."""
    # Direct address of the ranking page (no link-rewriting proxy prefix).
    url = 'https://www.bilibili.com/v/popular/rank/bangumi'
    html = get_html(url)
    print(html)  # debug dump of the fetched page
    # get_html signals failure with this sentinel; stop here so an empty
    # result does not overwrite Data.xlsx or open an empty chart.
    if html == '错误':
        print('获取网页失败，已终止后续解析与绘图')
        return
    info = save(html)
    view(info)


if __name__ == '__main__':
    main()

运行成功！
在这里插入图片描述