import random
import time
from pathlib import Path
import pandas as pd
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def get_element_text(element, css_selector):
    """
    Find a descendant of *element* by CSS selector and return its text.

    :param element: parent element to search under
    :param css_selector: CSS selector for the target element
    :return: the element's text with surrounding whitespace stripped,
             or None when no matching element exists
    """
    try:
        target = element.find_element(By.CSS_SELECTOR, css_selector)
    except NoSuchElementException:
        return None
    return target.text.strip()
def get_element_attribute(element, css_selector, attribute):
    """
    Find a descendant of *element* by CSS selector and return one of its
    attributes.

    :param element: parent element to search under
    :param css_selector: CSS selector for the target element
    :param attribute: name of the attribute to read
    :return: the attribute's value, or None when no matching element exists
    """
    try:
        target = element.find_element(By.CSS_SELECTOR, css_selector)
    except NoSuchElementException:
        return None
    return target.get_attribute(attribute)
def process_subject(subject, category_name):
    """
    Extract the fields of interest from a single book entry element.

    Missing sub-elements yield None values (the lookup helpers swallow
    NoSuchElementException), so every key is always present in the result.

    :param subject: the book entry element
    :param category_name: name of the book category this entry belongs to
    :return: dict with the book's category, URLs, title, publisher,
             rating, rating count, plot summary and purchase info
    """
    return {
        'category_name': category_name,
        'url': get_element_attribute(subject, '.pic > .nbg', 'href'),
        'img_url': get_element_attribute(subject, '.pic > .nbg > img', 'src'),
        'name': get_element_attribute(subject, '.info > h2 > a', 'title'),
        'pub': get_element_text(subject, '.info > .pub'),
        'rating': get_element_text(subject, '.info > .star > .rating_nums'),
        'rating_count': get_element_text(subject, '.info > .star > .pl'),
        'plot': get_element_text(subject, '.info > p'),
        'buy_info': get_element_text(subject, '.info > .ft .buy-info > a'),
    }
def process_category(driver, category_name, category_href):
    """
    Crawl every page of one book category and append the extracted book
    records to a per-category CSV file.

    Pagination stops at the first page that contains no ``subject-item``
    element. Each page's rows are flushed to disk immediately, so a crash
    mid-category keeps the pages already fetched.

    :param driver: Selenium browser driver
    :param category_name: name of the book category (also the CSV filename)
    :param category_href: base URL of the category listing
    """
    print(f"开始处理分类: {category_name},链接: {category_href}")
    # The output file is the same for every page of this category:
    # compute the path and create its directory once, not per page.
    file_path = Path('./原始数据层/图书分类数据集') / f'{category_name}.csv'
    file_path.parent.mkdir(parents=True, exist_ok=True)
    page = 0
    while True:
        # Douban paginates in steps of 20 items via the ``start`` parameter.
        url = category_href + f'?start={page * 20}&type=T'
        print(f"正在访问页面: {url}")
        driver.get(url)
        # Randomized delay to avoid hammering the server.
        time.sleep(random.uniform(1, 3))
        try:
            driver.find_element(By.CLASS_NAME, "subject-item")
        except NoSuchElementException:
            # No book entries on this page: past the last page, stop.
            print(f"分类 {category_name} 页面加载完成,共处理 {page} 页")
            break
        # Wait until all book entry elements are present in the DOM.
        subject_list = WebDriverWait(driver, random.uniform(10, 20)).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "subject-item"))
        )
        data_list = [process_subject(subject, category_name) for subject in subject_list]
        page += 1
        # Write the header only on the first write; afterwards append rows.
        write_header = not file_path.exists()
        pd.DataFrame(data_list).to_csv(
            file_path, mode='a', header=write_header, index=False
        )
        time.sleep(random.uniform(1, 3))
def main():
    """
    Entry point: read the category-label CSV, start an Edge browser, and
    crawl every category whose output file does not already exist.

    The driver is quit in a ``finally`` block so the browser process is
    not leaked when a page load or parse raises mid-crawl.
    """
    # Category labels: expected to provide 'name' and 'href' columns.
    df = pd.read_csv('./原始数据层/图书分类标签.csv')
    driver = webdriver.Edge()
    try:
        time.sleep(random.uniform(1, 3))
        driver.get('https://book.douban.com/tag/小说')
        # 60 s pause — presumably leaves time for a manual login/captcha
        # in the opened browser window; TODO confirm.
        time.sleep(60)
        for _, category in df.iterrows():
            category_name = category['name']
            category_href = category['href']
            file_dir = f'./原始数据层/图书分类数据集/'
            file_name = f'{category_name}.csv'
            file_path = Path(file_dir + file_name)
            file_path.parent.mkdir(parents=True, exist_ok=True)
            # An existing file means this category was already crawled.
            if file_path.exists():
                print(f"文件已存在,跳过:{file_dir + file_name}")
                continue
            process_category(driver, category_name, category_href)
    finally:
        # Always release the browser, even if the crawl raised.
        driver.quit()
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()