I won't go over creating the Scrapy project itself here.
When I created the database tables I did not use foreign keys, so the book metadata and the chapter content are stored in separate tables, linked only by the novel_id field.
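For reference, here is a minimal sketch of the two tables. The column names come from the item fields further down; the column types and the UNIQUE keys are my own assumptions (the unique keys are what make the ON DUPLICATE KEY UPDATE clauses in pipelines.py meaningful), so adjust them to your needs:

CREATE TABLE books (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    author VARCHAR(255),
    type VARCHAR(64),
    state VARCHAR(64),
    intro TEXT,
    cover VARCHAR(255),
    novel_id VARCHAR(32),
    UNIQUE KEY uk_novel (novel_id)
) DEFAULT CHARSET=utf8;

CREATE TABLE section (
    id INT AUTO_INCREMENT PRIMARY KEY,
    section_title VARCHAR(255),
    content MEDIUMTEXT,
    section_id VARCHAR(32),
    novel_id VARCHAR(32),
    UNIQUE KEY uk_section (novel_id, section_id)
) DEFAULT CHARSET=utf8;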
spider.py
# -*- coding: utf-8 -*-
import scrapy

from ..items import QiqixiaoshuoItem, QiQiXiaoSshuoItem


class QqSpider(scrapy.Spider):
    name = 'qq'
    allowed_domains = ['qq717.com']
    start_urls = ['https://www.qq717.com/']

    def parse(self, response):
        # Grab every category link from the nav bar; goal: reach every novel on the site
        classify_url = response.xpath("//div[@class='nav']//li/a/@href").extract()[1:-3]
        for classify_title in classify_url:
            url = 'https://www.qq717.com' + classify_title
            yield scrapy.Request(url=url, callback=self.novel_fiction, dont_filter=True,
                                 meta={"each_url": url})

    def novel_fiction(self, response):
        each_url = response.meta['each_url']
        print("Requesting %s ***************************************" % each_url)
        # Grab the detail-page link of every novel in this category
        detail_url_list = response.xpath("//div[@class='l']//li/span[@class='s2']/a/@href").extract()
        for detail_url in detail_url_list:
            url = 'https://www.qq717.com' + detail_url
            yield scrapy.Request(url=url, callback=self.detail_content_list, dont_filter=True,
                                 meta={"each_url": url})

    def detail_content_list(self, response):
        item = QiqixiaoshuoItem()
        each_url = response.meta['each_url']
        item['title'] = response.xpath("//div[@id='info']/h1/text()").extract_first()
        item['author'] = response.xpath("//div[@id='info']/p[1]/text()").extract_first().split(":")[-1]
        item['type'] = response.xpath("//div[@class='con_top']/a[2]/text()").extract_first()
        item['state'] = response.xpath("//div[@id='info']/p[2]/text()").extract_first().split(":")[-1].split(",")[0]
        item['intro'] = response.xpath("//div[@id='intro']/p/text()").extract_first()
        item['cover'] = response.xpath("//div[@id='fmimg']/img/@src").extract_first()
        # The novel id is the second-to-last path segment of the detail URL
        item['novel_id'] = each_url.split("/")[-2]
        yield item
        # Grab the URL of every chapter; goal: crawl the full chapter content next
        section_url_list = response.xpath("//div[@id='list']//dd/a/@href").extract()
        for section_url in section_url_list:
            url = 'https://www.qq717.com' + section_url
            yield scrapy.Request(url=url, callback=self.section_list, dont_filter=True,
                                 meta={"each_url": url})

    def section_list(self, response):
        item = QiQiXiaoSshuoItem()
        each_url = response.meta['each_url']
        item['section_title'] = response.xpath("//div[@class='bookname']/h1/text()").extract_first()
        content_list = response.xpath("//div[@id='content']/text()").extract()
        content_ = ''
        for section in content_list:
            # Swap the site's watermark for our own, drop the curly quotes, and turn the
            # full-width indentation (two ideographic spaces) into paragraph breaks
            content_ += section.replace("哽噺繓赽奇奇小説蛧|w~w~w.qq717.com", "更 新 最 快 | w~w~w.qy0218.com")\
                .replace("“", "").replace("”", "")\
                .replace("\u3000\u3000", "\n\n\u3000\u3000")
        item['content'] = content_
        # Chapter id is the file name of the chapter URL; novel id is the directory above it
        item['section_id'] = each_url.split("/")[-1].split(".")[-2]
        item['novel_id'] = each_url.split("/")[-2]
        yield item
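The novel_id / section_id slicing above works purely on the URL path. A tiny check with a made-up URL (the path shape is my assumption; only the slicing is taken from the spider):

# Hypothetical chapter URL, only to illustrate the slicing used in the spider
each_url = 'https://www.qq717.com/0_123/4567.html'
print(each_url.split("/")[-2])                 # -> '0_123'  (novel_id)
print(each_url.split("/")[-1].split(".")[-2])  # -> '4567'   (section_id)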
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for qiqixiaoshuo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'qiqixiaoshuo'
SPIDER_MODULES = ['qiqixiaoshuo.spiders']
NEWSPIDER_MODULE = 'qiqixiaoshuo.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'qiqixiaoshuo (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'qiqixiaoshuo.middlewares.QiqixiaoshuoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'qiqixiaoshuo.middlewares.QiqixiaoshuoDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'qiqixiaoshuo.pipelines.QiqixiaoshuoPipeline': 300,
}
MYSQL_HOST = "127.0.0.1"
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWORD = "123456"
MYSQL_DB = "novel"
MYSQL_CHARSET = 'utf8'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class QiqixiaoshuoItem(scrapy.Item):
    # One item per novel: the book-level metadata
    title = scrapy.Field()
    author = scrapy.Field()
    type = scrapy.Field()
    state = scrapy.Field()
    intro = scrapy.Field()
    cover = scrapy.Field()
    novel_id = scrapy.Field()


class QiQiXiaoSshuoItem(scrapy.Item):
    # One item per chapter: the chapter text plus the ids linking it back to its novel
    section_title = scrapy.Field()
    content = scrapy.Field()
    section_id = scrapy.Field()
    novel_id = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from MySQLdb.cursors import DictCursor
from twisted.enterprise import adbapi


class QiqixiaoshuoPipeline(object):
    def __init__(self, dbpool):
        # Keep a reference to the twisted connection pool
        self.dbpool = dbpool

    @classmethod
    def from_crawler(cls, crawler):
        args = dict(host=crawler.settings.get("MYSQL_HOST"), port=crawler.settings.get("MYSQL_PORT"),
                    user=crawler.settings.get("MYSQL_USER"), db=crawler.settings.get("MYSQL_DB"),
                    passwd=crawler.settings.get("MYSQL_PASSWORD"), charset=crawler.settings.get("MYSQL_CHARSET"),
                    cursorclass=DictCursor)
        # Create a connection pool; the first argument is the name of the DB-API driver module.
        # The pool manages several database connections that are used from worker threads.
        dbpool = adbapi.ConnectionPool("MySQLdb", **args)
        print('Connected to MySQL **************************************************************', dbpool)
        return cls(dbpool)

    def insert_sql(self, cursor, item):
        # The books insert (not enabled here) looks like this:
        # inser_sql = "insert into books(title, author, type, state, intro, cover, novel_id) " \
        #             "values ('%s', '%s', '%s', '%s', '%s', '%s', '%s') " \
        #             "on duplicate key update title=(title)" % (item['title'], item['author'], item['type'],
        #                                                        item['state'], item['intro'], item['cover'],
        #                                                        item['novel_id'])
        inser_sql = ("insert into section(section_title, content, section_id, novel_id) "
                     "values (%s, %s, %s, %s) on duplicate key update section_title=section_title")
        # Parameterized values, so quotes inside the chapter text cannot break the statement
        cursor.execute(inser_sql, (item['section_title'], item['content'],
                                   item['section_id'], item['novel_id']))

    def process_item(self, item, spider):
        """
        runInteraction() hands insert_sql over to one of the pool's worker threads,
        so the database insert runs asynchronously and does not block the crawl.
        """
        result = self.dbpool.runInteraction(self.insert_sql, item)
        # If the insert fails, the errback below is called with the failure
        result.addErrback(self.error_info)
        return item

    def error_info(self, failure):
        print("Insert failed:", failure)
That is all the code for crawling the whole site. The approach matters more than the code:
1: Get the link to every novel category
2: From each category, get the detail-page link of every novel
3: On the detail page, extract the fields you need, e.g. title, author, type, intro, cover
4: Get the link to every chapter of each novel
5: Crawl the chapter content
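To kick the crawl off, running scrapy crawl qq from the project root is enough. If you prefer a plain Python entry point, a minimal runner sketch (the file name run.py is just for illustration) could look like this:

# run.py - minimal runner sketch, placed next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings (pipelines, MySQL config, HTTP cache, ...)
process = CrawlerProcess(get_project_settings())
process.crawl('qq')   # the spider's name attribute
process.start()       # blocks until the crawl finishes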