Python spider: crawling the latest free novels

# -*- coding: utf-8 -*-

######## coding=‘gbk’
import requests
from bs4 import BeautifulSoup

#from lxml.html import fromstring, tostring
# Base address of the site; relative book links are resolved against it.
final_url = "https://www.readnovel.com"
# Listing page of free novels that the spider starts from.
url = "https://www.readnovel.com/free"
# Request headers sent with the listing-page request.
# NOTE(review): the Cookie is a captured session value (CSRF token, stats
# UUID) and will presumably expire -- refresh it if requests start failing.
headers = {
    "Referer": "https://www.readnovel.com/free/all",
    "Connection": "keep-alive",
    # Must name the same host as the URLs above.
    "Host": "www.readnovel.com",
    "Cookie": "_csrfToken=Zjv3RvsiSXrGRpwLaoJTTdiOs4g3rdgAUSOIaSkz; newstatisticUUID=1606046171_1058262578; qdrs=0%7C3%7C0%7C0%7C1; qdgd=1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0",
}

class Spider(object):
    """Crawl the free-novel listing page and print each book's jump link.

    Uses the module-level ``url``, ``headers`` and ``final_url`` settings
    together with the ``requests`` and ``BeautifulSoup`` imports.
    """

    def get_html(self):
        """Fetch the listing page at ``url`` and follow every titled link.

        Each ``<a href=...>`` inside an ``<h4>`` element that has non-empty
        text is turned into an absolute address by prefixing ``final_url``
        and passed, with the anchor's string, to :meth:`get_url`.
        """
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, "html.parser")

        for heading in soup.find_all(name='h4'):
            for link in heading.find_all("a", href=True):
                # Anchors without any visible text are ignored.
                if not link.get_text(strip=True):
                    continue
                # The href is appended to the site root, so it is expected
                # to be a site-relative path -- TODO confirm.
                self.get_url(link.string, final_url + link['href'])

    def get_url(self, l, n):
        """Fetch the page at ``n`` and print its jump link(s) next to ``l``.

        Args:
            l: Label printed in front of every link found (the anchor
                string passed on by :meth:`get_html`).
            n: Absolute URL of the page to fetch.

        Raises:
            KeyError: If a matching ``<a>`` element has no ``href``.
        """
        response = requests.get(n)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, "html.parser")

        # Look inside every <p class="btn"> for the
        # <a class="pink-btn J-getJumpUrl"> buttons.
        for paragraph in soup.find_all(name='p', class_='btn'):
            for button in paragraph.find_all(
                    name='a', class_='pink-btn J-getJumpUrl'):
                # Only the scheme is prepended, so the href is presumably
                # protocol-relative ("//host/path") -- verify on the page.
                print(l, 'https:' + button['href'])

# Script entry point: create the spider and start from the listing page.
# These statements sit at module level, so they also run on import.
spider = Spider()
spider.get_html()

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值