# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
#from lxml.html import fromstring, tostring
# Site endpoints: site root (used to build absolute links) and the
# free-novel listing page that get_html() scrapes.
# NOTE: the original used curly "smart quotes" (a SyntaxError) and a
# paste-error double assignment `headers = headersheaders = {`; both fixed.
final_url = "https://www.readnovel.com"
url = "https://www.readnovel.com/free"

# Request headers for www.readnovel.com.  The Cookie carries a session
# CSRF token captured from a browser — it will expire; refresh as needed.
headers = {
    'Referer': 'https://www.readnovel.com/free/all',
    'Connection': 'keep-alive',
    'Host': 'www.readnovel.com',
    'Cookie': '_csrfToken=Zjv3RvsiSXrGRpwLaoJTTdiOs4g3rdgAUSOIaSkz; newstatisticUUID=1606046171_1058262578; qdrs=0%7C3%7C0%7C0%7C1; qdgd=1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0',
}
class Spider(object):
    """Scrape www.readnovel.com's free-novel listing and print each
    novel title together with the absolute URL behind its jump button.

    Uses the module-level ``url`` / ``headers`` globals for the listing
    request.  Original code used curly "smart quotes" on several string
    literals (a SyntaxError) — fixed to plain ASCII quotes.
    """

    def get_html(self):
        """Fetch the free-novel listing page and, for every non-empty
        <a> link inside an <h4>, delegate to :meth:`get_url`."""
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'  # force UTF-8 decode of the page body
        html = response.text
        soup = BeautifulSoup(html, "html.parser")
        for i in soup.find_all(name='h4'):
            for a in i.find_all("a", href=True):
                if a.get_text(strip=True):  # skip anchors with no visible title
                    m = a['href']            # site-relative link to the novel page
                    l = a.string             # novel title text
                    n = 'https://www.readnovel.com' + m
                    self.get_url(l, n)

    def get_url(self, l, n):
        """Fetch one novel page *n* and print its title *l* with the
        absolute URL taken from the 'pink-btn J-getJumpUrl' anchor.

        The button's href is protocol-relative (starts with "//"), so
        'https:' is prefixed to make it absolute.
        """
        # NOTE: removed unused local `src_url = [n]` from the original.
        response = requests.get(n)
        response.encoding = 'utf-8'
        html = response.text
        soup = BeautifulSoup(html, "html.parser")
        for a in soup.find_all(name='p', attrs='btn'):
            for b in a.find_all(name='a', attrs='pink-btn J-getJumpUrl'):
                conan = b['href']
                vivian = 'https:' + conan
                print(l, vivian)
# Entry point: guard the scrape so importing this module does not fire
# network requests (the original ran unconditionally at import time).
if __name__ == "__main__":
    spider = Spider()
    spider.get_html()