爬取某些网页时,请求中必须携带 Cookie 才能正常访问。
携带 Cookie 一般有两种方式:
手动设置 Cookie(将 Cookie 字符串直接写入请求头):
# Manual approach: send the cookie as an ordinary request header.
headers = {
    # Browser-like User-Agent; split across lines only for readability.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/73.0.3683.86 Safari/537.36"
    ),
    # Left empty here -- presumably to be filled with the cookie string
    # for the target site (TODO confirm where the value comes from).
    "cookie": "",
}
自动获取 Cookie(由 Session 对象自动保存服务器返回的 Cookie,并在后续请求中携带):
# Automatic approach: a requests.Session stores cookies set by the server and
# sends them back on later requests made through the same session.
# Example: fetch the Xueqiu home page, then its hot-list JSON endpoint.
import requests  # imported here because no import block is visible in the file

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/73.0.3683.86 Safari/537.36"
    ),
}

# NOTE(review): both URLs go through siteproxy.ruqli.workers.dev rather than
# xueqiu.com directly -- confirm this is intended.
first_url = 'https://xueqiu.com/'
# Hot-list JSON endpoint.
url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=110605&size=15'

# The context manager closes the session's pooled connections when the block
# exits, even if one of the requests raises.
with requests.Session() as s:
    # Set the headers once on the session so every request carries them.
    s.headers.update(headers)

    # Request the home page first so the session can pick up any cookies the
    # server sets; the response body itself is not needed.
    # A timeout keeps a stalled server from hanging the script forever.
    s.get(first_url, timeout=10)

    # Same session, so the stored cookies are sent along with this request.
    response = s.get(url, timeout=10)
    # Fail with a clear HTTPError instead of trying to parse an error page.
    response.raise_for_status()
    json_text = response.json()