# Requests page parsing (practice) / Requests页面解析(练习)
import requests
from lxml import etree
from h_selenium.lagou_headers import HEADERS
import re
from time import sleep
from random import randint
def request_list_url(keyword='python', page=1):
    """Query Lagou's position-search endpoint and return the position ids.

    The city is fixed to Beijing (percent-encoded in the query string).

    Args:
        keyword: Search keyword sent as the ``kd`` form field.
        page: Result page number sent as the ``pn`` form field.

    Returns:
        A list with the ``positionId`` of every entry found under
        ``content -> positionResult -> result`` in the JSON response.

    Raises:
        requests.RequestException: on timeout, connection or HTTP error.
        KeyError: if the JSON body lacks the expected nesting.
    """
    url = ('https://www.lagou.com/jobs/positionAjax.json'
           '?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false')
    data = {'first': 'false', 'pn': str(page), 'kd': keyword}
    # Without a timeout requests waits forever on a stalled connection.
    resp = requests.post(url=url, data=data, headers=HEADERS, timeout=10)
    # Fail on 4xx/5xx here instead of on a confusing JSON/KeyError later.
    resp.raise_for_status()
    result_dict = resp.json()
    positions = result_dict['content']['positionResult']['result']
    return [position['positionId'] for position in positions]
def positions_html(url_id):
    """Download one Lagou job-detail page and parse it with lxml.

    Args:
        url_id: Position id, interpolated into ``/jobs/<id>.html``.

    Returns:
        The tree produced by ``etree.HTML`` from the response text.

    Raises:
        requests.RequestException: on timeout, connection or HTTP error.
    """
    url = 'https://www.lagou.com/jobs/{}.html'.format(url_id)
    # Without a timeout requests waits forever on a stalled connection.
    resp = requests.get(url=url, headers=HEADERS, timeout=10)
    # Do not try to parse an error page as if it were a job posting.
    resp.raise_for_status()
    return etree.HTML(resp.text)
def data_massage(html):
    """Extract the main fields of a job-detail page and print them.

    Args:
        html: Parsed page exposing ``cssselect`` and ``xpath`` (as returned
            by ``positions_html``).

    Returns:
        None. The title, summary fields, advantage, description and address
        are written to stdout, separated by dashed rules.

    Raises:
        IndexError: if the title or advantage selector matches nothing.
        ValueError: if the request summary does not contain exactly five
            ``span`` elements (the last one is discarded, four are unpacked).
    """
    def format_tool(_str):
        # Remove every whitespace character, space and '/'.
        # An element's ``.text`` can be None; treat that as empty text
        # instead of letting re.sub raise TypeError.
        return re.sub(r'[\s /]', '', _str or '')

    title = html.cssselect('span.name')[0].text  # job title
    request_spans = html.cssselect('dd.job_request p span')
    describe = [format_tool(span.text) for span in request_spans]
    # The last span is dropped; the remaining four are the summary fields.
    salary, city, job_years, edu = describe[:-1]
    print(title)
    print(salary, city, job_years, edu)
    print('-' * 30)

    advantage = html.cssselect('dd.job-advantage p')[0].text  # job perks

    # Full job description: all paragraph text nodes, joined and cleaned.
    description_li = html.xpath('//dd[@class="job_bt"]//div//p/text()')
    description = format_tool(''.join(description_li))

    # Work address: the final two text nodes are discarded
    # (presumably trailing link text -- TODO confirm against the page).
    address_li = html.xpath('//div[@class="work_addr"]//text()')[:-2]
    address = format_tool(''.join(address_li))

    print(advantage)
    print('-' * 30)
    print(description)
    print('-' * 30)
    print(address)
    print('=' * 60 + '\n')
def main():
    """Fetch the position id list, then download and print each posting.

    A random 3-9 second pause is inserted between consecutive detail-page
    requests; no pause follows the final posting.
    """
    for index, url_id in enumerate(request_list_url()):
        if index:
            # Space out the detail requests; skipped before the first one
            # so the run does not end (or start) with a pointless wait.
            sleep(randint(3, 9))
        data_massage(positions_html(url_id))
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()