# Day27 - 招聘网站爬虫 (job-site crawler)
import csv
from json import loads
from re import search
from threading import Lock, Thread

import requests
# headers = {
# 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
# }
# url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,数据分析,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=&u_atoken=adf13136-66a6-438b-9084-60321f9f2eee&u_asession=01dZTjxedE8HIGhBp80nRVha9FBk6MpBZDX2nX68KwBFbzd7HJWyktsNvMFHI7EhxVX0KNBwm7Lovlpxjd_P_q4JsKWYrT3W_NKPr8w6oU7K8AQi7kUQYiQfwHKX5Q9liLXSAqQczIGNtz27flH3K2Z2BkFo3NEHBv0PZUm6pbxQU&u_asig=0500pkkgPKvXrghAk8gHNczPTTQB7A7sWn9_I2eE73e_HRQu1-ufFbhUIOob6C0vwjfq5WC5d0L_WOaxWbY-4XcsDPQeT8y33oX2m-64U5kNHbnnx1W7rz7eAt4JJ3bejy2yS9tCAN8QuBKmeMBytYYnDfhhYDiiwgfxrEXd30S479JS7q8ZD7Xtz2Ly-b0kmuyAKRFSVJkkdwVUnyHAIJzQ0gHYCYZKZctXyVP3bczkY3UZz8OXh3zTq-5OlgRcZQhCNwLPH1kfWdjBCA3eJlR-3h9VXwMyh6PgyDIVSG1W8Kkds33EqsKILzoyII03KfN-ZRqdeyIaYiB9G-UDNt8GFQ99wO58eEU8LrmO2DDMJmK7ciD3oihBMkyg5L6l7SmWspDxyAEEo4kbsryBKb9Q&u_aref=Q2JhZ4rMV755zEKUVWqMs6sYCbY%3D'
# response = requests.get(url, headers=headers)
#
# print(response.text)
# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would block its worker thread forever.
_REQUEST_TIMEOUT = 10

# Request headers sent with every page fetch.
# NOTE(review): the cookie is a captured browser session hard-coded in source.
# It will expire, and it should not live in version control -- consider
# loading it from configuration instead. It is reproduced unchanged here.
_HEADERS = {
    'cookie': (
        '_uab_collina=164990040167146971529129; '
        'guid=067647ab5d548b3971b0bfba1cad4df3; '
        'nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; '
        'ps=needv%3D0; '
        '51job=cuid%3D176855974%26%7C%26cusername%3DBcw%252BUGzoZgSTiSYGR36lUFU8mIXHbCvTjACtOvGskY0%253D%26%7C%26cpassword%3D%26%7C%26cname%3DUr1R%252F28qDI3ztjpgcfNZJA%253D%253D%26%7C%26cemail%3D%26%7C%26cemailstatus%3D0%26%7C%26cnickname%3D%26%7C%26ccry%3D.0Z1sOJ9%252FwKQE%26%7C%26cconfirmkey%3D%25241%2524LtyC2JFi%2524EfTYJ0TfigL5FDVFFqmJ90%26%7C%26cautologin%3D1%26%7C%26cenglish%3D0%26%7C%26sex%3D0%26%7C%26cnamekey%3D%25241%2524Kpx16vT2%2524MfkntQXuO09xEQzrrH6if.%26%7C%26to%3D324530beb5a2c018d4c686d958a028f0628459b3%26%7C%26; '
        'search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60090200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; '
        'privacy=1656464056; '
        'slife=lastlogindate%3D20220629%26%7C%26; '
        'acw_tc=2f624a3b16564669166851998e4156508828a0a020766ad748916dbb793ea5; '
        'ssxmod_itna=eqmhY58K7K0Iq0dD=wB0xf2=1GkDG2xexQRMM4GXhjYDZDiqAPGhDC38z4D5T4bOD2uw3hQUi0v3sG7EngGefY0icXYD84i7DKqibDCqD1D3qDkW+Y9lODADi3DEDDmXDmqi8DIu=DfRftDtSU6DDU7RLLZAuhxD2o9iuU0S4DH+GGoA8+Y/hvYRGPKKGvqIDm5SmxKDEPNIGhtNYqNBOGoW0PdQhHDGSrdCB4xD; '
        'ssxmod_itna2=eqmhY58K7K0Iq0dD=wB0xf2=1GkDG2xexQRMqA6W4vxD/7RtDF2u7+FrkDDvHD6QGKYkX3Fl5b8t4tQgr+nte6BGQtp+E3=KWZmhKi=ZfgE5AHY7YUDx8kOqCb=sz+cUvUnRLwP7K+ATKjgwx72Gx0prtHr0QE6WxXAvit7nDcDexS6p5+auNH4mzVlg7KZ9InlTtup8k0POjQjarLfm7=4u6gCyi4OPFU3qCel40d7j8+rir=xQxP4wLi0euDpYCQGH9//RATnB2f9DCVW/i=GCkX1vaPuBercBjShpScwvnKPFGR4t8Qhyipd=h0D6+mhyQGnmq/pixKIF6YFZ+80D9ip4amMliLmIhGiIciBDwhloAoPvrpyDPcOrK824xiOSr17200IMmr1ObOAIRx++lGT4jVZYUhG3cDLQxnGNgmbUlquxYd8c0nbaOKcDjd6PvgTngf0n0G3I4=NPmFx9RaR=hYp8PeH4ig8pqmIHEav00hnehAtc6yLDvX=IzijMgqUxj8xvIij+BjUOBL0tW6GQuICp6L6eqCjkxBAj3fCd9nmWkRWDG2KD3C2XY8xj2hGwjw=RKb6ioUviYECK/n=lh2f0lW9w3U4nw5Wtry06o0qhU5CCXv/=fUjKDDFqD+x4xD=='
    ),
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
}

# get_page_data() runs in several threads that all write through the same
# module-level ``writer``; this lock keeps each page's rows contiguous in the
# output file.
_WRITE_LOCK = Lock()


def _parse_jobs(html):
    """Extract one CSV row per job posting from a search-result page.

    The page embeds its results as a JSON object assigned to
    ``window.__SEARCH_RESULT__`` inside a ``<script>`` tag; the postings are
    read from that object's ``engine_jds`` list.

    Args:
        html: Decoded text of the search-result page.

    Returns:
        A list of rows, each
        ``[job_name, salary, work_area, experience, education, company_name]``.

    Raises:
        ValueError: If the embedded ``window.__SEARCH_RESULT__`` payload is
            not present in ``html``.
    """
    match = search(r'window.__SEARCH_RESULT__ = (.+?)</script>', html)
    if match is None:
        raise ValueError('window.__SEARCH_RESULT__ payload not found in page')

    rows = []
    for job in loads(match.group(1))['engine_jds']:
        attributes = job['attribute_text']
        # Experience is read from index 1 and education from the last entry;
        # fall back to an empty cell instead of raising when the list is too
        # short, so one sparse posting does not discard the whole page.
        experience = attributes[1] if len(attributes) > 1 else ''
        education = attributes[-1] if attributes else ''
        rows.append([
            job['job_name'],
            job['providesalary_text'],
            job['workarea_text'],
            experience,
            education,
            job['company_name'],
        ])
    return rows


def get_page_data(page):
    """Fetch one page of search results and append its rows to the CSV.

    Rows are written through the module-level ``writer`` object, which must
    have been created before this function is called.

    Args:
        page: Page number substituted into the search URL.

    Raises:
        requests.RequestException: On a network error, a timeout, or a
            non-success HTTP status.
        ValueError: If the page does not contain the embedded result payload.
    """
    # NOTE(review): the host carries a "siteproxy.ruqli.workers.dev:443/" prefix in
    # front of the real site address. It looks like a URL-rewriting artifact,
    # and it means the session cookie is sent to that host -- confirm whether
    # the URL should start with "https://search.51job.com/" instead.
    # The query previously read "cotype=99°reefrom=99" (a mangled
    # "&degreefrom"); the parameter separator and name are restored here.
    url = (
        'https://search.51job.com/list/'
        f'000000,000000,0000,00,9,99,数据分析,2,{page}.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    # Decode with the encoding detected from the body so it matches the
    # page's actual encoding.
    response.encoding = response.apparent_encoding

    page_data = _parse_jobs(response.text)
    with _WRITE_LOCK:
        writer.writerows(page_data)
if __name__ == '__main__':
    # get_page_data() writes through the module-level name ``writer``, so it
    # is bound here at module scope before any worker thread starts.
    with open('files/数据分析.csv', 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        # Header row; the stray leading space in the education column name
        # has been removed so it matches the other column names.
        writer.writerow(['工作名称', '薪资待遇', '工作地点', '经验', '学历', '公司'])

        # One thread per result page (pages 1 through 10).
        workers = [
            Thread(target=get_page_data, args=(page,))
            for page in range(1, 11)
        ]
        for worker in workers:
            worker.start()
        # Wait for every page inside the ``with`` block: the file must stay
        # open until the last worker has written its rows, and is then
        # flushed and closed deterministically.
        for worker in workers:
            worker.join()