什么是 Selenium
Selenium 是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,就像真正的用户在操作一样。
用 Selenium 模拟浏览器进行操作,能有效地反反爬虫(即绕过网站的反爬虫机制)。
这里用 Selenium + Chrome 爬取百聘和 BOSS直聘。
爬取BOSS直聘
完整代码:
import json
import time #时间模块,主要是用.sleep防止访问过快导致ip被封
import xlrd #操作excel模块
import xlwt
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui,expected_conditions
# Module-level accumulator: get_job() appends job-detail links here.
urls = []

# Shared browser session, driven through the chromedriver binary at the
# given path (resolved by Selenium; a bare file name as written here).
chrome = Chrome(
    executable_path='chromedriver.exe',
)
def get_job():
    """Collect job-detail URLs from the current result page, then page forward.

    Scrolls the page, waits for the ``job-box`` elements to become visible,
    appends the ``href`` of every job entry to the module-level ``urls`` list
    and clicks the ``.next`` element.  While fewer than 10 URLs have been
    collected it recurses to process the following page; once 10 or more are
    available it hands the list to ``write_job``.

    Returns:
        0 when the page has no ``.next`` element, otherwise ``None``.
    """
    # Scroll down a little before reading the list (presumably so the job
    # list is rendered/in view -- confirm against the target page).
    chrome.execute_script('var q=document.documentElement.scrollTop=500')

    # Wait up to 60 s until all elements with class "job-box" are visible.
    ui.WebDriverWait(chrome, 60).until(
        expected_conditions.visibility_of_all_elements_located(
            (By.CLASS_NAME, 'job-box')
        )
    )

    for item in chrome.find_elements(By.CSS_SELECTOR, '.job-list>ul>li'):
        # Throttle per-entry access so requests are not issued too quickly.
        time.sleep(1)
        link = item.find_element(By.CSS_SELECTOR, '.job-name a')
        urls.append(link.get_attribute('href'))

    # Scroll further down (presumably to bring the pager into view).
    chrome.execute_script('var q=document.documentElement.scrollTop=3500')
    time.sleep(1)

    # find_elements() returns an empty list when nothing matches, whereas
    # find_element() raises NoSuchElementException -- so only the plural form
    # can actually reach the "no next page" branch.
    next_buttons = chrome.find_elements(By.CSS_SELECTOR, '.next')
    if not next_buttons:
        # NOTE(review): URLs gathered so far are not written out on this
        # path; confirm whether write_job(urls) should be called here.
        return 0

    next_buttons[0].click()
    if len(urls) < 10:
        get_job()
        time.sleep(1)
    else:
        write_job(urls)
def write_job(urls):
f = xlwt.Workbook()
sheet1 = f.add_sheet('python', cell_overwrite_ok=True)
i = 1
sheet1.write_merge(0, 0, 0, 2, '职位名称')
sheet1.write_merge(0, 0, 3, 5, '职位薪资')
sheet1.write_merge(0, 0, 6,