python爬取boss直聘某职位信息(selenium+chrome的应用)

最新推荐文章于 2025-05-16 13:56:48 发布

原创最新推荐文章于 2025-05-16 13:56:48 发布 · 3.5k 阅读

15 ·

CC 4.0 BY-SA版权

文章标签：

#爬虫 #selenium

本文介绍使用Selenium爬取Boss直聘网站特定职位信息的方法，包括登录、数据抓取及CSV存储过程，解决模拟浏览器登录及多页面爬取难题。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

主要是记录一下借助selenium实现数据获取。以boss直聘为目标网站，主要目的是爬取指定职位的相关信息，以csv形式存在了本地。

用模拟浏览器打开时，搜索某职位后，页面会跳转至注册页面，需要登录后才能查看信息。这里事先存好cookies：先打开首页，利用browser.add_cookie()附上事先存好的cookies，刷新页面后再执行职位搜索和数据抓取。
cookies保存的方法：
我这里是用selenium打开登录页面后，手动登录，然后保存当前的cookies至本地

from selenium import webdriver
import json
 
browser = webdriver.Chrome()
try:
    browser.get('https://login.zhipin.com/?ka=header-login')
    # Log in by hand in the browser window that opens. Block here until the
    # operator confirms; otherwise the cookies would be captured before the
    # login has happened.
    input('Log in in the browser window, then press Enter to save the cookies...')
    cookies = browser.get_cookies()
    # Persist the session cookies so the crawler can reuse them later.
    with open('boss_cookies.json', 'w', encoding='utf-8') as f:
        json.dump(cookies, f)
finally:
    # Always shut the browser down, even if saving the cookies failed.
    browser.quit()

待解决的问题

想直接利用selenium进行登录，但是在登录页面无法拖拽验证，看了好多资料也没解决·····欢迎留言解决方案，万分感谢~~~
已经尝试过的解决方案：
隐藏自己是模拟浏览器，详细可参考：一行js代码识别Selenium+Webdriver及其应对方案

from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions

# Start Chrome with the 'enable-automation' switch excluded, so the page has
# one less signal that the browser is driven by WebDriver.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = Chrome(options=option)

我在找答案的过程中，一开始发现连手动拖拽都会报错，后来发现是Chrome和ChromeDriver版本不相符造成的。后续无论是安装还是升级，一定要记得保持Chrome和ChromeDriver版本一致。更新版本之后，至少手动拖拽是成功了的 ······

整体代码

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
import json, csv
import time
import random

def inquire_job(browser, job_name):
    """Type ``job_name`` into the page's search box and submit the search.

    Args:
        browser: Selenium WebDriver showing a page that contains the
            ``.ipt-search`` input and an element with class ``btn``.
        job_name: Text to search for.

    NOTE(review): uses the legacy ``find_element_by_*`` helpers; confirm the
    installed Selenium release still provides them.
    """
    # Enter the job title to look up.
    search_box = browser.find_element_by_css_selector('.ipt-search')
    search_box.send_keys(job_name)

    # Submit the query through the first element carrying the 'btn' class.
    browser.find_element_by_class_name('btn').click()


def get_job_items(browser):
    """Append every job posting on the current results page to the CSV file.

    Each ``<li>`` found under ``div.job-list`` becomes one row of
    ``boss_job_items.csv``. The file is opened in append mode, so repeated
    calls accumulate rows; no header row is written.

    Args:
        browser: Selenium WebDriver currently showing a results page.

    Raises:
        Any exception raised by the element lookups (for example when a list
        item lacks one of the expected child elements) propagates to the
        caller. Rows written before the failure are kept and the file is
        still closed.

    NOTE(review): uses the legacy ``find_element_by_*`` helpers; confirm the
    installed Selenium release still provides them.
    """
    fieldnames = ['job_title', 'job_url', 'salary', 'condition', 'company_title', 'company_url',
                  'company_info', 'publis_name']

    # Collect the postings listed on the current page.
    items = browser.find_elements_by_xpath("//div[@class='job-list']//li")

    # 'with' guarantees the file is closed even when a lookup fails mid-page;
    # newline='' lets the csv module control line endings itself, as the csv
    # documentation requires for files handed to a writer.
    with open('boss_job_items.csv', 'a', encoding='utf-8', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        for item in items:
            result = {}

            job = item.find_element_by_class_name('info-primary')
            result['job_title'] = job.find_element_by_class_name('job-title').text
            result['job_url'] = job.find_element_by_css_selector('a').get_attribute('href')
            result['salary'] = job.find_element_by_class_name('red').text
            # Per the original author's sample, this text runs location,
            # experience and education together without separators.
            result['condition'] = job.find_element_by_css_selector('p').text

            company = item.find_element_by_class_name('info-company')
            company_link = company.find_element_by_css_selector('a')
            result['company_title'] = company_link.text
            result['company_url'] = company_link.get_attribute('href')
            result['company_info'] = company.find_element_by_css_selector('p').text

            publis = item.find_element_by_class_name('info-publis')
            result['publis_name'] = publis.find_element_by_class_name('name').text

            writer.writerow(result)


def get_next_page(browser):
    """Return the URL behind the pager's "next" link on the current page.

    The URL is also printed as a simple progress trace.

    Args:
        browser: Selenium WebDriver currently showing a results page.

    Returns:
        The ``href`` attribute of the element with class ``next`` inside the
        element with class ``page``.

    Raises:
        NoSuchElementException: raised by Selenium when the pager or its
            "next" link is missing. The original exception is propagated
            unchanged so its message (which locator failed) is preserved.

    NOTE(review): uses the legacy ``find_element_by_*`` helpers; confirm the
    installed Selenium release still provides them.
    """
    # Pagination: locate the pager, then the link to the following page.
    pages = browser.find_element_by_class_name('page')
    next_page_url = pages.find_element_by_class_name('next').get_attribute('href')
    print(next_page_url)
    return next_page_url


def main(job_name):
    """Crawl the search results for ``job_name`` page by page into the CSV.

    Starts headless Chrome, restores the login cookies saved in
    ``boss_cookies.json``, submits the search once, then alternates between
    scraping the current page and following the pager's "next" link until no
    further page can be reached.

    Args:
        job_name: Text to type into the site's search box.

    Raises:
        Errors during setup (for example a missing or invalid cookie file)
        propagate to the caller. Errors during the crawl itself end the crawl,
        are reported on stdout, and are not re-raised. The browser is shut
        down in every case.
    """
    # Exclude the 'enable-automation' switch to avoid being identified as an
    # automated browser.
    options = ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)

    try:
        # Load the cookies saved locally; the file was written as UTF-8.
        with open('boss_cookies.json', 'r', encoding='utf-8') as file:
            cookies = json.load(file)

        # Open the site first, attach the cookies, then refresh the page.
        browser.get('https://www.zhipin.com/')
        for cookie in cookies:
            browser.add_cookie(cookie)
        browser.refresh()

        try:
            # Submit the search exactly once. Repeating it inside the loop
            # would replace each freshly loaded "next" page with a new search,
            # so pagination would never advance.
            inquire_job(browser, job_name)
            time.sleep(1)

            while True:
                get_job_items(browser)
                next_page_url = get_next_page(browser)
                if not next_page_url:
                    # The "next" link carries no target: nothing left to visit.
                    break
                # Random pause between pages.
                time.sleep(random.uniform(1, 10))
                browser.get(next_page_url)
        except Exception as exc:
            # Best-effort crawl: any failure (including a missing pager or
            # "next" link) ends it, but the reason is reported rather than
            # silently swallowed.
            print('Stopping crawl:', repr(exc))
    finally:
        # quit() ends the whole driver session, not just the current window.
        browser.quit()


if __name__ == '__main__':
    # Run the crawl for the sample query and report the elapsed wall-clock
    # time in seconds.
    started_at = time.time()
    main('python爬虫')
    finished_at = time.time()
    elapsed_seconds = finished_at - started_at
    print(elapsed_seconds)