一、配置Django
1.1在项目创建app应用
python manage.py startapp myapp[app名称]
1.2 settings.py设置
(1)数据库
# MySQL connection settings for the default database.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'NAME': "boss",        # schema name
        'USER': 'root',
        'PASSWORD': 'xxxxx',   # placeholder — replace with the real password
        'HOST': 'localhost',
        'PORT': 3306,
    }
}
(2)static
# Template engine and static-file settings.
TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        # Project-level template directory: <BASE_DIR>/templates
        'DIRS': [os.path.join(BASE_DIR, 'templates')],
        # Also look for templates inside each installed app
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]
# Public URL prefix for static assets
STATIC_URL = '/static/'
# Additional static-file locations: <BASE_DIR>/static
STATICFILES_DIRS = (
    os.path.join(BASE_DIR, 'static'),
)
(3) MIDDLEWARE
MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    # CSRF validation is disabled. To re-enable it, uncomment the line below
    # and add {% csrf_token %} inside every POST form template.
    # 'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
    # Custom middleware — presumably per-request user-info handling;
    # defined in middleware/useInfoMid.py (not shown in this file)
    'middleware.useInfoMid.UserMW'
]
(4)media
# Public URL prefix for user-uploaded files
MEDIA_URL="/media/"
# Filesystem directory where uploads are stored: <BASE_DIR>/media
MEDIA_ROOT=os.path.join(BASE_DIR,'media')
(5)注册app
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'myapp'  # register the local app created with `startapp`
]
1.3 路由设置
(1)子路由配置
from django.contrib import admin
from django.urls import path,include
from django.conf import settings
from django.conf.urls.static import static

# Project-level URL configuration.
urlpatterns = [
    path('admin/', admin.site.urls),
    # Delegate all /myapp/... URLs to the app-level urls.py
    path('myapp/', include('myapp.urls')),
]
(2)配置media路由
# Serve uploaded media files at MEDIA_URL (development convenience)
urlpatterns +=static(settings.MEDIA_URL,document_root=settings.MEDIA_ROOT)
1.4在myapp下创建子路由
#文件结构
│ admin.py
│ apps.py
│ models.py
│ tests.py
│ urls.py #子路由
│ views.py
│ __init__.py
1.5数据模型
from django.db import models
# Job posting table: one row per listing scraped from zhipin.com.
class JobInfo(models.Model):
    id=models.AutoField('id',primary_key=True)
    title=models.CharField('岗位名称',max_length=255,default='')
    address=models.CharField('省会',max_length=255,default='')
    type=models.CharField('职业',max_length=255,default='')
    edu=models.CharField('学历',max_length=255,default='')
    workexp=models.CharField('工作经验',max_length=255,default='')
    # JSON-encoded list of tag strings produced by the scraper
    workTag=models.CharField('工作标签',max_length=255,default='')
    # stringified [min, max] salary list produced by the scraper
    salary=models.CharField('薪资',max_length=255,default='')
    salaryMonth=models.CharField('年终奖',max_length=255,default='')
    companyTags=models.CharField('公司福利',max_length=1000,default='')
    hrWork=models.CharField('人士职位',max_length=255,default='')
    hrName=models.CharField('人士名称',max_length=255,default='')
    # '1' for internship (day-rate) listings, '0' otherwise
    pratice=models.CharField('是否为实习岗位',max_length=255,default='')
    companyTitle=models.CharField('公司名称',max_length=255,default='')
    companyAvater=models.CharField('公司头像',max_length=255,default='')
    companyNature=models.CharField('公司性质',max_length=255,default='')
    companyStatus=models.CharField('公司状态',max_length=255,default='')
    # stringified [min, max] head-count list produced by the scraper
    companyPeo=models.CharField('公司规模',max_length=255,default='')
    detailUrl=models.CharField('详情页',max_length=500,default='')
    companyUrl=models.CharField('公司详情页',max_length=500,default='')
    createTime=models.DateField('创建时间',auto_now_add=True)
    dist=models.CharField('行政区',max_length=255,default='')
    # Custom table name
    class Meta:
        # NOTE(review): 'jonInfo' looks like a typo for 'jobInfo'; renaming
        # it now would require a DB migration, so it is left unchanged here.
        db_table='jonInfo'
# Custom user table for this app (separate from django.contrib.auth).
class User(models.Model):
    id=models.AutoField('id',primary_key=True)
    username=models.CharField('用户名',max_length=255,default='')
    # NOTE(review): plain CharField — presumably stored as plaintext; confirm
    # whether a hash is applied before saving elsewhere in the project.
    password=models.CharField('密码',max_length=255,default='')
    edu=models.CharField('学历',max_length=255,default='')
    workexp=models.CharField('工作经验',max_length=255,default='')
    address=models.CharField('意向城市',max_length=255,default='')
    work=models.CharField('意向岗位',max_length=255,default='')
    avatar=models.FileField('头像',upload_to='avatar',default='avatar/default.png')
    # NOTE(review): 'creareTime' looks like a typo for 'createTime'
    # (cf. JobInfo.createTime); renaming would require a migration.
    creareTime=models.DateField('创建时间',auto_now_add=True)
    # Custom table name
    class Meta:
        db_table='user'
1.6生成数据库与数据迁移
python manage.py makemigrations
python manage.py migrate
二、确定并分析目标网页
2.1进入boss直聘网站,如图所示
https://www.zhipin.com/?city=100010000&ka=city-sites-100010000
2.2点击搜索Java,如图所示
2.3点击下一页观察URL变化
可发现网页URL变化规律:BOSS直聘(岗位)&city=100010000(全国)&page=2(页数)
2.4点击F12定位网页元素,如图所示
三、编写爬虫代码
3.1导入相关库,编写start函数
# Standard library
import csv
import json
import os
import time
from datetime import datetime

# Third-party
import django
import pandas as pd
import schedule
from selenium import webdriver  # (was imported twice in the original)
from selenium.webdriver.common.by import By  # fix: By is used in main() but was never imported

# Configure the Django runtime environment before touching any models
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "boss_test.settings")
django.setup()

# Model import must come after django.setup()
from myapp.models import JobInfo  # fix: JobInfo is used in save_sql() but was never imported
def start():
    """Create and return a Chrome WebDriver configured for scraping.

    Returns:
        selenium.webdriver.Chrome: a driver with the automation infobar
        disabled; headless mode is available but commented out.
    """
    options = webdriver.ChromeOptions()
    # Disable the "Chrome is being controlled by automated software" banner
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # options.add_argument('--headless')  # uncomment to hide the browser window
    # Raw string fixes the invalid "\c" escape sequences in the original path
    brower = webdriver.Chrome(r"D:\chromedriver_win32\chromedriver.exe", options=options)
    return brower
3.2编写爬虫代码,将boss上岗位信息采集下来
def main(stype, startpage, endpage):
    """Scrape zhipin.com job listings for keyword *stype* over pages
    [startpage, endpage) and append each parsed record to the CSV file
    via save_csv().

    Args:
        stype: job keyword to search for (also stored in the global `type`).
        startpage: first results page (inclusive).
        endpage: last results page (exclusive).
    """
    # Deliberately shadows the builtin `type`: clear_csv() reads this
    # module-level global to filter rows by job type.
    global type
    type = stype
    brower = start()
    for page in range(startpage, endpage):
        url = 'https://www.zhipin.com/web/geek/job?query=%s&city=100010000&page=%s' % (type, page)
        print('正在爬取的页面路径:' + url)
        brower.get(url)
        time.sleep(10)  # wait for the JavaScript-rendered job list to load
        job_list = brower.find_elements(by=By.XPATH, value='//ul[@class="job-list-box"]/li')
        for index, job in enumerate(job_list):
            try:
                date = datetime.now().strftime("%Y-%m-%d")  # scrape date
                print('正在爬取第%d个数据' % (index + 1))
                # Job title
                title = job.find_element(
                    by=By.XPATH,
                    value='.//a[@class="job-card-left"]/div[contains(@class,"job-title")]/span[@class="job-name"]').text
                # Location text: "city·district[·area]"
                addresses = job.find_element(
                    by=By.XPATH,
                    value='.//a[@class="job-card-left"]/div[contains(@class,"job-title")]/span[@class="job-area-wrapper"]/span').text.split('·')
                address = addresses[0]                              # city
                dist = addresses[1] if len(addresses) != 1 else ''  # district
                # Education / work-experience tags
                tag_list = job.find_elements(
                    by=By.XPATH,
                    value='.//a[@class="job-card-left"]/div[contains(@class,"job-info")]/ul[@class="tag-list"]/li')
                if len(tag_list) == 2:
                    edu = tag_list[1].text
                    workexp = tag_list[0].text
                else:
                    # an extra leading tag is present; skip it
                    edu = tag_list[2].text
                    workexp = tag_list[1].text
                # Recruiter name and recruiter job title
                hrName = job.find_element(
                    by=By.XPATH,
                    value='.//a[@class="job-card-left"]/div[contains(@class,"job-info")]/div[@class="info-public"]').text
                hrWork = job.find_element(
                    by=By.XPATH,
                    value='.//a[@class="job-card-left"]/div[contains(@class,"job-info")]/div[@class="info-public"]/em').text
                # Job tags, stored as a JSON list
                workTag = job.find_elements(
                    by=By.XPATH,
                    value='./div[contains(@class,"job-card-footer")]/ul[@class="tag-list"]/li')
                workTag = json.dumps([tag.text for tag in workTag])
                pratice = 0  # 0 = regular position, 1 = internship (day rate)
                salaries = job.find_element(
                    by=By.XPATH,
                    value='.//a[@class="job-card-left"]/div[contains(@class,"job-info")]/span[@class="salary"]').text
                if salaries.find('K') != -1:
                    # Monthly salary, e.g. "10-15K" or "10-15K·13薪"
                    salaries = salaries.split('·')
                    salary = [int(x) * 1000 for x in salaries[0].replace('K', '').split('-')]
                    salaryMonth = salaries[1] if len(salaries) != 1 else '0薪'
                else:
                    # Day rate, e.g. "200-300元/天" — treat as internship
                    salary = [int(x) for x in salaries.replace('元/天', '').split('-')]
                    salaryMonth = '0薪'
                    pratice = 1
                # Company name and logo URL
                companyTitle = job.find_element(
                    by=By.XPATH,
                    value='.//div[@class="job-card-right"]/div[@class="company-info"]/h3/a').text
                companyAvater = job.find_element(
                    by=By.XPATH,
                    value='.//div[@class="job-card-right"]/div[@class="company-logo"]/a/img').get_attribute('src')
                # Company nature / funding status / size tags
                companyInfo = job.find_elements(
                    by=By.XPATH,
                    value='.//div[@class="job-card-right"]/div[@class="company-info"]/ul[@class="company-tag-list"]/li')
                companyNature = companyInfo[0].text
                if len(companyInfo) == 3:
                    companyStatus = companyInfo[1].text
                    companyPeo = companyInfo[2].text
                else:
                    # funding-status tag missing from the card
                    companyStatus = '未融资'
                    companyPeo = companyInfo[1].text
                # Normalize size to a [min, max] head-count pair
                if companyPeo != '10000人以上':
                    companyPeo = [int(x) for x in companyPeo.replace('人', '').split('-')]
                else:
                    companyPeo = [0, 10000]
                # Company benefit tags as a JSON list, or '无' when absent
                companyTags = job.find_element(
                    by=By.XPATH,
                    value='./div[contains(@class,"job-card-footer")]/div[@class="info-desc"]').text
                companyTags = json.dumps(companyTags.split(',')) if companyTags else '无'
                # Job / company detail-page URLs
                detailUrl = job.find_element(by=By.XPATH, value='.//a[@class="job-card-left"]').get_attribute('href')
                companyUrl = job.find_element(
                    by=By.XPATH,
                    value='.//div[@class="job-card-right"]/div[@class="company-info"]/h3/a').get_attribute('href')
                # Column order must match the header written by init()
                data = [title, address, type, edu, workexp, workTag, salary,
                        salaryMonth, companyTags, hrWork, hrName, pratice,
                        companyTitle, companyAvater, companyNature,
                        companyStatus, companyPeo, detailUrl, companyUrl,
                        dist, date]
                save_csv(data)
            except Exception as exc:
                # Best-effort per card, but log the failure instead of
                # silently swallowing it (the original bare `except:` also
                # caught KeyboardInterrupt/SystemExit).
                print('skip card %d: %s' % (index + 1, exc))
3.3编写代码,创建csv文件,并以追加方式打开文件
def init():
    """Create ./xxxx.csv with its header row unless the file already exists."""
    if os.path.exists('./xxxx.csv'):
        return
    header = ['title', 'address', 'type', 'edu', 'workexp', 'workTag',
              'salary', 'salaryMonth', 'companyTags', 'hrWork', 'hrName',
              'pratice', 'companyTitle', 'companyAvater', 'companyNature',
              'companyStatus', 'companyPeo', 'detailUrl', 'companyUrl',
              'dist', 'date']
    with open('./xxxx.csv', 'a', newline='', encoding='utf8') as f:
        csv.writer(f).writerow(header)
3.4将采集的数据存储至csv中
def save_csv(data):
    """Append one scraped record (a list of field values) to ./xxxx.csv.

    Args:
        data: list of values in the column order written by init().
    """
    # fix: newline='' is required by the csv module (prevents blank lines on
    # Windows) and matches how init() opens the same file.
    with open('./xxxx.csv', 'a', newline='', encoding='utf8') as f:
        csv.writer(f).writerow(data)
3.5使用pandas读取csv中的数据,对数据进行预处理
def clear_csv(job_type=None):
    """Load ./xxxx.csv, clean it, and return today's rows for one job type.

    Args:
        job_type: substring to match against the 'type' column; defaults to
            the module-level global ``type`` set by main() (kept for
            backward compatibility with existing callers).

    Returns:
        numpy.ndarray: remaining rows (``DataFrame.values``) in CSV column
        order.
    """
    if job_type is None:
        job_type = type  # module global assigned by main()
    # fix: the original path '.\xxxx.csv' contained an invalid \x escape
    df = pd.read_csv('./xxxx.csv')
    # Drop rows with missing values
    df.dropna(inplace=True)
    print(df.shape[0])
    # De-duplicate on the detail-page URL; keep=False drops every duplicate
    # (the original also had a no-op `df['detailUrl'].duplicated()` line)
    df.drop_duplicates(['detailUrl'], keep=False, inplace=True)
    print(df.shape[0])
    # Strip the trailing '薪' so the bonus column is numeric-like text
    df['salaryMonth'] = df['salaryMonth'].map(lambda x: str(x).replace('薪', ''))
    # Keep only rows scraped today
    df = df[df['date'].str.contains(datetime.now().strftime("%Y-%m-%d")) == True]
    # Keep only rows matching the requested job type
    df = df[df['type'].str.contains(job_type) == True]
    print(df.shape[0])
    return df.values
3.6存储数据
def save_sql():
    """Persist every cleaned CSV row as a JobInfo record.

    Reads the cleaned rows from clear_csv() and writes one JobInfo per row.
    The trailing 'date' CSV column is not stored (createTime is set by
    auto_now_add).
    """
    # Model field names in the exact order of the CSV columns (first 20).
    columns = ('title', 'address', 'type', 'edu', 'workexp', 'workTag',
               'salary', 'salaryMonth', 'companyTags', 'hrWork', 'hrName',
               'pratice', 'companyTitle', 'companyAvater', 'companyNature',
               'companyStatus', 'companyPeo', 'detailUrl', 'companyUrl',
               'dist')
    for row in clear_csv():
        # zip truncates at the 20 named fields, dropping the date column
        JobInfo.objects.create(**dict(zip(columns, row)))
3.7运行函数
def Obj_main():
    """Full pipeline: ensure the CSV exists, scrape, then load into MySQL."""
    init()
    main('xxxx', 1, 6)  # job keyword, start page (inclusive), end page (exclusive)
    save_sql()
    # fix: removed the original trailing `JobInfo.objects.all()` — the lazy
    # queryset was never evaluated or used, so the line had no effect.
3.8定时任务
def run():
    # Run Obj_main every Friday at 17:00.
    schedule.every().friday.at("17:00").do(Obj_main)
    # schedule.every(1).minutes.do(Obj_main)  # alternative: run every minute
    # Block forever, firing any due jobs once per second.
    while True:
        schedule.run_pending()
        time.sleep(1)