I. Background and Motivation
In day-to-day recruiting, HR staff and recruiting-operations teams often have to download piles of resumes from a mailbox and then extract names, contact details, target positions, and other key fields one file at a time; the manual work is tedious and error-prone. To address this, I built an all-in-one utility called "Resume Assistant" (简历助手): it automatically connects to a 163 mailbox, batch-downloads resume attachments, parses contact information out of PDF/DOCX documents, and consolidates everything into a standardized CSV table. You can run the packaged executable directly or run it from the Python source, whichever suits your situation.
II. Feature Overview
- Mail crawling: automatically connects to a 163 mailbox, fetches the message list of a given folder, and processes only messages that carry attachments.
- Attachment download: batch-downloads attachments and avoids overwriting same-named files (an incrementing suffix is appended to the filename).
- Resume parsing: supports PDF and DOCX; prefers pdfplumber, falling back to PyPDF2 or python-docx.
- Information extraction: parses phone numbers, email addresses, names, and other basic contact details from the resume text, supplemented by hints from the mail subject/body.
- Data consolidation and deduplication: merges information from multiple sources into candidate entities, removes duplicates, and stores the result in a structured form.
- One-click export: writes a CSV file whose fields include name, gender, age, position applied for, location, education, expected salary, phone number, and email address.
- GUI: a lightweight Tkinter interface for visual configuration and execution, plus viewing logs and progress.
- Logging: run logs are written to mail_crawler.log to make troubleshooting easier.
III. Overall Architecture
The program follows a five-part architecture: data model + mail crawler + resume parser + GUI + export. In detail:
- Data model
Candidate information data class:
from dataclasses import dataclass
from typing import List, Optional, Tuple  # typing helpers used across the snippets below

@dataclass
class CandidateInfo:
    """Candidate information."""
    name: str                       # candidate name
    gender: str                     # gender
    age: str                        # age
    location: str                   # location
    education: str                  # education
    experience: str                 # work experience
    salary_expectation: str         # expected salary
    position: str                   # position applied for
    mail_id: str                    # source mail ID
    extra_info: str = ""            # extra info, e.g. certificates
    phone: str = ""                 # phone number (parsed from the resume)
    email: str = ""                 # email address (parsed from the resume)
Resume contact-information data class:
@dataclass
class ResumeContactInfo:
    """Contact information parsed from one resume file."""
    filename: str
    phone_numbers: List[str]
    email_addresses: List[str]
    names: List[str]
    text_content: str
- Resume parser
Text extraction and field recognition:
import re
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

# pdfplumber / PyPDF2 / docx are optional backends; see the guarded-import
# sketch in Section VI for how they are imported and set to None when absent.

class ResumeParser:
    """Resume parser."""
    def __init__(self):
        # Mainland-China mobile number pattern
        self.phone_pattern = re.compile(r'1[3-9]\d{9}')
        # Lenient, case-insensitive email regex
        self.email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', re.IGNORECASE)
    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from a PDF file."""
        text = ""
        # Prefer pdfplumber when available
        if pdfplumber:
            try:
                with pdfplumber.open(file_path) as pdf:
                    for page in pdf.pages:
                        page_text = page.extract_text()
                        if page_text:
                            text += page_text + "\n"
                return text
            except Exception as e:
                logger.warning(f"pdfplumber parsing failed: {e}")
        # Fall back to PyPDF2
        if PyPDF2:
            try:
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    for page in pdf_reader.pages:
                        text += (page.extract_text() or "") + "\n"
                return text
            except Exception as e:
                logger.warning(f"PyPDF2 parsing failed: {e}")
        return text
    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file."""
        if not docx:
            logger.warning("python-docx is not installed; cannot parse DOCX files")
            return ""
        try:
            doc = Document(file_path)
            text = ""
            for paragraph in doc.paragraphs:
                text += paragraph.text + "\n"
            return text
        except Exception as e:
            logger.error(f"Failed to parse DOCX file: {e}")
            return ""
    def extract_contact_info_from_text(self, text: str) -> Tuple[List[str], List[str], List[str]]:
        """Extract contact information and candidate names from raw text."""
        # Extract email addresses (from the raw text, before cleaning)
        email_addresses = list(set(self.email_pattern.findall(text)))
        # Clean the text: strip decorative symbols, collapse whitespace
        cleaned_text = re.sub(r'[~\-_=+\*#$%^&()\[\]{}|\\<>?/`]', ' ', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        # Extract phone numbers
        phone_numbers = list(set(self.phone_pattern.findall(cleaned_text)))
        # Name extraction: Chinese-name patterns keyed on common resume cues
        # (the Chinese literals are functional data and intentionally untranslated)
        name_patterns = [
            r'姓\s*名[:::]\s*([\u4e00-\u9fa5]{2,4})',              # "姓 名: X"
            r'姓名[:::]\s*([\u4e00-\u9fa5]{2,4})',                  # "姓名: X" (Name: X)
            r'(?:^|\s)([\u4e00-\u9fa5]{2,4})(?=\s|$|,|。|:|:|\n)',  # standalone 2-4 char word
            r'我是([\u4e00-\u9fa5]{2,4})',                           # "I am X"
            r'本人([\u4e00-\u9fa5]{2,4})',                           # "I, X"
            r'([\u4e00-\u9fa5]{2,4})\s*简历',                        # "X's resume"
        ]
        # Stop words: titles, job terms, and section headers that are not names
        filter_words = {
            '先生', '女士', '小姐', '同学', '老师', '经理', '总监', '主管', '专员', '助理',
            '工程师', '设计师', '开发', '测试', '产品', '运营', '市场', '销售',
            '姓名', '应聘', '求职', '意向', '岗位', '职位', '公司', '部门',
            '教育', '经历', '工作', '项目', '技能', '证书', '荣誉', '自我',
        }
        names = []
        for pattern in name_patterns:
            matches = re.findall(pattern, cleaned_text, re.MULTILINE)
            for match in matches:
                if len(match) >= 2 and match not in filter_words:
                    if re.match(r'^[\u4e00-\u9fa5]+$', match):
                        names.append(match)
        # Deduplicate while preserving order
        unique_names = []
        for name in names:
            if name not in unique_names:
                unique_names.append(name)
        return phone_numbers, email_addresses, unique_names
    def parse_resume_file(self, file_path: str) -> Optional[ResumeContactInfo]:
        """Parse one resume file."""
        file_path = Path(file_path)
        if not file_path.exists():
            logger.error(f"File does not exist: {file_path}")
            return None
        # Pick the extraction method by file extension
        ext = file_path.suffix.lower()
        text = ""
        if ext == '.pdf':
            text = self.extract_text_from_pdf(str(file_path))
        elif ext == '.docx':
            text = self.extract_text_from_docx(str(file_path))
        else:
            logger.warning(f"Unsupported file format: {ext}")
            return None
        if not text.strip():
            logger.warning(f"Could not extract text from file: {file_path}")
            return None
        # Pull contact info and candidate names out of the text
        phone_numbers, email_addresses, names = self.extract_contact_info_from_text(text)
        return ResumeContactInfo(
            filename=file_path.name,
            phone_numbers=phone_numbers,
            email_addresses=email_addresses,
            names=names,
            text_content=text
        )
    def process_downloaded_resumes(self, download_dir: str) -> List[ResumeContactInfo]:
        """Process every resume file in the download directory."""
        download_path = Path(download_dir)
        if not download_path.exists():
            logger.warning(f"Download directory does not exist: {download_dir}")
            return []
        resume_contacts = []
        supported_extensions = ['.pdf', '.docx']
        for file_path in download_path.iterdir():
            if file_path.is_file() and file_path.suffix.lower() in supported_extensions:
                logger.info(f"Parsing resume file: {file_path.name}")
                contact_info = self.parse_resume_file(str(file_path))
                if contact_info:
                    resume_contacts.append(contact_info)
        return resume_contacts
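To make the parser's entry points concrete, a minimal usage sketch (the file path is an assumed example):

# Sketch: parse a single resume file directly; the path is an assumed example.
parser = ResumeParser()
info = parser.parse_resume_file("./downloads/resume.pdf")
if info:
    print(info.phone_numbers, info.email_addresses, info.names)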
- Mail crawler
Session setup, message listing, and attachment download.
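The crawler references three data classes, MailConfig, MailInfo, and AttachmentInfo, that are defined elsewhere in the project. A minimal sketch of their likely shape, inferred from how the crawler uses them (field names and defaults here are assumptions):

from dataclasses import dataclass, field
from typing import Dict

@dataclass
class MailConfig:
    """Crawler configuration (sketch inferred from usage)."""
    sid: str                                        # webmail session id
    cookies: Dict[str, str] = field(default_factory=dict)
    base_url: str = "https://mail.163.com/js6/s"   # assumed endpoint
    download_dir: str = "./downloads"
    request_delay: float = 1.0                      # seconds between requests

@dataclass
class MailInfo:
    """One entry in the mail list (sketch)."""
    id: str
    subject: str
    from_addr: str
    to_addr: str
    sent_date: str
    has_attachment: bool
    size: int

@dataclass
class AttachmentInfo:
    """One attachment entry (sketch)."""
    filename: str
    content_type: str
    size: int
    download_url: str
    part_id: str

With those in place, the crawler itself: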
import base64
import time
import requests
from urllib.parse import unquote

class Mail163Crawler:
    """Attachment crawler for the 163 webmail."""
    def __init__(self, config: MailConfig):
        """Initialize the crawler."""
        self.config = config
        self.session = requests.Session()
        self._setup_session()
        # Make sure the download directory exists
        Path(self.config.download_dir).mkdir(parents=True, exist_ok=True)
    def _setup_session(self):
        """Configure the requests session."""
        # Browser-like request headers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
            'Accept': 'text/javascript',
            'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-HK;q=0.6',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Origin': 'https://mail.163.com',
            'Referer': f'https://mail.163.com/js6/main.jsp?sid={self.config.sid}&df=mail163_letter',
        })
        # Install the login cookies
        for name, value in self.config.cookies.items():
            self.session.cookies.set(name, value)
    def get_mail_list(self, folder_id: int = 1, page_size: int = 50) -> List[MailInfo]:
        """Fetch the mail list."""
        url = f"{self.config.base_url}?sid={self.config.sid}&func=mbox:listMessages"
        data = {
            'var': f'<?xml version="1.0"?><object><int name="fid">{folder_id}</int><string name="order">receivedDate</string><boolean name="desc">true</boolean><int name="limit">{page_size}</int><int name="start">0</int><boolean name="returnTotal">true</boolean><boolean name="returnTag">true</boolean></object>'
        }
        try:
            logger.info("Fetching mail list...")
            response = self.session.post(url, data=data)
            response.raise_for_status()
            mails = self._parse_mail_list_response(response.text)
            mails_with_attachments = [mail for mail in mails if mail.has_attachment]
            logger.info(f"Fetched {len(mails)} messages, {len(mails_with_attachments)} with attachments")
            return mails_with_attachments
        except Exception as e:
            logger.error(f"Failed to fetch mail list: {str(e)}")
            return []
    def _parse_mail_list_response(self, response_text: str) -> List[MailInfo]:
        """Parse the mail-list response."""
        mails = []
        try:
            var_match = re.search(r"'var'\s*:\s*\[(.*?)\]", response_text, re.DOTALL)
            if not var_match:
                logger.warning("No mail data found in response")
                return mails
            var_content = var_match.group(1)
            mail_objects = self._split_mail_objects(var_content)
            for mail_obj in mail_objects:
                mail_info = self._parse_single_mail(mail_obj)
                if mail_info:
                    mails.append(mail_info)
        except Exception as e:
            logger.error(f"Failed to parse mail-list response: {str(e)}")
        return mails
    def _split_mail_objects(self, var_content: str) -> List[str]:
        """Split the concatenated mail-object string by balanced braces."""
        mail_objects = []
        brace_count = 0
        current_object = ""
        for char in var_content:
            current_object += char
            if char == '{':
                brace_count += 1
            elif char == '}':
                brace_count -= 1
                # A balanced object has just closed; flush it
                if brace_count == 0 and current_object.strip():
                    mail_objects.append(current_object.strip().rstrip(','))
                    current_object = ""
        return mail_objects
    def _parse_single_mail(self, mail_obj_str: str) -> Optional[MailInfo]:
        """Parse a single mail object."""
        try:
            id_match = re.search(r"'id'\s*:\s*'([^']+)'", mail_obj_str)
            subject_match = re.search(r"'subject'\s*:\s*'([^']+)'", mail_obj_str)
            from_match = re.search(r"'from'\s*:\s*'([^']+)'", mail_obj_str)
            to_match = re.search(r"'to'\s*:\s*'([^']+)'", mail_obj_str)
            size_match = re.search(r"'size'\s*:\s*(\d+)", mail_obj_str)
            attached_match = re.search(r"'attached'\s*:\s*(true|false)", mail_obj_str)
            if not all([id_match, subject_match, from_match]):
                return None
            return MailInfo(
                id=id_match.group(1),
                subject=subject_match.group(1),
                from_addr=from_match.group(1),
                to_addr=to_match.group(1) if to_match else "",
                sent_date="",
                has_attachment=attached_match.group(1) == 'true' if attached_match else False,
                size=int(size_match.group(1)) if size_match else 0
            )
        except Exception as e:
            logger.error(f"Failed to parse mail object: {str(e)}")
            return None
    def get_mail_detail(self, mail_id: str) -> Optional[str]:
        """Fetch the detail of one message."""
        url = f"{self.config.base_url}?sid={self.config.sid}&func=mbox:readMessage"
        data = {
            'var': f'<?xml version="1.0"?><object><string name="id">{mail_id}</string><boolean name="returnTotal">true</boolean><boolean name="returnTag">true</boolean></object>'
        }
        try:
            logger.info(f"Fetching mail detail: {mail_id}")
            time.sleep(self.config.request_delay)
            response = self.session.post(url, data=data)
            response.raise_for_status()
            return response.text
        except Exception as e:
            logger.error(f"Failed to fetch mail detail {mail_id}: {str(e)}")
            return None
    def extract_attachments(self, mail_detail: str, mail_id: str) -> List[AttachmentInfo]:
        """Extract attachment entries from a mail detail."""
        attachments = []
        try:
            attachments_match = re.search(r"'attachments'\s*:\s*\[(.*?)\]", mail_detail, re.DOTALL)
            if attachments_match:
                attachments_content = attachments_match.group(1)
                attachment_objects = self._split_attachment_objects(attachments_content)
                for att_obj in attachment_objects:
                    attachment_info = self._parse_single_attachment(att_obj, mail_id)
                    if attachment_info:
                        attachments.append(attachment_info)
            logger.info(f"Extracted {len(attachments)} attachments from mail {mail_id}")
            return attachments
        except Exception as e:
            logger.error(f"Failed to extract attachment info: {str(e)}")
            return []
    def _split_attachment_objects(self, attachments_content: str) -> List[str]:
        """Split the concatenated attachment-object string by balanced braces."""
        attachment_objects = []
        brace_count = 0
        current_object = ""
        for char in attachments_content:
            current_object += char
            if char == '{':
                brace_count += 1
            elif char == '}':
                brace_count -= 1
                # A balanced object has just closed; flush it
                if brace_count == 0 and current_object.strip():
                    attachment_objects.append(current_object.strip().rstrip(','))
                    current_object = ""
        return attachment_objects
    def _parse_single_attachment(self, att_obj_str: str, mail_id: str) -> Optional[AttachmentInfo]:
        """Parse a single attachment object."""
        try:
            id_match = re.search(r"'id'\s*:\s*(\d+)", att_obj_str)
            filename_match = re.search(r"'filename'\s*:\s*'([^']+)'", att_obj_str)
            content_type_match = re.search(r"'contentType'\s*:\s*'([^']+)'", att_obj_str)
            size_match = re.search(r"'estimateSize'\s*:\s*(\d+)", att_obj_str)
            if not all([id_match, filename_match]):
                return None
            part_id = id_match.group(1)
            filename = filename_match.group(1)
            content_type = content_type_match.group(1) if content_type_match else "application/octet-stream"
            size = int(size_match.group(1)) if size_match else 0
            download_url = self._build_attachment_url(mail_id, part_id)
            return AttachmentInfo(
                filename=filename,
                content_type=content_type,
                size=size,
                download_url=download_url,
                part_id=part_id
            )
        except Exception as e:
            logger.error(f"Failed to parse attachment object: {str(e)}")
            return None
    def _build_attachment_url(self, mail_id: str, part_id: str) -> str:
        """Build the attachment download URL."""
        return f"https://mail.163.com/js6/read/readdata.jsp?sid={self.config.sid}&mid={mail_id}&part={part_id}&mode=download&l=read&action=download_attach"
    def download_attachment(self, attachment: AttachmentInfo, save_dir: Optional[str] = None) -> bool:
        """Download a single attachment."""
        if save_dir is None:
            save_dir = self.config.download_dir
        try:
            logger.info(f"Downloading attachment: {attachment.filename}")
            time.sleep(self.config.request_delay)
            response = self.session.get(attachment.download_url, stream=True)
            response.raise_for_status()
            real_filename = self._extract_filename_from_response(response, attachment.filename)
            safe_filename = self._sanitize_filename(real_filename)
            save_path = Path(save_dir) / safe_filename
            # If the file already exists, append an incrementing suffix
            counter = 1
            while save_path.exists():
                name_parts = safe_filename.rsplit('.', 1)
                if len(name_parts) == 2:
                    new_name = f"{name_parts[0]}_{counter}.{name_parts[1]}"
                else:
                    new_name = f"{safe_filename}_{counter}"
                save_path = Path(save_dir) / new_name
                counter += 1
            # Stream the file to disk in chunks
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            logger.info(f"Attachment saved: {save_path}")
            return True
        except Exception as e:
            logger.error(f"Failed to download attachment {attachment.filename}: {str(e)}")
            return False
    def _extract_filename_from_response(self, response: requests.Response, fallback_name: str) -> str:
        """Extract the real filename from the HTTP response headers."""
        try:
            content_disposition = response.headers.get('Content-Disposition', '')
            if 'filename=' in content_disposition:
                filename_match = re.search(r'filename[*]?=([^;]+)', content_disposition)
                if filename_match:
                    raw_filename = filename_match.group(1).strip('"\'')
                    # MIME encoded-word, e.g. =?UTF-8?B?...?= (tolerate a missing hyphen)
                    encoded_parts = re.findall(r'=\?UTF-?8\?B\?([^?]+)\?=', raw_filename, re.IGNORECASE)
                    if encoded_parts:
                        decoded_name = ""
                        for part in encoded_parts:
                            try:
                                decoded_name += base64.b64decode(part).decode('utf-8')
                            except Exception:
                                pass
                        if decoded_name:
                            return decoded_name
                    else:
                        return unquote(raw_filename)
        except Exception as e:
            logger.warning(f"Failed to extract filename: {str(e)}")
        return fallback_name
    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize a filename by replacing illegal characters."""
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        filename = filename.strip('. ')
        if not filename:
            filename = "unnamed_file"
        return filename
    def crawl_all_attachments(self, folder_id: int = 1, max_mails: int = 50) -> int:
        """Crawl all attachments."""
        logger.info("Starting 163 mailbox attachment crawl...")
        mails = self.get_mail_list(folder_id, max_mails)
        if not mails:
            logger.warning("No messages with attachments found")
            return 0
        downloaded_count = 0
        for i, mail in enumerate(mails, 1):
            logger.info(f"Processing mail {i}/{len(mails)}: {mail.subject}")
            mail_detail = self.get_mail_detail(mail.id)
            if not mail_detail:
                continue
            attachments = self.extract_attachments(mail_detail, mail.id)
            if not attachments:
                logger.warning(f"No attachments found in mail {mail.id}")
                continue
            for attachment in attachments:
                if self.download_attachment(attachment):
                    downloaded_count += 1
        logger.info(f"Crawl finished; downloaded {downloaded_count} attachments")
        return downloaded_count
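Putting the two halves together, an end-to-end run might look like this (a sketch; the sid and cookie values are placeholders):

# End-to-end sketch: crawl attachments, then parse them. Values are placeholders.
config = MailConfig(sid="YOUR_SID", cookies={"Coremail": "YOUR_COOKIE_VALUE"})
crawler = Mail163Crawler(config)
downloaded = crawler.crawl_all_attachments(folder_id=1, max_mails=50)
contacts = ResumeParser().process_downloaded_resumes(config.download_dir)
print(f"Downloaded {downloaded} attachments, parsed {len(contacts)} resumes")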
IV. Core Workflow Breakdown
1. Connect to the 163 mailbox
- A requests.Session carries the connection, with a browser-like fingerprint and Referer set in the headers.
- Identity is kept via the SID and cookies, avoiding repeated logins.
- Folder filtering keeps only messages with attachments, cutting wasted requests and parsing overhead. A credentials sketch follows this step.
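Where the SID and cookies come from is worth spelling out: both are copied from a browser session that is already logged in to mail.163.com (every value below is a placeholder):

# Sketch: credentials are lifted from a logged-in browser session.
# All values are placeholders; copy the real cookie set from the browser's
# developer tools, and the sid parameter from the webmail URL.
config = MailConfig(
    sid="xAbCdEfG0123",        # visible in the mail.163.com URL after login
    cookies={
        "Coremail": "...",     # example cookie name; copy what your browser has
    },
)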
2. Attachment extraction and download
- The attachment array in the mail detail is parsed, and a download URL is built for each entry.
- Before saving, the Content-Disposition response header is parsed for the real filename (handling Chinese names and Base64-encoded words); name collisions get an auto-incrementing suffix instead of being overwritten.
- Files are written in streamed chunks, which is more reliable for large attachments.
3. Resume text extraction
- PDF: pdfplumber is tried first, with a fallback to PyPDF2 on failure.
- DOCX: paragraph text is read via python-docx.
- Unsupported formats are logged and skipped.
4. Contact information recognition
- Phone: a regex matching mainland-China mobile-number rules.
- Email: a lenient, case-insensitive email regex.
- Name: several patterns keyed on Chinese semantic cues such as 姓名/本人/我是/简历, combined with a stop-word list to reduce noise.
- Cleaning: symbols are normalized and runs of whitespace collapsed before matching, which improves recognition accuracy. A worked example follows this step.
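A small worked example of what the extractor returns (illustrative input; expected output shown in comments):

# Worked example (illustrative): what the extractor pulls from a short snippet.
sample = "姓名: 张三 电话 13812345678 邮箱 zhangsan@example.com"
phones, emails, names = ResumeParser().extract_contact_info_from_text(sample)
# phones -> ['13812345678']
# emails -> ['zhangsan@example.com']
# names  -> ['张三', ...]  (the loose standalone-word pattern may add noise,
#                          which is exactly what the stop-word list is for)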
5. Candidate information consolidation
- Contact details parsed from the resume files are merged with clues such as the position applied for, inferred from the mail subject/body or context. The merged record lands in the CandidateInfo data class shown in Section III.
- Fields cover name, gender, age, location, education, experience, expected salary, position applied for, phone, email, and extra information.
- Deduplication strategy (one example approach): merge on a weak key of phone/email plus name to avoid duplicate records; when to apply it depends on your data sources and business rules. A sketch follows below.
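A sketch of that weak-key merge, assuming a "first record wins, later duplicates fill gaps" policy:

from typing import Dict, List, Tuple

# Dedup sketch: weak key = (phone or email, name). The merge policy here
# (first record wins, later duplicates only fill empty fields) is an assumption.
def dedup_candidates(candidates: List[CandidateInfo]) -> List[CandidateInfo]:
    merged: Dict[Tuple[str, str], CandidateInfo] = {}
    for c in candidates:
        key = (c.phone or c.email, c.name)
        if key not in merged:
            merged[key] = c
            continue
        kept = merged[key]
        for f in ('gender', 'age', 'location', 'education', 'experience',
                  'salary_expectation', 'position', 'phone', 'email'):
            if not getattr(kept, f) and getattr(c, f):
                setattr(kept, f, getattr(c, f))
    return list(merged.values())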
6. CSV export
- The candidate list is exported to a single CSV file.
- The header contains: name, gender, age, position applied for, location, education, expected salary, phone number, email address.
- Missing fields are written as empty values so the column structure stays stable, which makes later pivoting in Excel/BI tools straightforward. A minimal export sketch follows this step.
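A minimal export sketch matching that header (utf-8-sig is an assumption so Excel detects the encoding; the Chinese header strings mirror the fields listed above):

import csv
from typing import List

# CSV export sketch; field order follows the header listed above.
def export_csv(candidates: List[CandidateInfo], path: str = "candidates.csv") -> None:
    headers = ['姓名', '性别', '年龄', '应聘职位', '地点', '学历', '期望薪资', '电话号码', '邮箱地址']
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        for c in candidates:
            # Missing fields are empty strings, keeping the columns stable.
            writer.writerow([c.name, c.gender, c.age, c.position, c.location,
                             c.education, c.salary_expectation, c.phone, c.email])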
V. GUI and User Experience
- The GUI is built with Tkinter, well suited to quick local runs with no web service to stand up.
- SID/cookies can be entered and the crawl scope configured in the UI before starting a task.
- Progress logs stream to both the window and the mail_crawler.log file; exceptions surface as pop-ups and log entries.
- Clear run-status and export-path hints make the tool usable for non-technical users. A minimal wiring sketch follows this list.
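The GUI source is not reproduced in this article; a minimal sketch of the wiring, assuming the crawl runs on a worker thread so the window stays responsive:

import threading
import tkinter as tk
from tkinter import scrolledtext

# GUI wiring sketch (not the actual implementation). A production version
# would marshal log updates back to the UI thread via root.after or a queue.
def run_gui():
    root = tk.Tk()
    root.title("简历助手")
    sid_var = tk.StringVar()
    tk.Label(root, text="SID:").pack()
    tk.Entry(root, textvariable=sid_var, width=60).pack()
    log_box = scrolledtext.ScrolledText(root, height=15, width=80)
    log_box.pack()
    def start():
        def worker():
            config = MailConfig(sid=sid_var.get())  # cookies omitted in this sketch
            count = Mail163Crawler(config).crawl_all_attachments()
            log_box.insert(tk.END, f"Downloaded {count} attachments\n")
        threading.Thread(target=worker, daemon=True).start()
    tk.Button(root, text="开始", command=start).pack()
    root.mainloop()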
VI. Robustness and Engineering Practices
- Exception safety: network requests, parsing, and filesystem writes are wrapped in try/except; failures are logged and the remaining tasks continue.
- Rate control: request_delay paces the crawl to avoid tripping server-side throttling.
- Filename safety: filenames are sanitized uniformly so illegal characters cannot break saves.
- Optional dependencies: pdfplumber/PyPDF2/python-docx/BeautifulSoup all degrade gracefully when missing ("use it if available"), keeping the tool runnable across environments; the guarded-import pattern is sketched below.
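That "use it if available" degradation corresponds to the guarded-import pattern that the parser's None checks rely on:

# Guarded optional imports: each backend becomes None when unavailable,
# matching the `if pdfplumber:` / `if not docx:` checks in ResumeParser.
try:
    import pdfplumber
except ImportError:
    pdfplumber = None
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None
try:
    import docx
    from docx import Document
except ImportError:
    docx = None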