[Python下载CSDN博客]4. V3版本_使用正则表达式分析HTML(二)_h3.search(i[0]).group(2).strip() if h3.search(i[0]-CSDN博客

本文介绍了一个用于抓取CSDN博客文章内容及图片的Python爬虫程序，通过解析HTML源码来提取文章标题、正文和图片链接，并将其保存至本地。此外，还详细展示了如何使用正则表达式进行内容抽取。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

3.3 分析其中一篇文章的HTML

使用浏览器查看 http://blog.csdn.net/bagboy_taobao_com/article/details/5582868 的HTML, 并保存为article.html (保存的格式必须为UTF8, 否则会乱码). 双击打开article.html, 可以正确显示. OK, 可以用文本编辑器打开分析.

3.3.1 文章标题和内容的HTML

<div id="article_details" class="details">
    <div class="article_title">
		<span class="ico ico_type_Original"></span>
		<h3>
			<span class="link_title"><a href="/bagboy_taobao_com/article/details/5582868">
        递归目录的所有文件(文章标题)
			</a></span>
		</h3>
	</div>
	......
    
	<div id="article_content" class="article_content">
		文章的内容, 包括所有标签
	</div>
</div>

文章标题部分可以构造正则表达式<span class="link_title"><a href="/bagboy_taobao_com/article/details/.*</a></span>得到. 再构造正则表达式<span class="link_title"><a href="/bagboy_taobao_com/article/details/\d+">|</a></span>得到标题

文章内容部分可以构造正则表达式<div id="article_content" class="article_content">.*</div>得到.(这里有一个问题: 如果这个div之间嵌有其他的div, 那么这个正则表达式是不能正确工作的. 自己暂时不会构造能处理嵌套的正则表达式)

3.3.2 处理文章内容中的图片

有一些文章上传了一些图片, 这里要求下载到本地, 并对文章内容中图片的连接转成本地连接.

看下面文章内容中的HTML

<img src="https://img-blog.csdn.net/20131026083038921?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYmFnYm95X3Rhb2Jhb19jb20=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast" alt=""><br>
<img src="https://img-blog.csdn.net/20131026083043656?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYmFnYm95X3Rhb2Jhb19jb20=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast" alt=""><br>

(CSDN改了这个img标签的格式了?)

找到<div id="article_content" class="article_content">标签后, 再查找该标签中的所有img标签, 并提取图片的url. 由于<div id="article_content" class="article_content">的内容还需要保存到本地, 所以img标签中的url也要对应修改成本地的路径.

从img的html可以看到, 只要提取<img src="与" alt="">之间的数据即可. 所以构造正则表达式(?<=<img src=").*?(?=" alt="">) 即可. 而且需要替换成本地路径.

3.3.3 如何输出

#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
# 获取博客文章
# File: GetArticle.py
import urllib2
import httplib
import re

class CHYGetArticle:
	'''
	Extract title, body and image list from the HTML of one CSDN article page.

	The page is analysed with regular expressions only; no HTML parser is used.
	'''

	# Title link: <span class="link_title"><a href="/<blog>/article/details/<id>">
	# The blog name is matched generically so the class is not tied to one blog.
	_TITLE_RE = re.compile(
		r'<span class="link_title"><a href="/[^/"]+/article/details/\d+">(.*?)</a></span>',
		re.S)
	# Opening tag of the element that holds the article body.
	_CONTENT_OPEN = '<div id="article_content" class="article_content">'
	# Every opening or closing <div> tag; used to balance nested divs.
	_DIV_TOKEN_RE = re.compile(r'<div\b|</div\s*>')
	# URL of an <img src="..." alt=""> tag. [^"]+ keeps a match inside one
	# attribute value and never matches an empty URL.
	_IMG_URL_RE = re.compile(r'(?<=<img src=")[^"]+(?=" alt="">)')

	def Parser(self, htmlStr, article):
		'''
		Fill *article* in place from the HTML source of an article page.

		Args:
			htmlStr: HTML source of the article page.
			article: three-slot list [title, content, images]. Slot 0 receives
				the cleaned title, slot 1 the content <div> with image URLs
				rewritten to local file names, and [url, local_name] pairs
				are appended to the list held in slot 2.

		Raises:
			IndexError: the title link or the content <div> cannot be found
				(the same exception type the original indexing raised).
		'''
		article[0] = self._ExtractTitle(htmlStr)
		content = self._ExtractContent(htmlStr)
		article[1] = self._LocalizeImages(content, article[2])

	def _ExtractTitle(self, htmlStr):
		'''Return the article title with line breaks and outer blanks removed.'''
		found = self._TITLE_RE.search(htmlStr)
		if found is None:
			raise IndexError("article title link not found in HTML")
		# Drop every CR/LF (the original only removed the sequence "\n\r").
		title = found.group(1).replace("\r", "").replace("\n", "")
		return title.strip()

	def _ExtractContent(self, htmlStr):
		'''Return the whole article_content <div>, including nested <div>s.'''
		start = htmlStr.find(self._CONTENT_OPEN)
		if start < 0:
			raise IndexError("article content <div> not found in HTML")
		# Count opening and closing div tags until the one opened at `start`
		# is closed again; a plain non-greedy regex would stop at the first
		# nested </div>.
		depth = 0
		for token in self._DIV_TOKEN_RE.finditer(htmlStr, start):
			if token.group(0).startswith("</"):
				depth -= 1
			else:
				depth += 1
			if depth == 0:
				return htmlStr[start:token.end()]
		# Unbalanced markup: fall back to cutting at the first </div>.
		end = htmlStr.find("</div>", start)
		if end < 0:
			raise IndexError("article content <div> is never closed")
		return htmlStr[start:end + len("</div>")]

	def _LocalizeImages(self, content, images):
		'''Rewrite image URLs to "<n>.jpg" and record [url, name] in *images*.'''
		localNames = {}

		def Localize(match):
			url = match.group(0)
			if url not in localNames:
				# A repeated URL reuses its first local name, so it is
				# downloaded only once.
				localNames[url] = str(len(localNames) + 1) + ".jpg"
				images.append([url, localNames[url]])
			return localNames[url]

		# Substituting at the matched positions (instead of str.replace on the
		# whole text) keeps URLs that share a prefix from corrupting each other.
		return self._IMG_URL_RE.sub(Localize, content)
'''
# http://blog.csdn.net/bagboy_taobao_com/article/details/13090313
# 测试代码
if __name__ == '__main__':
	conn = httplib.HTTPConnection("blog.csdn.net")
	# 要模拟成IE发送, 否则CSDN不接受Python的请求
	user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'    
	headersP = { 'User-Agent' : user_agent }
	conn.request(method = "GET", url = "/bagboy_taobao_com/article/details/13090313", headers = headersP)
	r1 = conn.getresponse()				# 获得响应
	htmlByte = r1.read()				# 获得HTML
	my = CHYGetArticle()
	article = [None, None, []]
	my.Parser(htmlByte, article)
	f = open("data.html", "w")
	print >> f, '<html xmlns="http://www.w3.org/1999/xhtml">'
	print >> f, '<head><title>',
	print >> f, article[0],
	print >> f, '</title>'
	print >> f, '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
	print >> f, '</head>'
	print >> f, '<body>'
	print >> f, article[0], 			# print最后参数加一个"逗号", 这样就输出最后不换行
	print >> f, article[1]
	print >> f, '</body>'
	print >> f, '</html>'
	
	# 保存图片
	#for img in article[2]
		# 下载图片
'''

3.4 主程序

提取分类列表, 提取某分类列表和提取文章内容都实现, 现在把它们整合即可.

3.4.1 提取策略

1. 提取分类列表(或者存档列表), 每一类创建一个目录(目录名是分类名或者存档日期).

2. 提取每一类的文章.

3. 每一篇文章一个目录, 文章内容在该目录的article.txt文件中.

3.4.2 主程序代码

#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
import os
import GetCategoryAndMonth
import GetArticleList
import GetArticle

import urllib2
import httplib

def GetTypeList(host, blogName, list, type):
	'''
	Fetch the blog home page and collect its type list.

	Requests "/" + blogName from host over HTTP and passes the raw response
	body to GetCategoryAndMonth.CHYGetCategoryAndMonth.Parser together with
	type and list.

	Args:
		host: host name to connect to.
		blogName: blog name; used as the request path.
		list: passed to the parser as its output argument.
		type: forwarded unchanged to the parser; the main script passes 1
			for the category list. NOTE(review): other values presumably
			select the monthly archive - confirm in GetCategoryAndMonth.
	'''
	conn = httplib.HTTPConnection(host)
	try:
		# Pose as IE; CSDN rejects requests carrying Python's default agent.
		headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
		conn.request(method = "GET", url = "/" + blogName, headers = headersP)
		htmlByte = conn.getresponse().read()
	finally:
		# Always release the socket, even when the request fails.
		conn.close()
	# The parser receives the undecoded bytes; the previously computed (and
	# never used) UTF-8 decoded copy has been dropped.
	my = GetCategoryAndMonth.CHYGetCategoryAndMonth()
	my.Parser(htmlByte, type, list)

def GetTypeArticleList(host, articleListUrl, list):
	'''
	Fetch one type's listing page and collect its article list.

	Requests articleListUrl from host over HTTP and passes the raw response
	body to GetArticleList.CHYGetArticleList.Parser together with list.

	Args:
		host: host name to connect to.
		articleListUrl: request path of the listing page.
		list: passed to the parser as its output argument.
	'''
	conn = httplib.HTTPConnection(host)
	try:
		# Pose as IE; CSDN rejects requests carrying Python's default agent.
		headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
		conn.request(method = "GET", url = articleListUrl, headers = headersP)
		htmlByte = conn.getresponse().read()
	finally:
		# Always release the socket, even when the request fails.
		conn.close()
	# The parser receives the undecoded bytes; the previously computed (and
	# never used) UTF-8 decoded copy has been dropped.
	my = GetArticleList.CHYGetArticleList()
	my.Parser(htmlByte, list)

def GetArticleFun(host, articleUrl, article):
	'''
	Fetch one article page and extract its content.

	Requests articleUrl from host over HTTP and passes the raw response body
	to GetArticle.CHYGetArticle.Parser together with article.

	Args:
		host: host name to connect to.
		articleUrl: request path of the article page.
		article: passed to the parser, which fills it in place; the main
			script supplies [None, None, []] (title, content, image list).
	'''
	conn = httplib.HTTPConnection(host)
	try:
		# Pose as IE; CSDN rejects requests carrying Python's default agent.
		headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
		conn.request(method = "GET", url = articleUrl, headers = headersP)
		htmlByte = conn.getresponse().read()
	finally:
		# Always release the socket, even when the request fails.
		conn.close()
	# The parser receives the undecoded bytes; the previously computed (and
	# never used) UTF-8 decoded copy has been dropped.
	my = GetArticle.CHYGetArticle()
	my.Parser(htmlByte, article)

def ValidFileName(fileName):
	'''
	Return fileName as text with characters unusable in a file name removed.

	Args:
		fileName: UTF-8 encoded byte string, or already-decoded text.

	Returns:
		The decoded name without the characters \\ / : * ? " < > | ' and
		without control characters (code points below 0x20).
	'''
	# Only byte strings need decoding; text is accepted as it is.
	if isinstance(fileName, bytes):
		fileName = fileName.decode("utf8")
	# Characters Windows rejects in file names, plus the single quote that
	# was already being stripped.
	forbidden = u'\\/:*?"<>|\''
	return u"".join(ch for ch in fileName if ch not in forbidden and ch >= u" ")
	
def DownImg(imgUrl, name):
	'''
	Download one image and write it to the local file name.

	Args:
		imgUrl: image URL as found in the article HTML.
		name: path of the local file to create (opened in binary mode).
	'''
	conn = httplib.HTTPConnection("img.blog.csdn.net")
	try:
		# Pose as IE; CSDN rejects requests carrying Python's default agent.
		headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
		# Strip the URL prefix so only the path is sent to the server.
		# NOTE(review): the prefix removed here does not match the host
		# connected to above - confirm which of the two is intended.
		path = imgUrl.replace("https://img-blog.csdn.net", "")
		conn.request(method = "GET", url = path, headers = headersP)
		data = conn.getresponse().read()
	finally:
		# Always release the socket, even when the request fails.
		conn.close()
	# The with-block closes the file even if the write raises.
	with open(name, "wb") as f:
		f.write(data)
	
if __name__ == '__main__':
	# Download the whole blog into F:\<blogName>: one directory per type,
	# and inside it one directory per article holding article.html plus
	# the article's images.
	host = "blog.csdn.net"
	blogName = "bagboy_taobao_com"
	blogDir = "F:" + os.sep + blogName
	# Create directories only when missing so an interrupted run can be
	# started again without failing on the first mkdir.
	if not os.path.isdir(blogDir):
		os.mkdir(blogDir)
	# Fetch the type list (1 = categories, as passed by the original script)
	listType = []
	GetTypeList(host, blogName, listType, 1)
	for listTypeItem in listType:
		# listTypeItem[0] is used as the listing URL, listTypeItem[1] as the
		# directory name.
		typeDir = blogDir + os.sep + listTypeItem[1]
		if not os.path.isdir(typeDir):
			os.mkdir(typeDir)
		listArticle = []
		GetTypeArticleList(host, listTypeItem[0], listArticle)
		for listArticleItem in listArticle:
			article = [None, None, []]	# title, content, image list
			GetArticleFun(host, listArticleItem, article)
			# Directory name: "<article id>_<sanitised title>"
			articleDir = typeDir + os.sep + listArticleItem.replace("/" + blogName + "/article/details/", "") + "_" + ValidFileName(article[0])
			if not os.path.isdir(articleDir):
				os.mkdir(articleDir)
			title = articleDir + os.sep + "article.html"
			# The with-block flushes and closes article.html before the
			# images are downloaded, even if a later step fails.
			with open(title, 'w') as f:
				print >> f, '<html xmlns="http://www.w3.org/1999/xhtml">'
				print >> f, '<head><title>',
				print >> f, article[0],
				print >> f, '</title>'
				print >> f, '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
				print >> f, '</head>'
				print >> f, '<body>'
				print >> f, article[0],		# trailing comma: no newline after the title
				print >> f, article[1]
				print >> f, '</body>'
				print >> f, '</html>'

			# Download every image next to article.html under its local name
			for imgItem in article[2]:
				name = articleDir + os.sep + imgItem[1]
				DownImg(imgItem[0], name)

主程序基本与V2版本一样, 但也有一些改变, 就是有些地方需要添加encode, 有些不需要.