import urllib.request
from lxml import etree
import random
import json
# Global variables
number = 0       # count of downloaded documents
text_dict = {}   # main-text URLs that failed to download
att_dict = {}    # attachment URLs that failed to download
# Fetch a document page and download its main text and attachments
def download_page(url):
    ua_list = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0',
               'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0'
               ]
    ua = random.choice(ua_list)
    headers = {
        'Cookie': '',
        'User-Agent': ua
    }
    request = urllib.request.Request(url=url, headers=headers)
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    response = opener.open(request)
    content = response.read().decode('utf-8')
    tree = etree.HTML(content)
    # Main-text link and document name
    src_zw_list = tree.xpath('//div[@id="yaosudiv"]/table//tr[2]/td[2]/a[1]/@href')
    src_name_list = tree.xpath('//div[@id="yaosudiv"]/table/tbody/tr[1]/td/text()')
    url_zw = src_zw_list[0]
    src_name = src_name_list[0]
    # Download the main text
    tag = download_zw(url, url_zw, src_name)
    # Attachment links and names (some documents have no attachments)
    href_fj_list = tree.xpath('//div[@id="yaosudiv"]/table/tbody/tr[4]/td[2]/div/a/@href')
    name_fj_list = tree.xpath('//div[@id="yaosudiv"]/table/tbody/tr[4]/td[2]/div/a/text()')
    # Download the attachments (if any)
    if href_fj_list:
        for i in range(len(name_fj_list)):
            href = href_fj_list[i]
            name = name_fj_list[i]
            download_fj(href, name, tag)
    else:
        print('attachment is empty!', url)
# Download the main text
def download_zw(url, url_zw, name):
    global number
    global text_dict
    tag = 1
    ua_list = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0',
               'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0'
               ]
    ua = random.choice(ua_list)
    headers = {
        'Cookie': '',
        'User-Agent': ua
    }
    try:
        # Request the main-text file itself (url_zw), not the document page
        request = urllib.request.Request(url=url_zw, headers=headers)
        handler = urllib.request.HTTPHandler()
        opener = urllib.request.build_opener(handler)
        response = opener.open(request)
        content = response.read()
        number = number + 1  # count of downloaded documents
        print(number, name, 'main text downloaded!', url_zw)
        filename = './download/' + name + '.docx'
        with open(filename, 'wb+') as file:
            file.write(content)
    except Exception:
        tag = 0
        text_dict[name] = url
        with open("text_dict.json", "a+") as fp:
            json.dump(text_dict, fp)
        print("text url error!", url)
    return tag
# Download an attachment
def download_fj(href, name, tag):
    global att_dict
    # File extension
    # base, ext = os.path.splitext(name)
    # Only download .docx attachments
    # if ext != '.pdf':
    if tag == 1:
        url = 'http://oa.fynu.edu.cn' + href
        ua_list = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0'
                   ]
        ua = random.choice(ua_list)
        headers = {
            'Cookie': '',
            'User-Agent': ua
        }
        try:
            request = urllib.request.Request(url=url, headers=headers)
            handler = urllib.request.HTTPHandler()
            opener = urllib.request.build_opener(handler)
            response = opener.open(request)
            content = response.read()
            print(name, 'attachment downloaded!', url)
            filename = './download/' + name
            with open(filename, 'wb') as file:
                file.write(content)
        except Exception:
            att_dict[name] = url
            with open("att_dict.json", "a+") as fp:
                json.dump(att_dict, fp)
            print('attachment url error! ', url)
# Parse the hrefs on the list (first-level) page
def parse(content):
    tree = etree.HTML(content)
    href_list = tree.xpath('//div[@id="contentdiv"]//tr/td[3]//a/@href')
    for i in range(len(href_list)):
        href = href_list[i]
        url = 'http://oa.fynu.edu.cn' + href
        # Visit the second-level (document) page and download its files
        download_page(url)
# Fetch the content of a list page
def get_content(page):
    url = 'http://oa.fynu.edu.cn/document/documentread/documentlist/all/all/' + str(page)
    ua_list = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0',
               'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0'
               ]
    ua = random.choice(ua_list)
    headers = {
        'Cookie': '',
        'User-Agent': ua
    }
    request = urllib.request.Request(url=url, headers=headers)
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    response = opener.open(request)
    content = response.read().decode('utf-8')
    return content
if __name__ == '__main__':
    for page in range(0, 12640, 20):
        print(f"page = {page}")
        # Fetch the list page
        content = get_content(page)
        # Parse the hrefs and download each document
        parse(content)
Problem: requests are easily interrupted, so the crawl often breaks off partway through.
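Before moving to Scrapy, one simpler mitigation would be to retry a failed request a few times before giving up. This is only a sketch, not part of the original script; the helper name, retry count, and timeout are assumptions.

import time
import urllib.request

def open_with_retry(request, retries=3, delay=2):
    # Try the request several times, pausing between attempts.
    for attempt in range(retries):
        try:
            return urllib.request.urlopen(request, timeout=30)
        except Exception as e:
            print('retry', attempt + 1, 'failed:', e)
            time.sleep(delay)
    raise RuntimeError('request still failing after retries')

Each opener.open(request) call above could then be replaced with open_with_retry(request).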
Scrapy framework:
import json
from urllib.parse import urljoin

import scrapy

number = 0
page_end = 0

class FyncSpider(scrapy.Spider):
    name = "fync"
    allowed_domains = ["oa.fynu.edu.cn"]

    # Generate one request per list page
    def start_requests(self):
        for page in range(0, 12680, 20):
            url = 'http://oa.fynu.edu.cn/document/documentread/documentlist/all/all/' + str(page)
            yield scrapy.Request(url)

    # Parse the list (first-level) page
    def parse(self, response):
        href_list = response.xpath('//div[@id="contentdiv"]//tr/td[3]//a/@href')
        name_zw_list = response.xpath('//div[@id="contentdiv"]//tr/td[3]//a/text()')
        for i in range(len(href_list)):
            name_zw = name_zw_list[i].extract().split('\n')[0]
            href = href_list[i].extract()
            url_second = urljoin('http://oa.fynu.edu.cn', href)
            # Visit the second-level (document) page
            yield scrapy.Request(url=url_second,
                                 callback=self.second_parse,
                                 meta={'name_zw': name_zw}
                                 )

    # Parse the second-level (document) page
    def second_parse(self, response):
        # Main-text link and name
        name_zw = response.meta['name_zw']
        src_zw_list = response.xpath('//div[@id="yaosudiv"]/table//tr[2]/td[2]/a[1]/@href')
        url_zw = src_zw_list[0].extract()
        # Request the main-text download
        yield scrapy.Request(url=url_zw, callback=self.download_zw, meta={'url': url_zw, 'name': name_zw})
        # Attachment links and names
        href_fj_list = response.xpath('//div[@id="yaosudiv"]/table/tbody/tr[4]/td[2]/div/a/@href')
        name_fj_list = response.xpath('//div[@id="yaosudiv"]/table/tbody/tr[4]/td[2]/div/a/text()')
        # Request the attachment downloads (if any)
        if href_fj_list:
            for i in range(len(name_fj_list)):
                href_fj = href_fj_list[i].extract()
                name_fj = name_fj_list[i].extract()
                url_fj = 'http://oa.fynu.edu.cn' + href_fj
                yield scrapy.Request(url=url_fj, callback=self.download_fj, meta={'url': url_fj, 'name': name_fj})

    # Download the main text
    def download_zw(self, response):
        global number
        number = number + 1
        name = response.meta['name']
        url = response.meta['url']
        content = response.body
        print(number, name, 'main text downloaded!')
        filename = './zw/' + name + '.docx'
        try:
            with open(filename, 'wb') as file:
                file.write(content)
        except Exception:
            print('text write error', url)
            dict_zw = {name: url}
            with open("text_dict_zw.json", "a", encoding='utf-8') as fp:
                json.dump(dict_zw, fp, ensure_ascii=False, indent=4)

    # Download an attachment
    def download_fj(self, response):
        name = response.meta['name']
        url = response.meta['url']
        print(name, 'attachment downloaded!')
        filename = './fj/' + name
        try:
            with open(filename, 'wb') as file:
                file.write(response.body)
        except Exception:
            print('attachment write error!', url)
            dict_fj = {name: url}
            with open("dict_fj.json", "a", encoding='utf-8') as fp:
                json.dump(dict_fj, fp, ensure_ascii=False, indent=4)
Result:
Error: 31 main-text records were never written out.
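One way to find out which requests were dropped is to attach a Scrapy errback so every failure gets logged. This is only a sketch under assumptions: the method name log_failure is hypothetical and is not part of the spider above; it would be added as a method of FyncSpider.

    # Hypothetical errback: log any request Scrapy gives up on,
    # so the missing records can be collected and retried later.
    def log_failure(self, failure):
        self.logger.error('request failed: %s', repr(failure))

    # e.g. when yielding the main-text request in second_parse:
    # yield scrapy.Request(url=url_zw, callback=self.download_zw,
    #                      errback=self.log_failure,
    #                      meta={'url': url_zw, 'name': name_zw})

Raising RETRY_TIMES and setting a DOWNLOAD_DELAY in settings.py may also reduce the number of dropped requests.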
Server:
Model: chatglm3-6b-32k
from utils import load_model_on_gpus
from modelscope import AutoTokenizer, AutoModel, snapshot_download

tokenizer = AutoTokenizer.from_pretrained('/home/czy/project/GLM/chatglm3-6b-32k/', trust_remote_code=True)
# Shard the model across 8 GPUs
model = load_model_on_gpus("/home/czy/project/GLM/chatglm3-6b-32k/", num_gpus=8)
model = model.eval()
response, history = model.chat(tokenizer, "请给我说一个笑话。", history=[])  # prompt: "Tell me a joke."
print(response)
Output:
Langchain-Chatchat:
Map the server port to the local machine (e.g., via SSH local port forwarding) and open it in a browser.
Connection timeout: