import requests
import json
import time
from fake_useragent import UserAgent
import pandas
from lxml import etree
# Shared fake_useragent generator; the crawl loop reads `ua.random` to fill
# the 'user-agent' header of every request.
ua = UserAgent()
# Accumulates one {'公司名称': <name>} dict per scraped company; the main block
# dumps this list to data.json once crawling ends.
json_list = []
def json_to_excel(json_file, excel_file):
    """Convert a JSON file holding a list of objects into an Excel sheet.

    Every object in the JSON array becomes one spreadsheet row whose cells
    are the object's values, in the order they appear in the object. Keys
    are not written: the sheet has no header row and no index column.

    Args:
        json_file: Path of the UTF-8 encoded JSON file to read. Its top-level
            value is iterated and each element is treated as a dict.
        excel_file: Destination handed to ``DataFrame.to_excel``.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        records = json.load(f)

    # Only the values end up in the sheet, so take them directly instead of
    # walking (key, value) pairs and discarding the key.
    rows = [list(record.values()) for record in records]

    frame = pandas.DataFrame(rows)
    # header=False / index=False: write the raw cells only, without pandas'
    # numeric column labels or row numbers.
    frame.to_excel(excel_file, sheet_name='Sheet1', index=False, header=False)
if __name__ == '__main__':
    # Crawl the exhibitor list page by page, collect every company name into
    # json_list, then persist the result as data.json and data.xlsx.

    # NOTE(review): this URL (and the origin/referer headers below) route
    # through siteproxy.ruqli.workers.dev rather than www.cmef.com.cn directly -
    # confirm that is intended and not an artifact of where the script was
    # copied from.
    url = 'https://www.cmef.com.cn/exhibitorlist/ExhibitorQuery'
    last_page = 666  # pages are requested from 1 through last_page inclusive

    # (exhibition id, IsShow flag) pairs. The payload carries the same ids
    # twice, in the same order: once as a flat list and once as Id/IsShow
    # objects, so both are derived from this single table.
    exhibitions = [
        ("e9defafb-5d6b-42d2-b523-c2abad413c75", 1),
        ("d2f77a00-c615-4cb6-85b4-3058f515c0ab", 0),
        ("af920968-535d-4b73-aa41-f1d092077e34", 0),
        ("5c096d71-c5b8-4432-a0dc-7f3175b25596", 1),
        ("601156ff-abf5-439f-83df-8e5f9ef180f2", 0),
        ("a20542d0-3f06-4163-bc4b-26f6cac7d4b3", 0),
        ("667ed22f-b921-4910-87b1-6ed9575ce6b1", 1),
        ("f7dc8fe5-c2ce-41ba-93df-80b1c632dac4", 0),
        ("678f8c16-9474-4762-b012-9d5dad231e5d", 0),
    ]
    exhibition_ids = [ex_id for ex_id, _ in exhibitions]
    exhibition_map = [
        {"Id": ex_id, "IsShow": is_show} for ex_id, is_show in exhibitions
    ]

    for num in range(1, last_page + 1):
        # Request parameters; only pageIndex changes between pages.
        data = {
            "exhibitionIds": exhibition_ids,
            "categories": [],
            "text": "",
            "pageSize": 12,
            "pageIndex": num,
            "exhibitionMap": exhibition_map,
        }
        headers = {
            # The trailing Hm_lpvt_* cookie value is the current Unix time in
            # whole seconds, refreshed for every request.
            'cookie': '1839d162ecb47855294207; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221839d162e1c6cb-0a9b1aa3e1e255-78565470-2073600-1839d162e1da35%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%221839d162e1c6cb-0a9b1aa3e1e255-78565470-2073600-1839d162e1da35%22%7D; Hm_lvt_a15b989525609a6596aad6539529f6a6=1666081914,1666143526,1666175451,1666231489; Hm_lpvt_a15b989525609a6596aad6539529f6a6={}'.format(int(time.time())),
            'origin': 'https://www.cmef.com.cn',
            'referer': 'https://www.cmef.com.cn/exhibitorlist?cid=18',
            'user-agent': ua.random,  # fresh random User-Agent per request
        }

        # NOTE(review): `data=` makes requests form-encode the payload, which
        # cannot represent the nested exhibitionMap objects faithfully. If
        # the endpoint expects a JSON body, this should be `json=data` -
        # confirm against the site before changing.
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            res = requests.post(url, headers=headers, data=data, timeout=30).text
        except requests.RequestException as exc:
            # Stop crawling but fall through to the save step so the pages
            # already collected are not lost.
            print(f'Request for page {num} failed, stopping crawl: {exc}')
            break
        print(res)

        html = etree.HTML(res)
        # NOTE(review): etree.HTML appears to return None for bodies it
        # cannot build a tree from; treat that as a page with no companies
        # instead of failing on `.xpath` - confirm against the lxml docs.
        if html is None:
            company_list = []
        else:
            company_list = html.xpath('//div[@class="exc-item-title inner"]//text()')

        for simple_name in company_list:
            json_list.append({'公司名称': simple_name})  # one record per company
            print(simple_name)
        print(f'------------------第{num}页数据已爬完------------------')
        time.sleep(1)  # pause between pages

    # Write the JSON through a context manager so the file is flushed and
    # closed before json_to_excel reads it back.
    with open('data.json', 'w', encoding='utf-8') as f:
        json.dump(json_list, f, indent=4, ensure_ascii=False)
    json_to_excel("data.json", "data.xlsx")
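# Python crawler script: an approach to scraping dynamically loaded content.
# (Source page note: latest recommended article published 2025-06-09 16:16:49.)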