目录
1.利用python爬虫,爬取某网站的坚果销售数据,保存到csv文件里
3.先将处理好的csv文件上传到虚拟机,进而导入到HDFS,在MySQL中建表并导入数据,为相关的查询做准备
1.利用python爬虫,爬取某网站的坚果销售数据,保存到csv文件里
import json
import pprint
import requests
import re
import csv
import random
import time
# Part 1: scrape Taobao "坚果" (nuts) search results into taobao.csv.
# Fixes vs. the original:
#   * encoding='ANSI' is not a registered Python codec (open() raises
#     LookupError); 'gbk' is used instead, matching the encoding the later
#     processing steps read this file with.
#   * `dict` shadowed the builtin; renamed to `row`.
#   * the g_page_config extraction is guarded so an anti-bot/login page is
#     skipped instead of crashing the whole run with an IndexError.
#   * the request headers (incl. the session cookie) are loop-invariant, so
#     they are built once; the full-page debug prints were removed.
# NOTE(review): this depends on Taobao embedding its results as a JS
# `g_page_config` object and on a fresh login cookie — it breaks when either
# changes.
headers = {
    'cookie': 'cna=JRtIHEi/p1ACASdCqY3qAKSb; xlly_s=1; sgcookie=E1005kad2%2FEzLAUmGeQ%2F9nHN0mEMWMReM%2FPLr4c9McQijU%2BXLQBECImdNfX6UW4W%2F%2FiRYcQ%2BTJIBi0d8Qga7RZdfcjDnYZabJlaIGGsPfzyCF7LEWbn237uYfu7lNxvMARLa; uc3=nk2=3rRLEkTloeY%3D&lg2=W5iHLLyFOGW7aA%3D%3D&id2=UUphy%2FLnSwESFrdcHQ%3D%3D&vt3=F8dCsGO0QnRL7Jy6eAk%3D; lgc=%5Cu89E3%5Cu5FE7%5Cu4E0A%5Cu90AA; uc4=nk4=0%403P0r2A9OmGhnRSWl66WY%2BTyXjA%3D%3D&id4=0%40U2grEJWZ8N%2FGr5ALdvjvq1UfiKVgKnYi; tracknick=%5Cu89E3%5Cu5FE7%5Cu4E0A%5Cu90AA; _cc_=WqG3DMC9EA%3D%3D; thw=cn; _m_h5_tk=146c8428c3f864a19652c98f2f2a68fe_1688114715735; _m_h5_tk_enc=38be48995f170927e508803cbbb9920e; mt=ci=-1_0; t=dcae55510abc79106be72b7eb4f95b52; _tb_token_=eb3eb83df3513; uc1=cookie14=Uoe8gqFe4ltWug%3D%3D; cookie2=221bb94bc4df83cff1e1652affd4d6e5; JSESSIONID=D6D9AFCA50BBA1768988814138520995; tfstk=dIkWH6sFBab5DvaIqTtqG4OCH_2B049NPMZKjDBPv8e8JjiiRz-HZ8lbRq3byT-kZWUIr8FnayzzRJin5n8wQdooqJVp7FJNZxmuK7Y20ZJZq02nuCg2BdrITBSAfEs1UZtK4dWfHOEExWTdz1fihoL3W09RLoMbV7at2xQfQqWa5Fh1OiNGdoawcn1htfHivpf..; l=fBx2WJEnN2n2ihEoBO5Bhurza77tNIOb4PVzaNbMiIEGa69F9LFVoNC1LX4XWdtjgTCjfetrl5ZFzdLHR3AmiNAJz3h2q_omFxvOhocfR; isg=BG1tOhv09VIKUpE-L5F5hyvofAnnyqGcZ4ZI1a9zjITSJo3YdxtfbTH0FPrAo7lU',
    'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
}
with open('taobao.csv', 'w', encoding='gbk', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=['标题', '价格', '购买人数', '地点', '店铺名称', '店铺网址'])
    writer.writeheader()
    for page in range(1, 6):
        # s={page*44}: Taobao paginates 44 listings per results page.
        url = f'https://s.taobao.com/search?q=%E5%9D%9A%E6%9E%9C&suggest=history_4&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306&_input_charset=utf-8&wq=&suggest_query=&source=suggest&bcoffset=4&ntoffset=4&p4ppushleft=2%2C48&s={page * 44}'
        time.sleep(random.randint(1, 4))  # random polite delay between requests
        response = requests.get(url=url, headers=headers)
        # The result data lives inside "g_page_config = {...};" in the page JS.
        matches = re.findall('g_page_config = (.*);', response.text)
        if not matches:
            print(f'第{page}页未找到g_page_config,可能被反爬拦截,跳过')
            continue
        json_data = json.loads(matches[0])
        for item in json_data['mods']['itemlist']['data']['auctions']:
            row = {
                '标题': item['raw_title'],
                '价格': item['view_price'],
                '购买人数': item['view_sales'],
                '地点': item['item_loc'],
                '店铺名称': item['shopName'],
                '店铺网址': item['shopLink']
            }
            writer.writerow(row)
            print(row)
print('爬虫完成')
2.利用爬虫并存储到csv文件的数据,进行数据筛选处理
#数据单位化简
# import sklearn.decomposition as dp
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import StandardScaler
# import matplotlib.pyplot as plt
# data=pd.read_csv(r'C:\Users\86138\Desktop\坚果课设\taobao3.csv',encoding='gbk')
#
# data['购买人数'].loc[0].replace('人付款','')
# for i in range(0,1754):
# data['购买人数'].loc[i]=data['购买人数'].loc[i].replace('人付款','')
# print(data['购买人数'])
# for i in range(0,1754):
# data['购买人数'].loc[i]=data['购买人数'].loc[i].replace('万','0000')
# print(data['购买人数'])
# data.to_csv(r'C:\Users\86138\Desktop\坚果课设\处理后.csv', index=False)
import csv
import pandas as pd
# df=pd.read_csv('处理后.csv',encoding='gbk');
# df=df.drop(columns='店铺网址')
# for i in range(0,1754):
# df['购买人数'].loc[i]=df['购买人数'].loc[i].replace("+","")
# print(df.columns)
# df.to_csv('最终数据.csv',index=False)
#删除销量<1000的数据
# Part 2a: keep only rows whose sales count (购买人数, column index 2) is >= 1000,
# writing 最终数据.csv -> 筛选.csv with the header preserved.
# NOTE(review): the original also loaded 最终数据.csv into an unused pandas
# DataFrame (`df`); that dead full-file read was dropped.
with open("最终数据.csv", 'r', encoding='gbk') as load_file, \
        open("筛选.csv", 'w', encoding='gbk', newline='') as out_file:
    ereader = csv.reader(load_file)
    ewriter = csv.writer(out_file)
    # Copy the header row straight through.
    ewriter.writerow(next(ereader))
    for row_list in ereader:
        # The sales column was pre-cleaned ("人付款"/"万"/"+" removed earlier),
        # so it is assumed to parse as a plain integer.
        if int(row_list[2]) >= 1000:
            ewriter.writerow(row_list)
# Part 2b: classify by location — extract rows whose 地点 (column index 3)
# is exactly "上海", writing 筛选.csv -> 地区.csv.
# The output deliberately has no header row (the original left the header
# write-through commented out); the header is skipped explicitly here, which
# is equivalent since its location cell ("地点") never equals "上海".
# NOTE(review): the original's unused pandas read of 筛选.csv was dropped.
with open("筛选.csv", 'r', encoding='gbk') as load_file, \
        open("地区.csv", 'w', encoding='gbk', newline='') as out_file:
    ereader = csv.reader(load_file)
    ewriter = csv.writer(out_file)
    next(ereader, None)  # drop the header row; safe on an empty file
    for row_list in ereader:
        if row_list[3] == "上海":
            ewriter.writerow(row_list)
3.先将处理好的csv文件上传到虚拟机,进而导入到HDFS,在MySQL中建表并导入数据,为相关的查询做准备
利用filezilla把csv文件传输到虚拟机的data文件夹下
注意:利用filezilla上传的分别是数据.csv和最终数据.csv
显示一堆乱码:
windows系统里用的是gbk编码,linux里默认使用utf-8
就是把上传到linux的csv文件从gbk编码格式转成utf-8就行了
iconv -f gbk -t utf-8 最终数据.csv > 最终数据1.csv
//将本地文件上传到hdfs :
hadoop fs -put 最终数据1.csv /
//hadoop导出到mysql :
sqoop export --connect "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=utf-8" \
--username root --password root \
--export-dir '/最终数据1.csv' --table jg \
--input-fields-terminated-by ',' --driver com.mysql.jdbc.Driver
jg为提前建好的表:
-- Target table for the sqoop export of 最终数据1.csv
-- (title, price, sales count, location, shop name).
-- NOTE(review): price stays varchar because the scraped values are raw
-- strings; cast it before any numeric comparison.
create table jg(
name varchar(100),
price varchar(20),
sales int,
area varchar(30),
store varchar(50)
);
//建表jg2,挑选出销量大于1000的产品 :
-- jg2: copy of jg restricted to products with sales >= 1000
-- (CREATE TABLE ... SELECT materializes the filtered rows).
create table jg2
select * from jg
where sales>=1000;
//建表jg3,按照销量排序 :
-- jg3: intended as one row per (name, area), ordered by sales descending.
-- NOTE(review): SELECT * combined with GROUP BY name,area is rejected under
-- MySQL's default ONLY_FULL_GROUP_BY mode, and with that mode off the
-- non-grouped columns (price, sales, store) come from an arbitrary row of
-- each group. Confirm the intended dedup rule (e.g. MAX(sales)) before
-- relying on this table.
create table jg3
select * from jg2
group by name,area
order by sales desc;
4.最后利用javaee进行echarts可视化:
例如:
展示销量前十的产品:
<%@page pageEncoding="UTF-8" import="java.sql.*"%>
<!DOCTYPE html>
<html style="height: 100%">
<head>
<meta charset="utf-8">
<title>柱状图显示数值</title>
</head>
<body style="height:600px; margin: 0">
<div id="main" style="width: 1000px;height:60%;"></div>
<script type="text/javascript" src="js/echarts.min.js"></script>
<script>
/**
 * Render a bar chart of sales into the #main div via ECharts.
 * title: array of x-axis category labels (from the JSP query below);
 * value: the matching sales number for each label.
 */
function show(title,value){
    var myChart = echarts.init(document.getElementById('main'));
    // Chart configuration and data.
    var option = {
        // Chart title
        title: {
            text: 'ECharts 入门示例'
        },
        // Toolbox: zoom, raw-data view, line/bar switch, restore, save-as-image
        toolbox: {
            show: true,
            feature: {
                dataZoom: {
                    yAxisIndex: 'none'
                },
                dataView: {readOnly: false},
                magicType: {type: ['line', 'bar']},
                restore: {},
                saveAsImage: {}
            }
        },
        // Legend
        legend: {
            data: ['销量']
        },
        // x axis: category labels
        xAxis: {
            data: title ,
            axisLabel: {
                interval: 'auto',
                rotate: -30 // tilt angle for labels; positive or negative values allowed
            },
        },
        yAxis: {
            type: 'value'
        },
        // Data series
        series: [{
            name: '销量',
            type: 'bar',
            data: value,
            itemStyle: {
                normal: {
                    label: {
                        show: true, // display the numeric value
                        position: 'top', // above each bar
                        textStyle: { // value label style
                            color: 'black',
                            fontSize: 16
                        }
                    }
                }
            },
        }]
    };
    // Apply the configuration just built to render the chart.
    myChart.setOption(option);
}
</script>
<%
    // Query the ten rows with the highest sales and feed them to show().
    // Fixes vs. the original:
    //   * prepareCall() is the CallableStatement API for stored procedures;
    //     a plain SELECT belongs in prepareStatement().
    //   * JDBC resources are now closed in a finally block, so an error while
    //     iterating the result set cannot leak the connection.
    // NOTE(review): the heading says "top-ten products" but this selects the
    // store name as the label — confirm whether `name` was intended instead.
    Class.forName("com.mysql.jdbc.Driver");
    String url="jdbc:mysql://192.168.198.200:3306/test";
    Connection con=DriverManager.getConnection(url,"guest","guest");
    String sql="select store,sales from jg2 order by sales desc limit 10";
    PreparedStatement pst=con.prepareStatement(sql);
    ResultSet rs=pst.executeQuery();
%>
<script type="text/javascript">
    title=new Array();
    value=new Array();
<%
    try{
        while(rs.next()){
%>
    <%-- NOTE(review): the store name is embedded unescaped inside a JS string
         literal; a quote or backslash in the data breaks/injects script. --%>
    title.push("<%=rs.getString(1)%>");value.push(<%=rs.getInt(2)%>);
<%
        }
    } finally {
        rs.close();
        pst.close();
        con.close();
    }
%>
    show(title,value);
</script>
</body>
</html>
可视化结果:
其他可视化结果: