爬取某数字网站公司信息
第一步
- 首先分析页面url,爬虫最重要的就是获取页面的url,观察要爬取页面的url,查看其规律,找到其构成元素,一般都会有id、页面信息等
- 然后分析页面请求数据后返回的数据类型,有的页面信息直接在html代码中,有的返回的是json,根据不同情况分别解析便可
第二步
- 模仿浏览器访问,否则可能会被封ip,代码如下
// Create an HttpClient instance
CloseableHttpClient httpclient = HttpClients.createDefault();
// FIX: the original literal was "https://\"www.hah.com\"", which embeds quote
// characters into the URL and makes `new HttpGet(url)` throw; use a plain host.
String url = "https://www.hah.com";
// Create an HttpGet instance
HttpGet httpget = new HttpGet(url);
// Pretend to be a browser so the site is less likely to block/ban the request
httpget.setHeader("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/60.0");
// Optional: route the request through a proxy IP
// HttpHost proxy = new HttpHost("192.168.1.124", 8080);
RequestConfig config = RequestConfig.custom()
.setConnectTimeout(10000) // connect timeout: 10 seconds
.setSocketTimeout(10000)  // read (socket) timeout: 10 seconds
.build();
httpget.setConfig(config);
// Execute the GET request
CloseableHttpResponse response = httpclient.execute(httpget);
HttpEntity entity = response.getEntity();
// Read the response body (the page source) decoded as UTF-8
String content = EntityUtils.toString(entity, "utf-8");
//System.out.println(content);
第三步
- 解析页面:如果页面返回的是html代码,有两种方法解析
(1)、用jsoup获得html标签,找到你想获得的信息所在标签,可根据其class定位它,在分别获取属性值和内容便可,例:
//select elements whose class attribute equals "ico bz-border"
Elements s3 = text.getElementsByAttributeValue("class", "ico bz-border");
//select all <a> tags
Elements a = text.getElementsByTag("a");
//read the href attribute value of the first <a>
String href = a.get(0).attr("href");
//read the text content of the second <a>
String ka = a.get(1).text();
(2)、用正则表达式解析获得html标签、属性值等,可以分层获取解析;如果不能解析,可利用 String API 中的方法截取字符串等,例如:
//获取class中值为 ico bz-border的标签
// Select elements whose class attribute equals "ico bz-border"
Elements s3 = text.getElementsByAttributeValue("class", "ico bz-border");
// Select all <a> tags
Elements a = text.getElementsByTag("a");
// Read the href attribute value of the first <a>
String href = a.get(0).attr("href");
// Read the text content of the second <a>
String ka = a.get(1).text();
// Parse the raw HTML with a regular expression: capture <a ...>text</a>.
// FIX: the original also declared `String a = "<a[^>]*>([^<]*)</a>";` here,
// which collided with the jsoup `Elements a` above (duplicate local name);
// the unused `reg` variable was removed as well.
String aTag = "<a[^>]*>([^<]*)</a>";
Pattern aPattern = Pattern.compile(aTag);
Matcher aMatcher = aPattern.matcher(content);
//System.out.println("解析a为:"+ amatcher.find());
// Tracks whether the page looks like a valid search-result page
String judge = "false";
while (aMatcher.find()) {
String allMessage = aMatcher.group();
//System.out.println("解析a为:"+allMessage);
// Strip every non-CJK character, leaving only the Chinese text of the anchor
Pattern pattern = Pattern.compile("[^\\u4E00-\\u9FA5]");
//[\u4E00-\u9FA5] is the CJK Unified Ideographs range
Matcher matcher = pattern.matcher(allMessage);
String doubleAddress = matcher.replaceAll("");
// The page is considered valid when an <a> whose Chinese text is exactly
// "搜职位" (the job-search tab) is present
if(doubleAddress.equals("搜职位")){
judge = "true";
}
}
- 解析页面:如果页面返回的是json,直接解析json即可
第四步
把数据存储数据库或导出文件即可
//导出文件
/**
 * Excel export helper built on Apache POI HSSF (.xls).
 * Writes a title row, a merged two-row column header, and one data row per
 * company (sequence number, company name, company URL).
 */
public class ExportExcel {
    HSSFWorkbook workbook = new HSSFWorkbook(); // workbook that holds one or more sheets
    // Title displayed at the top of the exported sheet
    private String title;
    // Column header names
    private String[] rowName;
    private List<List<Object[]>> list = new ArrayList<List<Object[]>>();
    private List<Object[]> countList = new ArrayList<>();
    private String[] categoryName;

    /**
     * Full constructor carrying all data to export.
     *
     * @param title        sheet title
     * @param rowName      column header names
     * @param list         grouped row data
     * @param categoryName category labels
     * @param countList    summary rows
     */
    public ExportExcel(String title, String[] rowName, List<List<Object[]>> list, String[] categoryName, List<Object[]> countList) {
        this.list = list;
        this.rowName = rowName;
        this.title = title;
        this.categoryName = categoryName;
        this.countList = countList;
    }

    /** Minimal constructor: title and column headers only. */
    public ExportExcel(String title, String[] rowName) {
        this.rowName = rowName;
        this.title = title;
    }

    /**
     * Writes one sheet listing the crawled companies.
     *
     * @param sheetTitle sheet name, also used as the title-cell text
     * @param countList  companies to export (name + URL)
     * @throws Exception declared for callers; internally exceptions are caught and printed
     */
    public void exportPersonInfo2(String sheetTitle, List<TCompany> countList) throws Exception {
        try {
            HSSFSheet sheet = workbook.createSheet(sheetTitle); // create the worksheet
            sheet.setColumnWidth(1, 4000);
            sheet.setColumnWidth(2, 20000);
            HSSFCellStyle style = this.getStyle(workbook);
            // NOTE(review): registers a style on the workbook; the value itself is unused here
            HSSFCellStyle numberStyle = this.getNumberStype(workbook);
            HSSFCellStyle headStyle = this.getColumnTopStyle(workbook);
            int columnNum = rowName.length;

            // Title row at index 1.
            // FIX: the original created this row INSIDE the column loop below;
            // sheet.createRow(1) replaces the row on every iteration, discarding
            // the title cell each time. Create it once, before the loop.
            HSSFRow titleRow = sheet.createRow(1);
            HSSFCell titleCell = titleRow.createCell(0);
            titleCell.setCellValue(sheetTitle);
            titleCell.setCellType(HSSFCell.CELL_TYPE_STRING);
            titleCell.setCellStyle(headStyle);

            // Column header rows at indexes 2 and 3, merged vertically per column
            HSSFRow rowRowName = sheet.createRow(2);
            HSSFRow rowRowName2 = sheet.createRow(3);
            for (int n = 0; n < columnNum; n++) {
                HSSFCell cellRowName = rowRowName.createCell(n); // one header cell per column
                cellRowName.setCellType(HSSFCell.CELL_TYPE_STRING);
                cellRowName.setCellStyle(headStyle);
                HSSFCell cellRowName2 = rowRowName2.createCell(n);
                cellRowName2.setCellType(HSSFCell.CELL_TYPE_STRING);
                cellRowName2.setCellStyle(headStyle);
                // merge rows 2-3 for this column so the header spans both rows
                sheet.addMergedRegion(new CellRangeAddress(2, 3, n, n));
                cellRowName.setCellValue(new HSSFRichTextString(rowName[n]));
            }

            // Data rows start at index 6: sequence number, company name, company URL
            List<TCompany> dataList = countList;
            for (int i = 0; i < dataList.size(); i++) {
                HSSFRow row = sheet.createRow(i + 6);
                HSSFCell cell1 = row.createCell(0, HSSFCell.CELL_TYPE_STRING);
                cell1.setCellStyle(style);
                cell1.setCellValue(i + 1);
                HSSFCell cell2 = row.createCell(1, HSSFCell.CELL_TYPE_STRING);
                cell2.setCellStyle(style);
                cell2.setCellValue(dataList.get(i).getCompanyName());
                HSSFCell cell3 = row.createCell(2, HSSFCell.CELL_TYPE_STRING);
                cell3.setCellStyle(style);
                cell3.setCellValue(dataList.get(i).getCompanyUrl());
            }
        } catch (Exception e) {
            // NOTE(review): swallowing after printing preserves the original
            // behavior; consider rethrowing so callers can see the failure.
            e.printStackTrace();
        }
    }
}
详细代码
package com.mbyte.easy.admin.controller;
import java.io.IOException;
import java.util.Arrays;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.mbyte.easy.admin.entity.FiveEightCity;
import com.mbyte.easy.admin.service.IFiveEightCityService;
import com.sun.org.apache.xerces.internal.parsers.DOMParser;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import javax.xml.parsers.ParserConfigurationException;
import static java.lang.Integer.parseInt;
/**
 * Crawls company listings from a classified-ads job site.
 * Flow: read a keyword and a city from stdin, convert the city name to pinyin
 * to build the site sub-domain, fetch the first result page, verify it is a
 * real search-result page, read the total page count, then crawl each page.
 */
public class TestHttp {

    /**
     * Entry point: reads search keyword and city, determines the total number
     * of result pages, and crawls them one by one.
     *
     * @param args unused
     * @throws IOException if an HTTP request fails
     */
    public static void main(String[] args) throws IOException {
        // A single Scanner is enough for both prompts (the original opened two
        // Scanners on System.in).
        Scanner in = new Scanner(System.in);
        System.out.print("请输入需要查找的公司关键字:");
        String keyword = in.next();
        System.out.print("请输入需要查找的公司所在城市:");
        String city = in.next();
        // City name -> pinyin, used as the site's city sub-domain
        String cityPinyin = getPinYin(city);
        String url = "https://" + cityPinyin + ".******.com/job/?key=" + keyword
                + "&classpolicy=main_null,job_A&final=1&jump=1";
        // Fetch the first result page (resources are closed inside fetchPage)
        String content = fetchPage(url);

        // The page is a valid search-result page when it contains an <a> whose
        // Chinese-only text is exactly "搜职位" (the job-search tab).
        boolean judge = false;
        Pattern aPattern = Pattern.compile("<a[^>]*>([^<]*)</a>");
        Matcher aMatcher = aPattern.matcher(content);
        while (aMatcher.find()) {
            String allMessage = aMatcher.group();
            // Strip every non-CJK character, keeping only the Chinese text.
            // [\u4E00-\u9FA5] is the CJK Unified Ideographs range.
            Matcher matcher = Pattern.compile("[^\\u4E00-\\u9FA5]").matcher(allMessage);
            String chineseOnly = matcher.replaceAll("");
            if (chineseOnly.equals("搜职位")) {
                judge = true;
            }
        }

        // Total page count, read from the pager's <i> element.
        // Heuristic (from the original): the pager <i> is 27-28 characters long
        // and contains no Chinese characters.
        String allPage = "-1";
        Pattern iPattern = Pattern.compile("<i[^>]*>([^<]*)</i>");
        Matcher iMatcher = iPattern.matcher(content);
        while (iMatcher.find()) {
            String iMessage = iMatcher.group();
            if (iMessage.length() >= 27 && iMessage.length() <= 28) {
                String haveChinese =
                        Pattern.compile("[^\\u4E00-\\u9FA5]").matcher(iMessage).replaceAll("");
                if (haveChinese.length() == 0) {
                    String digits = matchResult(Pattern.compile("[0-9]"), iMessage);
                    // FIX: guard against an empty digit string, which would make
                    // parseInt throw NumberFormatException below.
                    if (!digits.isEmpty()) {
                        allPage = digits;
                    }
                }
            }
        }

        TestHttp testHttp = new TestHttp();
        int page = parseInt(allPage);
        if (judge) {
            if (page == -1) {
                System.out.println("没有与\"" + keyword + "\"关键字匹配的信息!");
            } else {
                // page == 1 simply runs the loop once, so the original's
                // separate single-page branch is folded into the loop.
                for (int i = 1; i <= page; i++) {
                    testHttp.branchPage(i, keyword);
                }
            }
        } else {
            System.out.println("输入的\"" + city + "\"地址不存在!");
        }
    }

    /**
     * Downloads {@code url} and returns the response body decoded as UTF-8.
     * Sends a browser-like User-Agent and applies 10-second connect/read
     * timeouts.
     *
     * @param url the page to fetch
     * @return the page source
     * @throws IOException on connection or protocol failure
     */
    private static String fetchPage(String url) throws IOException {
        HttpGet httpget = new HttpGet(url);
        // Pretend to be a browser so the site is less likely to block the request
        httpget.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/60.0");
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10000) // connect timeout: 10 seconds
                .setSocketTimeout(10000)  // read timeout: 10 seconds
                .build();
        httpget.setConfig(config);
        // FIX: the original leaked both the client and the response;
        // try-with-resources guarantees they are closed.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpget)) {
            HttpEntity entity = response.getEntity();
            return EntityUtils.toString(entity, "utf-8");
        }
    }

    /**
     * Collects every match of {@code p} in {@code str} into one string
     * (used to extract the digits of the total page count).
     *
     * @param p   compiled pattern to search with
     * @param str input string
     * @return all matched substrings concatenated in order
     */
    public static String matchResult(Pattern p, String str) {
        StringBuilder sb = new StringBuilder();
        Matcher m = p.matcher(str);
        // FIX: the original looped i = 0..groupCount() and appended m.group()
        // on every iteration, duplicating each match once per capture group.
        while (m.find()) {
            sb.append(m.group());
        }
        return sb.toString();
    }

    /**
     * Crawls one result page and prints each company's link and address.
     *
     * @param page    1-based page number to fetch
     * @param keyword search keyword
     * @throws IOException if the HTTP request fails
     */
    public void branchPage(int page, String keyword) throws IOException {
        // Throttle: wait 5 seconds between page fetches to avoid an IP ban.
        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
            // FIX: restore the interrupt status instead of swallowing it
            Thread.currentThread().interrupt();
        }
        // NOTE(review): the host is hard-coded to the "bd" city sub-domain here,
        // while main() builds the sub-domain from the user's city pinyin —
        // TODO confirm whether this should use the same city prefix.
        String url = "https://bd.58.com/job/?key=" + keyword
                + "&classpolicy=main_null,job_A&final=1&jump=1&page=" + page;
        String content = fetchPage(url);
        System.out.println("==================================================================================================");

        // Find candidate <a> blocks and pull out the link and the address.
        Pattern aPattern = Pattern.compile("<a[^>]*>([^<]*)</a>");
        Matcher aMatcher = aPattern.matcher(content);
        while (aMatcher.find()) {
            String allMessage = aMatcher.group();
            // Heuristic (from the original): company anchors are 235-262 chars long
            if (allMessage.length() >= 235 && allMessage.length() <= 262) {
                // Chinese-only view of the anchor: used as the address text
                Matcher matcher = Pattern.compile("[^\\u4E00-\\u9FA5]").matcher(allMessage);
                String doubleAddress = matcher.replaceAll("");
                if (doubleAddress.length() > 8) {
                    // Fixed offsets into the anchor markup where the href sits —
                    // brittle, but preserved from the original scraper.
                    String href = allMessage.substring(10, 43);
                    if (href.indexOf("https") != -1) {
                        String hrefs = href.replaceAll(" ", "");
                        if (href.length() > hrefs.length()) {
                            // Spaces were removed: take the first 28 chars as the URL
                            String hrefUrl = hrefs.substring(0, 28);
                            if (hrefUrl.indexOf("\"") != -1) {
                                // Drop the trailing quote character
                                String hrefUrls = hrefUrl.substring(0, hrefUrl.length() - 1);
                                System.out.println("链接:" + hrefUrls);
                                // The Chinese text holds the address twice; keep one half
                                int length = doubleAddress.length() / 2;
                                String address = doubleAddress.substring(0, length);
                                System.out.println("地址:" + address);
                            }
                        } else {
                            if (href.indexOf("\"") != -1) {
                                String hrefUrls = href.substring(0, href.length() - 1);
                                System.out.println("链接:" + hrefUrls);
                                int length = doubleAddress.length() / 2;
                                String address = doubleAddress.substring(0, length);
                                System.out.println("地址:" + address);
                            } else {
                                System.out.println("链接:" + href);
                                int length = doubleAddress.length() / 2;
                                String address = doubleAddress.substring(0, length);
                                System.out.println("地址:" + address);
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * Converts a Chinese string to lowercase, tone-less pinyin.
     * Non-Chinese characters are copied through unchanged.
     *
     * @param inputString text to convert (leading/trailing whitespace trimmed)
     * @return pinyin form of the input
     */
    public static String getPinYin(String inputString) {
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
        char[] input = inputString.trim().toCharArray();
        // StringBuilder instead of String += in the loop (quadratic copying)
        StringBuilder output = new StringBuilder();
        try {
            for (char c : input) {
                if (String.valueOf(c).matches("[\\u4E00-\\u9FA5]+")) {
                    // First pronunciation candidate is used, as in the original
                    String[] temp = PinyinHelper.toHanyuPinyinStringArray(c, format);
                    output.append(temp[0]);
                } else {
                    output.append(c);
                }
            }
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            e.printStackTrace();
        }
        return output.toString();
    }
}