Create the Maven project
pom dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.dy</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- HTTP client for fetching pages and resources -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <!-- slf4j-to-log4j binding, test scope only -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
            <scope>test</scope>
        </dependency>
        <!-- jsoup for parsing the fetched HTML -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
        <!-- commons-io for streaming downloads to disk -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
    </dependencies>
</project>
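The slf4j-log4j12 binding only produces output if log4j finds a configuration file on the classpath; without one it prints a warning and stays silent. A minimal sketch of src/test/resources/log4j.properties (the file location, appender name, and log level here are assumptions, adjust to taste):

log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d [%t] %-5p %c - %m%n

Because the dependency is declared with test scope, this only affects test runs; the main classes below do not log through slf4j.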
Crawl the page content
GetWebPageContent.java
package cn.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Date;

public class GetWebPageContent {
    public static void main(String[] args) {
        // Create the GET request, like typing the URL into the browser's
        // address bar and pressing Enter to fetch the page
        HttpGet httpget = new HttpGet("http://www.ktbdqn.com/");
        // Create the HttpClient ("open the browser") and issue the request;
        // try-with-resources closes both the client and the response afterwards
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpget)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode >= 200 && statusCode < 300) {
                HttpEntity entity = response.getEntity();
                // Stream the page content into a local file
                File f = new File("E:/crawler/crawler.txt");
                try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(f))) {
                    entity.writeTo(out);
                }
            } else {
                // Record when the failure happened, then report the bad status
                System.out.println(new Date());
                throw new ClientProtocolException("Unexpected response status: " + statusCode);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
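If you want the page in memory rather than on disk, the entity can also be read straight into a String, which is what the next two examples do. A minimal illustration of the success branch (the GBK charset is an assumption about the target site's encoding):

// Instead of streaming the entity to a file, decode it into a String
String html = EntityUtils.toString(response.getEntity(), "GBK");
System.out.println("fetched " + html.length() + " characters");

EntityUtils lives in org.apache.http.util and fully consumes the entity, so the response can be closed immediately afterwards.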
Crawl the page images:
GetPictureByUrl.java
package cn.crawler.test;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

public class GetPictureByUrl {
    public static void main(String[] args) throws IOException {
        // Create the HttpClient instance
        CloseableHttpClient httpclient = HttpClients.createDefault();
        // Create the GET request
        HttpGet httpget = new HttpGet("http://www.ktbdqn.com/");
        // Execute the GET request
        CloseableHttpResponse response = httpclient.execute(httpget);
        HttpEntity entity = response.getEntity();
        // Read the returned entity, decoding it as GBK (the site's encoding)
        String content = EntityUtils.toString(entity, "GBK");
        // Parse the page into a jsoup Document
        Document doc = Jsoup.parse(content);
        // Select every <img> whose src starts with "/"
        Elements elements = doc.select("img[src^=/]");
        for (int i = 0; i < elements.size(); i++) {
            Element element = elements.get(i);
            // Read the src attribute of the <img>
            String url = element.attr("src");
            // The src is site-relative (it has no http:// prefix), so the host
            // must be prepended by hand before requesting the image
            HttpGet pictureHttpGet = new HttpGet("http://www.ktbdqn.com" + url);
            CloseableHttpResponse pictureResponse = httpclient.execute(pictureHttpGet);
            HttpEntity pictureEntity = pictureResponse.getEntity();
            InputStream inputStream = pictureEntity.getContent();
            // Save the image locally with commons-io; note that file names must not collide
            FileUtils.copyToFile(inputStream, new File("E:/" + url));
            pictureResponse.close();   // close the per-image response
        }
        response.close();   // close the page response
        httpclient.close(); // close the client
    }
}
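The comment above warns that file names must not collide: two images with the same remote name would overwrite each other locally. One way around this (the UniqueNames class is my own illustration, not part of the original) is to generate a fresh target name per download:

package cn.crawler.test;

import java.io.File;
import java.util.UUID;

// Hypothetical helper (not in the original code): builds a collision-free
// local target file for a download, keeping the original file extension.
public class UniqueNames {
    public static File uniqueTarget(String dir, String url) {
        int dot = url.lastIndexOf('.');
        String ext = dot >= 0 ? url.substring(dot) : "";
        // A random UUID makes repeated remote file names effectively unable to collide
        return new File(dir, UUID.randomUUID() + ext);
    }
}

With it, the save line becomes FileUtils.copyToFile(inputStream, UniqueNames.uniqueTarget("E:/crawler/img", url)); at the cost of losing the original file names.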
Crawl the CSS stylesheets:
GetCSS.java
package cn.crawler.test;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

public class GetCSS {
    public static void main(String[] args) throws IOException {
        // Create the HttpClient instance
        CloseableHttpClient httpclient = HttpClients.createDefault();
        // Create the GET request
        HttpGet httpget = new HttpGet("http://www.ktbdqn.com/");
        // Execute the GET request
        CloseableHttpResponse response = httpclient.execute(httpget);
        HttpEntity entity = response.getEntity();
        // Read the returned entity, decoding it as GBK (the site's encoding)
        String content = EntityUtils.toString(entity, "GBK");
        // Parse the page into a jsoup Document
        Document doc = Jsoup.parse(content);
        // Select only the <link> elements that reference stylesheets
        // (a bare "link" selector would also match favicons and the like)
        Elements elements = doc.select("link[rel=stylesheet]");
        for (int i = 0; i < elements.size(); i++) {
            Element element = elements.get(i);
            // Read the href attribute of the <link>
            String url = element.attr("href");
            // The href is site-relative (it has no http:// prefix), so the host
            // must be prepended by hand before requesting the stylesheet
            HttpGet cssHttpGet = new HttpGet("http://www.ktbdqn.com/" + url);
            CloseableHttpResponse cssResponse = httpclient.execute(cssHttpGet);
            HttpEntity cssEntity = cssResponse.getEntity();
            InputStream inputStream = cssEntity.getContent();
            // Save the stylesheet locally with commons-io; file names must not collide
            FileUtils.copyToFile(inputStream, new File("E:/crawler/" + url));
            cssResponse.close();   // close the per-stylesheet response
        }
        response.close();   // close the page response
        httpclient.close(); // close the client
    }
}
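The image and CSS examples share the same inner loop: build an absolute URL, execute a GET, stream the entity into a local file, close the response. A minimal refactoring sketch (the ResourceDownloader class and download method are my own names, not from the original) pulls that into one place:

package cn.crawler.test;

import org.apache.commons.io.FileUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

// Hypothetical helper (not in the original): one fetch-and-save routine
// that both the image loop and the CSS loop could call.
public class ResourceDownloader {
    public static void download(CloseableHttpClient httpclient,
                                String absoluteUrl, File target) throws IOException {
        CloseableHttpResponse response = httpclient.execute(new HttpGet(absoluteUrl));
        try {
            InputStream in = response.getEntity().getContent();
            // copyToFile creates any missing parent directories of target
            FileUtils.copyToFile(in, target);
        } finally {
            response.close(); // releases the connection back to the client
        }
    }
}

Each loop body then shrinks to a single call, e.g. ResourceDownloader.download(httpclient, "http://www.ktbdqn.com" + url, new File("E:/crawler" + url)); for the images.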