Create the Maven project
pom dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.dy</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- HTTP client for fetching pages and resources -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <!-- slf4j-to-log4j binding, test scope only -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
            <scope>test</scope>
        </dependency>
        <!-- jsoup for parsing the fetched HTML -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
        <!-- commons-io for streaming downloads to disk -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
    </dependencies>
</project>
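The slf4j-log4j12 binding only produces output if log4j finds a configuration file on the classpath; without one it prints a warning and stays silent. A minimal sketch of src/test/resources/log4j.properties (the file location, appender name, and log level here are assumptions, adjust to taste):

log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d [%t] %-5p %c - %m%n

Because the dependency is declared with test scope, this only affects test runs; the main classes below do not log through slf4j.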
Crawl the page content
GetWebPageContent.java
package cn.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Date;

public class GetWebPageContent {
    public static void main(String[] args) {
        // Create the GET request, like typing the URL into the browser's
        // address bar and pressing Enter to fetch the page
        HttpGet httpget = new HttpGet("http://www.ktbdqn.com/");
        // Create the HttpClient ("open the browser") and issue the request;
        // try-with-resources closes both the client and the response afterwards
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpget)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode >= 200 && statusCode < 300) {
                HttpEntity entity = response.getEntity();
                // Stream the page content into a local file
                File f = new File("E:/crawler/crawler.txt");
                try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(f))) {
                    entity.writeTo(out);
                }
            } else {
                // Record when the failure happened, then report the bad status
                System.out.println(new Date());
                throw new ClientProtocolException("Unexpected response status: " + statusCode);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
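If you want the page in memory rather than on disk, the entity can also be read straight into a String, which is what the next two examples do. A minimal illustration of the success branch (the GBK charset is an assumption about the target site's encoding):

// Instead of streaming the entity to a file, decode it into a String
String html = EntityUtils.toString(response.getEntity(), "GBK");
System.out.println("fetched " + html.length() + " characters");

EntityUtils lives in org.apache.http.util and fully consumes the entity, so the response can be closed immediately afterwards.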
Crawl the page images:
GetPictureByUrl.java
package cn.crawler.test;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

public class GetPictureByUrl {
    public static void main(String[] args) throws IOException {
        // Create the HttpClient instance
        CloseableHttpClient httpclient = HttpClients.createDefault();
        // Create the GET request
        HttpGet httpget = new HttpGet("http://www.ktbdqn.com/");
        // Execute the GET request
        CloseableHttpResponse response = httpclient.execute(httpget);
        HttpEntity entity = response.getEntity();
        // Read the returned entity, decoding it as GBK (the site's encoding)
        String content = EntityUtils.toString(entity, "GBK");
        // Parse the page into a jsoup Document
        Document doc = Jsoup.parse(content);
        // Select every <img> whose src starts with "/"
        Elements elements = doc.select("img[src^=/]");
        for (int i = 0; i < elements.size(); i++) {
            Element element = elements.get(i);
            // Read the src attribute of the <img>
            String url = element.attr("src");
            // The src is site-relative (it has no http:// prefix), so the host
            // must be prepended by hand before requesting the image
            HttpGet pictureHttpGet = new HttpGet("http://www.ktbdqn.com" + url);
            CloseableHttpResponse pictureResponse = httpclient.execute(pictureHttpGet);
            HttpEntity pictureEntity = pictureResponse.getEntity();
            InputStream inputStream = pictureEntity.getContent();
            // Save the image locally with commons-io; note that file names must not collide
            FileUtils.copyToFile(inputStream, new File("E:/" + url));
            pictureResponse.close();   // close the per-image response
        }
        response.close();   // close the page response
        httpclient.close(); // close the client
    }
}
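The comment above warns that file names must not collide: two images with the same remote name would overwrite each other locally. One way around this (the UniqueNames class is my own illustration, not part of the original) is to generate a fresh target name per download:

package cn.crawler.test;

import java.io.File;
import java.util.UUID;

// Hypothetical helper (not in the original code): builds a collision-free
// local target file for a download, keeping the original file extension.
public class UniqueNames {
    public static File uniqueTarget(String dir, String url) {
        int dot = url.lastIndexOf('.');
        String ext = dot >= 0 ? url.substring(dot) : "";
        // A random UUID makes repeated remote file names effectively unable to collide
        return new File(dir, UUID.randomUUID() + ext);
    }
}

With it, the save line becomes FileUtils.copyToFile(inputStream, UniqueNames.uniqueTarget("E:/crawler/img", url)); at the cost of losing the original file names.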
Crawl the CSS stylesheets:
GetCSS.java
package cn.crawler.test;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

public class GetCSS {
    public static void main(String[] args) throws IOException {
        // Create the HttpClient instance
        CloseableHttpClient httpclient = HttpClients.createDefault();
        // Create the GET request
        HttpGet httpget = new HttpGet("http://www.ktbdqn.com/");
        // Execute the GET request
        CloseableHttpResponse response = httpclient.execute(httpget);
        HttpEntity entity = response.getEntity();
        // Read the returned entity, decoding it as GBK (the site's encoding)
        String content = EntityUtils.toString(entity, "GBK");
        // Parse the page into a jsoup Document
        Document doc = Jsoup.parse(content);
        // Select only the <link> elements that reference stylesheets
        // (a bare "link" selector would also match favicons and the like)
        Elements elements = doc.select("link[rel=stylesheet]");
        for (int i = 0; i < elements.size(); i++) {
            Element element = elements.get(i);
            // Read the href attribute of the <link>
            String url = element.attr("href");
            // The href is site-relative (it has no http:// prefix), so the host
            // must be prepended by hand before requesting the stylesheet
            HttpGet cssHttpGet = new HttpGet("http://www.ktbdqn.com/" + url);
            CloseableHttpResponse cssResponse = httpclient.execute(cssHttpGet);
            HttpEntity cssEntity = cssResponse.getEntity();
            InputStream inputStream = cssEntity.getContent();
            // Save the stylesheet locally with commons-io; file names must not collide
            FileUtils.copyToFile(inputStream, new File("E:/crawler/" + url));
            cssResponse.close();   // close the per-stylesheet response
        }
        response.close();   // close the page response
        httpclient.close(); // close the client
    }
}
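The image and CSS examples share the same inner loop: build an absolute URL, execute a GET, stream the entity into a local file, close the response. A minimal refactoring sketch (the ResourceDownloader class and download method are my own names, not from the original) pulls that into one place:

package cn.crawler.test;

import org.apache.commons.io.FileUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

// Hypothetical helper (not in the original): one fetch-and-save routine
// that both the image loop and the CSS loop could call.
public class ResourceDownloader {
    public static void download(CloseableHttpClient httpclient,
                                String absoluteUrl, File target) throws IOException {
        CloseableHttpResponse response = httpclient.execute(new HttpGet(absoluteUrl));
        try {
            InputStream in = response.getEntity().getContent();
            // copyToFile creates any missing parent directories of target
            FileUtils.copyToFile(in, target);
        } finally {
            response.close(); // releases the connection back to the client
        }
    }
}

Each loop body then shrinks to a single call, e.g. ResourceDownloader.download(httpclient, "http://www.ktbdqn.com" + url, new File("E:/crawler" + url)); for the images.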