webmagic百度指数查询

本文介绍了如何利用WebMagic框架,配合Java技术,自行搭建一个爬虫来批量查询百度指数,以此避免购买昂贵的接口费用。通过设置UA和账号cookie,爬虫将查询结果输出到TXT文本,并统计运行时间。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

需求:本地有一批关键词(约 5 万个)需要查询百度指数。5118 一次只能查 100 条,5 万条手工查下来手都要软;看了一下接口价格是 0.013 元 1 条,网上找到的其他百度指数接口也都不怎么便宜,那就自己动手了,用 WebMagic 实现也很方便快捷。
1、配置
引入 WebMagic 的两个包(core 和 extension);项目打包成 jar 后可以挂在服务器上慢慢运行。

    <dependencies>
        <!-- WebMagic core package -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.4</version>
        </dependency>
        <!-- WebMagic extension package -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.4</version>
        </dependency>
        <!-- JUnit 4. NOTE(review): no scope is declared here; confirm whether test scope was intended,
             since the assembly below bundles dependencies into the runnable jar. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- Assembly plugin configured with the jar-with-dependencies descriptor and a manifest main class -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.5.5</version>
                <configuration>
                    <archive>
                        <manifest>
                            <!-- NOTE(review): the class with main() shown in this article is BaiduZhishu;
                                 confirm this entry names the class that should be launched. -->
                            <mainClass>com.booy.task.OrderProcessor</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>

log4j日志文件:

# Root logger: INFO level, routed to the appender named A1.
log4j.rootLogger=INFO,A1 
# A1 writes log events to the console.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 formats each event with a pattern layout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
# Pattern: timestamp, [thread], [logger name]-[level], message, line separator.
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n

2、主类

/**
 * WebMagic page processor that queries the Baidu Index search API for one keyword per request
 * and emits a single {@code #}-separated record under the result field {@code "data"}:
 * {@code keyword#statusCode#allAvg#pcAvg#wiseAvg}.
 *
 * <p>The response body is scanned with regular expressions for the {@code status} code and, when
 * the status is {@code 0}, for the {@code avg} values of the {@code all}, {@code pc} and
 * {@code wise} sections. Values that are not found are emitted as the text {@code null}.
 */
public class BaiduZhishu implements PageProcessor {

    /** Request URL up to and including the encoded opening quote of the keyword. */
    private static final String API_PREFIX =
            "https://zhishu.baidu.com/api/SearchApi/index?area=0&word=[[%7B%22name%22:%22";
    /** Request URL after the keyword. */
    private static final String API_SUFFIX = "%22,%22wordType%22:1%7D]]&days=90";

    /** Status: the request succeeded and index values are present. */
    private static final int STATUS_OK = 0;
    /** Status: the keyword has no index data. */
    private static final int STATUS_NO_INDEX = 10002;
    /** Status: the account is not logged in (cookie missing or expired). */
    private static final int STATUS_NOT_LOGGED_IN = 10000;

    // Patterns are compiled once; each captures the digits that follow the key and precede a comma.
    private static final Pattern STATUS_PATTERN = Pattern.compile("status\\\":(\\d+)(?=,)");
    private static final Pattern ALL_AVG_PATTERN = Pattern.compile("all\\\":\\{\\\"avg\\\":(\\d*?)(?=,)");
    private static final Pattern PC_AVG_PATTERN = Pattern.compile("pc\\\":\\{\\\"avg\\\":(\\d*?)(?=,)");
    private static final Pattern WISE_AVG_PATTERN = Pattern.compile("wise\\\":\\{\\\"avg\\\":(\\d*?)(?=,)");

    /**
     * Parses one API response and stores the result line in the page's {@code "data"} field.
     *
     * <p>The page is marked as skipped (so no result is stored) when the status code is missing
     * or unparsable, when the account is not logged in, or when any other status is returned.
     *
     * @param page the downloaded API response
     */
    @Override
    public void process(Page page) {
        String docStr = page.getHtml().getDocument().toString();

        Matcher statusM = STATUS_PATTERN.matcher(docStr);
        if (!statusM.find()) {
            // No status code in the response: treat like any other unexpected error
            // instead of letting Matcher.group() throw IllegalStateException.
            System.out.println("出现了其他错误");
            page.setSkip(true);
            return;
        }

        int status;
        try {
            status = Integer.parseInt(statusM.group(1));
        } catch (NumberFormatException e) {
            System.out.println("出现了其他错误");
            page.setSkip(true);
            return;
        }

        String allZhishu = null;  // overall index
        String pcZhishu = null;   // PC index
        String wiseZhishu = null; // mobile index

        if (status == STATUS_OK) {
            allZhishu = firstGroup(ALL_AVG_PATTERN, docStr);
            pcZhishu = firstGroup(PC_AVG_PATTERN, docStr);
            wiseZhishu = firstGroup(WISE_AVG_PATTERN, docStr);
        } else if (status == STATUS_NOT_LOGGED_IN) {
            System.out.println("账号没有登录!");
            page.setSkip(true);
            return;
        } else if (status != STATUS_NO_INDEX) {
            System.out.println("出现了其他错误");
            page.setSkip(true);
            return;
        }

        String keywords = extractKeyword(page.getUrl().toString());

        StringBuilder record = new StringBuilder();
        record.append(keywords).append("#")
                .append(status).append("#")
                .append(allZhishu).append("#")
                .append(pcZhishu).append("#")
                .append(wiseZhishu);
        page.putField("data", record.toString());
    }

    /** Returns capture group 1 of the first match of {@code pattern} in {@code text}, or null if none. */
    private static String firstGroup(Pattern pattern, String text) {
        Matcher m = pattern.matcher(text);
        return m.find() ? m.group(1) : null;
    }

    /**
     * Recovers the keyword from a request URL built as {@code API_PREFIX + keyword + API_SUFFIX}.
     * Falls back to the whole URL when it does not have that shape.
     */
    private static String extractKeyword(String url) {
        boolean shaped = url.length() >= API_PREFIX.length() + API_SUFFIX.length()
                && url.startsWith(API_PREFIX)
                && url.endsWith(API_SUFFIX);
        if (!shaped) {
            return url;
        }
        return url.substring(API_PREFIX.length(), url.length() - API_SUFFIX.length());
    }

    private final Site site = Site.me()
            .setCharset("utf8")      // response charset
            .setTimeOut(1000)        // timeout in milliseconds
            .setCycleRetryTimes(10)  // cycle retry count
            .setSleepTime(300)       // pause between requests
            .setUserAgent(randomUserAgent()) // random User-Agent
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
            .addHeader("Cookie", baiduCookie);

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Reads the keyword list, runs one spider per keyword, and prints progress after the 10th,
     * the 100th and every 500th keyword, plus start and end timestamps.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // wall-clock start of the run
        long start = System.currentTimeMillis();
        DateFormat df = new SimpleDateFormat("yyyy年MM月dd日HH:mm:ss");
        startTime(start, df);

        List<String> keywords = TxtKeywords.getKeywords("D:\\data\\keywords.txt");
        if (keywords == null) {
            // getKeywords has already reported the read failure; nothing to query.
            endTime(start, df);
            return;
        }

        int size = keywords.size();
        for (int i = 0; i < size; i++) {
            String zhishuapi = API_PREFIX + keywords.get(i) + API_SUFFIX;
            // One spider per keyword keeps the queries sequential, which the progress report relies on.
            Spider.create(new BaiduZhishu())
                    .addUrl(zhishuapi)
                    .addPipeline(new TxtPipeline())
                    .thread(3)
                    .run();

            // report after 10, after 100, and after every 500 keywords
            int done = i + 1;
            if (done % 500 == 0 || done == 100 || done == 10) {
                runTime(start, size, i, df);
            }
        }
        endTime(start, df);
    }
}

UA和账号cookie

/**
 * Holds request parameters: a small pool of User-Agent strings and the Baidu cookie value.
 */
public class ParamsSite {
    /** Cookie value; empty by default. */
    public static String baiduCookie = "";

    /** Candidate User-Agent strings that {@link #randomUserAgent()} picks from. */
    private static List<String> agentPool = buildAgentPool();

    /** Builds the list of candidate User-Agent strings. */
    private static List<String> buildAgentPool() {
        List<String> pool = new ArrayList<>();
        pool.add("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36");
        pool.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36");
        return pool;
    }

    /**
     * Picks one User-Agent string from the pool at random.
     *
     * @return a randomly chosen User-Agent string
     */
    public static String randomUserAgent() {
        int index = new Random().nextInt(agentPool.size());
        return agentPool.get(index);
    }
}

查询结果输出到txt文本

/**
 * Pipeline that appends the {@code "data"} field of each result to
 * {@code D:\data\baiduzhishu.txt}, one record per line (CRLF terminated).
 */
public class TxtPipeline implements Pipeline {
    /**
     * Appends the result's {@code "data"} value to the output file.
     * Results without a {@code "data"} value are ignored. I/O failures are printed and
     * do not propagate, so one failed write does not stop the crawl.
     *
     * @param resultItems the extracted fields of one page
     * @param task        the running task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String data = resultItems.get("data");
        if (data == null) {
            // Nothing was extracted; avoid writing the literal text "null".
            return;
        }
        String line = data + "\r\n";

        File file = new File("D:\\data\\baiduzhishu.txt");
        // Append mode creates the file when it is missing; try-with-resources closes the
        // stream even when the write fails.
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream(file, true))) {
            // Encoded with the platform default charset, as before.
            out.write(line.getBytes());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

运行时间统计

package com.booy.utils;

import java.text.DateFormat;

/**
 * Console reports for a batch run: start banner, periodic progress with an estimate of the
 * remaining time, and an end banner with the total run time.
 */
public class RunTime {
    /**
     * Prints a progress report: items finished, elapsed time, current timestamp, and an
     * estimate of the time still needed.
     *
     * @param start       run start, in epoch milliseconds
     * @param keywordsLen total number of keywords in the run
     * @param i           zero-based index of the keyword that just finished ({@code i + 1} are done)
     * @param df          formatter for the timestamp line
     */
    public static void runTime(long start, int keywordsLen, int i, DateFormat df) {
        long end = System.currentTimeMillis();
        long time = end - start;
        String endDate = df.format(end);

        // i + 1 items are finished; average over that count (never divide by zero).
        int done = Math.max(1, i + 1);
        long timeAVG = time / done;
        // items still to be queried
        int unfinishNum = Math.max(0, keywordsLen - (i + 1));
        // estimated seconds for the remaining items
        long estimatedTime = timeAVG * unfinishNum / 1000;

        System.out.println("==============================================");
        System.out.println("查询成功" + (i + 1) + "条数据," + "所需时间:" + formatHms(time / 1000));
        System.out.println("时间:" + endDate);
        // estimated remaining time
        System.out.println("还剩余" + unfinishNum + "个关键词待查询,预计还需:" + formatHms(estimatedTime) + "完成");
        System.out.println("程序正在查询中...");

    }

    /**
     * Prints the end banner with the total run time and the current timestamp.
     *
     * @param start run start, in epoch milliseconds
     * @param df    formatter for the timestamp line
     */
    public static void endTime(long start, DateFormat df) {
        long end = System.currentTimeMillis();
        long time = (end - start) / 1000;
        String endDate = df.format(end);
        System.out.println("==============================================");
        System.out.println("程序运行结束!" + "总共运行时间:" + formatHms(time));
        System.out.println("时间:" + endDate);
    }

    /**
     * Prints the start banner with the start timestamp.
     *
     * @param start run start, in epoch milliseconds
     * @param df    formatter for the timestamp line
     */
    public static void startTime(long start, DateFormat df) {
        String startDate = df.format(start);
        System.out.println("系统开始运行:数据查询中...");
        System.out.println("运行时间:" + startDate);
    }

    /** Formats a duration in seconds as {@code <h>h:<m>m:<s>S}. */
    private static String formatHms(long totalSeconds) {
        long s = totalSeconds % 60;
        long m = (totalSeconds / 60) % 60;
        long h = totalSeconds / (60 * 60);
        return h + "h:" + m + "m:" + s + "S";
    }
}

txt文件读取关键词

public class TxtKeywords {
    public static List<String> getKeywords(String filepath){
        try{
            String temp = null;
            File f = new File(filepath);
//指定读取编码用于读取中文
            InputStreamReader read = new InputStreamReader(new FileInputStream(f),"GBK");
            List<String> readList = new ArrayList<>();
            BufferedReader reader=new BufferedReader(read);
            while((temp=reader.readLine())!=null &&!"".equals(temp)){
                readList.add(temp);
            }
            reader.close();
            return readList;
        }catch (Exception e) {
            System.out.println("文件路径错误或者文件不存在");
            e.printStackTrace();
            return null;
        }
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值