需求:本地有一批关键词,需要查询百度指数。5118一次只能查100条,5W条关键词手动查下来太费劲;付费接口大约0.013元1条,网上找到的百度指数接口也都不怎么便宜,那就自己动手了,用webmagic实现也很方便快捷。
1、配置
引入webmagic的两个包(core和extension),打包成jar后可以挂在服务器上慢慢运行。
<dependencies>
<!-- webmagic核心包 -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.4</version>
</dependency>
<!-- webmagic扩展包 -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.5.5</version>
<configuration>
<archive>
<manifest>
<mainClass>com.booy.task.BaiduZhishu</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</plugin>
</plugins>
</build>
log4j配置文件(log4j.properties):
log4j.rootLogger=INFO,A1
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
2、主类
/**
 * WebMagic page processor that queries the Baidu index search API for one keyword
 * per request and emits a single {@code #}-separated record under the result field
 * {@code "data"}:
 * {@code keyword#statusCode#allAvg#pcAvg#wiseAvg}.
 *
 * <p>{@link #main(String[])} reads the keyword list from disk and runs one spider per
 * keyword, printing progress at fixed milestones.
 */
public class BaiduZhishu implements PageProcessor {

    /** Fixed part of the API URL that precedes the keyword. */
    private static final String API_PREFIX =
            "https://zhishu.baidu.com/api/SearchApi/index?area=0&word=[[%7B%22name%22:%22";

    /** Fixed part of the API URL that follows the keyword. */
    private static final String API_SUFFIX = "%22,%22wordType%22:1%7D]]&days=90";

    /** Response status: the request succeeded and index values are present. */
    private static final int STATUS_OK = 0;

    /** Response status: the keyword has no index. */
    private static final int STATUS_NO_INDEX = 10002;

    /** Response status: the account is not logged in. */
    private static final int STATUS_NOT_LOGGED_IN = 10000;

    // Compiled once; group(1) captures the digits that the lookahead requires to be followed by ','.
    private static final Pattern STATUS_PATTERN = Pattern.compile("status\":(\\d+)(?=,)");
    private static final Pattern ALL_AVG_PATTERN = Pattern.compile("all\":\\{\"avg\":(\\d*?)(?=,)");
    private static final Pattern PC_AVG_PATTERN = Pattern.compile("pc\":\\{\"avg\":(\\d*?)(?=,)");
    private static final Pattern WISE_AVG_PATTERN = Pattern.compile("wise\":\\{\"avg\":(\\d*?)(?=,)");

    /**
     * Extracts the status and the average index values from the API response and stores
     * the resulting record in the page's {@code "data"} field.
     *
     * <p>The page is skipped (no record is emitted) when the response carries no status,
     * when the account is not logged in, or when the status is unrecognised.
     *
     * @param page the downloaded API response
     */
    @Override
    public void process(Page page) {
        // The response is JSON, so match against the raw body instead of an HTML DOM rendering of it.
        String body = page.getRawText();
        if (body == null) {
            body = "";
        }

        Matcher statusMatcher = STATUS_PATTERN.matcher(body);
        if (!statusMatcher.find()) {
            // No status in the response at all: treat like any other unexpected error
            // instead of failing with IllegalStateException on group().
            page.setSkip(true);
            System.out.println("出现了其他错误");
            return;
        }
        int status = Integer.parseInt(statusMatcher.group(1));

        String allZhishu = null;  // overall index
        String pcZhishu = null;   // PC index
        String wiseZhishu = null; // mobile index
        int statusCode;
        if (status == STATUS_OK) {
            allZhishu = firstGroup(ALL_AVG_PATTERN, body);
            pcZhishu = firstGroup(PC_AVG_PATTERN, body);
            wiseZhishu = firstGroup(WISE_AVG_PATTERN, body);
            statusCode = STATUS_OK;
        } else if (status == STATUS_NO_INDEX) {
            statusCode = STATUS_NO_INDEX;
        } else if (status == STATUS_NOT_LOGGED_IN) {
            System.out.println("账号没有登录!");
            page.setSkip(true);
            return;
        } else {
            page.setSkip(true);
            System.out.println("出现了其他错误");
            return;
        }

        String keywords = extractKeyword(page.getUrl().toString());
        // Absent values are written as the literal text "null", one record per keyword.
        StringBuilder record = new StringBuilder();
        record.append(keywords).append("#")
                .append(statusCode).append("#")
                .append(allZhishu).append("#")
                .append(pcZhishu).append("#")
                .append(wiseZhishu);
        page.putField("data", record.toString());
    }

    /** Returns capture group 1 of the first match of {@code pattern} in {@code text}, or null. */
    private static String firstGroup(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Recovers the keyword from a request URL built as {@code API_PREFIX + keyword + API_SUFFIX}.
     * A URL that does not have that shape is returned unchanged.
     */
    private static String extractKeyword(String url) {
        int minLength = API_PREFIX.length() + API_SUFFIX.length();
        if (url.length() >= minLength && url.startsWith(API_PREFIX) && url.endsWith(API_SUFFIX)) {
            return url.substring(API_PREFIX.length(), url.length() - API_SUFFIX.length());
        }
        return url;
    }

    private final Site site = Site.me()
            .setCharset("utf8")               // response charset
            .setTimeOut(1000)                 // timeout in milliseconds
            .setCycleRetryTimes(10)           // cycle-retry count
            .setSleepTime(300)                // pause between requests
            .setUserAgent(randomUserAgent())  // random User-Agent, chosen once per processor instance
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
            .addHeader("Cookie", baiduCookie);

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Reads the keyword list, queries the index for every keyword in order and prints
     * progress after the 10th, the 100th and every 500th keyword.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Wall-clock start of the run, used for all elapsed/estimated time reports.
        long start = System.currentTimeMillis();
        DateFormat df = new SimpleDateFormat("yyyy年MM月dd日HH:mm:ss");
        startTime(start, df);

        List<String> keywords = TxtKeywords.getKeywords("D:\\data\\keywords.txt");
        // A failed read may yield no list at all; treat that as "nothing to query".
        int size = (keywords == null) ? 0 : keywords.size();

        for (int i = 0; i < size; i++) {
            String keyword = keywords.get(i);
            String zhishuapi = API_PREFIX + keyword + API_SUFFIX;
            // A fresh processor per keyword keeps the per-instance random User-Agent rotating.
            Spider.create(new BaiduZhishu())
                    .addUrl(zhishuapi)
                    .addPipeline(new TxtPipeline())
                    .thread(3)
                    .run();
            // Report after the 10th, the 100th and every 500th keyword.
            int done = i + 1;
            if (done % 500 == 0 || done == 100 || done == 10) {
                runTime(start, size, i, df);
            }
        }
        endTime(start, df);
    }
}
UA和账号cookie
/**
 * Request parameters shared by the crawler: a pool of User-Agent strings to pick from
 * and the cookie string sent with each request.
 */
public class ParamsSite {

    // Candidate User-Agent strings; filled once when the class is initialised.
    private static List<String> agentPool = new ArrayList<>();

    static {
        agentPool.add("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36");
        agentPool.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36");
    }

    /**
     * Picks one User-Agent string from the pool at random.
     *
     * @return one of the configured User-Agent strings
     */
    public static String randomUserAgent() {
        Random rng = new Random();
        int pick = rng.nextInt(agentPool.size());
        return agentPool.get(pick);
    }

    // Account cookie; empty by default.
    public static String baiduCookie = "";
}
查询结果输出到txt文本
/**
 * Pipeline that appends each result record to a text file, one record per line.
 */
public class TxtPipeline implements Pipeline {

    /**
     * Appends the {@code "data"} field of {@code resultItems}, followed by CRLF, to
     * {@code D:\data\baiduzhishu.txt}. I/O failures are printed and otherwise ignored so
     * that one failed write does not abort the crawl.
     *
     * @param resultItems results of the processed page; the {@code "data"} field is written
     * @param task the running task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String data = resultItems.get("data");
        String dataZhishu = data + "\r\n";

        File file = new File("D:\\data\\baiduzhishu.txt");
        // Append mode creates the file when it is missing; try-with-resources closes the
        // stream even when the write fails.
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream(file, true))) {
            // Bytes are produced with the platform default charset.
            out.write(dataZhishu.getBytes());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
运行时间统计
package com.booy.utils;
import java.text.DateFormat;
/**
 * Console reporting helpers for a long-running batch: start banner, periodic progress
 * with a remaining-time estimate, and end banner.
 */
public class RunTime {

    /**
     * Prints how many items are done, the elapsed time, the current time and an estimate
     * of the time still needed.
     *
     * @param start       start of the run, in epoch milliseconds
     * @param keywordsLen total number of items in the batch
     * @param i           zero-based index of the item that has just finished
     * @param df          format used for the printed timestamp
     */
    public static void runTime(long start, int keywordsLen, int i, DateFormat df) {
        long end = System.currentTimeMillis();
        long time = end - start; // elapsed milliseconds
        String endDate = df.format(end);

        // i is zero-based, so i + 1 items are complete; averaging over that count also
        // avoids a division by zero for the very first item.
        int finished = i + 1;
        long timeAVG = finished > 0 ? time / finished : 0;
        // Items still to be queried.
        int unfinishNum = keywordsLen - finished;
        // Estimated seconds needed for the remaining items.
        long estimatedTime = timeAVG * unfinishNum / 1000;

        System.out.println("==============================================");
        System.out.println("查询成功" + finished + "条数据," + "所需时间:" + formatHms(time / 1000));
        System.out.println("时间:" + endDate);
        System.out.println("还剩余" + unfinishNum + "个关键词待查询,预计还需:" + formatHms(estimatedTime) + "完成");
        System.out.println("程序正在查询中...");
    }

    /**
     * Prints the end banner with the total running time and the current time.
     *
     * @param start start of the run, in epoch milliseconds
     * @param df    format used for the printed timestamp
     */
    public static void endTime(long start, DateFormat df) {
        long end = System.currentTimeMillis();
        long time = (end - start) / 1000; // elapsed seconds
        String endDate = df.format(end);
        System.out.println("==============================================");
        System.out.println("程序运行结束!" + "总共运行时间:" + formatHms(time));
        System.out.println("时间:" + endDate);
    }

    /**
     * Prints the start banner with the start time.
     *
     * @param start start of the run, in epoch milliseconds
     * @param df    format used for the printed timestamp
     */
    public static void startTime(long start, DateFormat df) {
        String startDate = df.format(start);
        System.out.println("系统开始运行:数据查询中...");
        System.out.println("运行时间:" + startDate);
    }

    /** Formats a duration in seconds as {@code <hours>h:<minutes>m:<seconds>S}. */
    private static String formatHms(long totalSeconds) {
        long s = totalSeconds % 60;
        long m = (totalSeconds / 60) % 60;
        long h = totalSeconds / (60 * 60);
        return h + "h:" + m + "m:" + s + "S";
    }
}
txt文件读取关键词
/**
 * Loads the keyword list from a GBK-encoded text file, one keyword per line.
 */
public class TxtKeywords {

    /**
     * Reads every non-blank line of the file as one keyword.
     *
     * <p>Blank lines are skipped rather than ending the read, so a stray empty line in
     * the middle of the file does not silently drop the keywords after it.
     *
     * @param filepath path of the keyword file
     * @return the keywords in file order; an empty list when the file cannot be read
     */
    public static List<String> getKeywords(String filepath) {
        List<String> readList = new ArrayList<>();
        // GBK is specified explicitly so that Chinese keywords are decoded correctly;
        // try-with-resources closes the reader on every path.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(filepath)), "GBK"))) {
            String temp;
            while ((temp = reader.readLine()) != null) {
                if (temp.trim().isEmpty()) {
                    continue;
                }
                readList.add(temp);
            }
            return readList;
        } catch (Exception e) {
            System.out.println("文件路径错误或者文件不存在");
            e.printStackTrace();
            // An empty list lets callers iterate without a null check.
            return new ArrayList<>();
        }
    }
}