package com.mengkeng.selenium_demo.test;
import com.alibaba.fastjson.JSON;
import com.mengkeng.selenium_demo.entity.BuildAreaUrlLj;
import com.mengkeng.selenium_demo.entity.IdAndNamePO;
import com.mengkeng.selenium_demo.entity.TkBuildingsAreaInfolj;
import com.mengkeng.selenium_demo.entity.TkBuildingsMonthPriceLj;
import com.mengkeng.selenium_demo.mapper.BuildAreaUrlLjMapper;
import com.mengkeng.selenium_demo.service.ProxyService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateFormatUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.PageLoadStrategy;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.HashOperations;
import org.springframework.data.redis.core.SetOperations;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
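/**
* REST controller (mapped under {@code areaInfo}) that drives headless Chrome through Selenium
* to crawl Lianjia residential-community ("xiaoqu") pages.
*
* Date: 2022-09-05 13:58
* Description: residential community (xiaoqu) information
*/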
@RestController
@RequestMapping("areaInfo")
@Slf4j
public class LianjiaAreaInfoDemo {
/** Redis client; the visible code obtains its set and hash operations from it. */
@Autowired
private StringRedisTemplate redisTemplate;
/** Mapper used to load the {@link BuildAreaUrlLj} rows whose area URLs are crawled. */
@Autowired
private BuildAreaUrlLjMapper buildAreaUrlLjMapper;
/** Supplies the proxy address used when a new browser session is created. */
@Autowired
private ProxyService proxyService;
/** Redis set key; area URLs that are members of this set are skipped when scheduling work. */
public static final String SKIP_URLS = "SKIP_URLS_AREAINFO_LIANJIA";
/** Redis key name. NOTE(review): not referenced in the visible part of the file - confirm its use. */
public static final String URLS = "URLS_AREAINFO_LIANJIA";
/** Redis key name. NOTE(review): not referenced in the visible part of the file - presumably tracks community codes; confirm. */
public static final String AREA_INFO_COMMUNITY_CODE_LJ = "AREA_INFO_COMMUNITY_CODE_LJ";
/** Shared, static list of strings. NOTE(review): not referenced in the visible part of the file; LinkedList is not thread-safe - confirm how it is accessed. */
private static LinkedList<String> pages = new LinkedList<>();
// Worker pool that runs one page-parsing task per area URL: core size 2, maximum size 10,
// 30-second keep-alive. The work queue is an unbounded LinkedBlockingQueue; per the
// ThreadPoolExecutor contract, extra threads beyond the core size are only created when the
// queue is full, so with this queue the pool stays at 2 threads.
ThreadPoolExecutor pagepoolExecutor = new ThreadPoolExecutor(2,
10, 30L,
TimeUnit.SECONDS, new LinkedBlockingQueue<>());
/**
 * Starts a crawl run (mapped to {@code areaInfo/sync}).
 *
 * <p>Each attempt opens a browser session, collects the area URLs via {@code getUrls} and
 * schedules page parsing via {@code parsePagePre}. If an attempt fails, the failure is
 * reported, the thread waits 10 seconds and the whole attempt is repeated until one succeeds.
 *
 * <p>Note: {@code parsePagePre} only submits tasks to the worker pool, so the final
 * "done" message is printed once scheduling has finished, not when the tasks have completed.
 *
 * @throws InterruptedException if the thread is interrupted during an attempt or while
 *         waiting before a retry
 */
@RequestMapping("sync")
public void sync() throws InterruptedException {
    System.setProperty("webdriver.chrome.driver", "D://chromedriver.exe");
    boolean flag = false;
    while (!flag) {
        try {
            ChromeDriver driver = getChromeDriver();
            SetOperations ops = redisTemplate.opsForSet();
            try {
                getUrls(driver, ops);
                parsePagePre(ops);
            } finally {
                // Nested finally: the browser must be closed even if the pause itself fails,
                // otherwise every failed attempt would leak a Chrome process.
                try {
                    sleep(1000);
                } finally {
                    driver.quit();
                }
            }
        } catch (Exception e) {
            // An interrupt is a request to stop: propagate it instead of retrying forever.
            if (e instanceof InterruptedException) {
                throw (InterruptedException) e;
            }
            // Report the failure; silently swallowing it made endless retry loops undiagnosable.
            System.err.println("sync attempt failed, retrying in 10s: " + e);
            e.printStackTrace();
            Thread.sleep(10000);
            continue;
        }
        flag = true;
    }
    System.out.println("完成");
}
/**
 * Creates a new headless, incognito Chrome session.
 *
 * <p>The session has image loading disabled, uses the eager page-load strategy, restricts
 * WebRTC to proxied traffic through the preferences set below, and presents a user agent
 * chosen at random from a fixed list. When the proxy service returns a non-blank value other
 * than {@code "local"}, that value is passed to Chrome as the proxy server.
 *
 * @return a newly started {@link ChromeDriver}; the caller is responsible for calling
 *         {@code quit()} on it
 */
private ChromeDriver getChromeDriver() {
    String nextProxy = proxyService.getNextProxy();
    System.out.println("当前ip是" + nextProxy);
    String[] arr = {"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
            "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"};
    ChromeOptions chromeOptions = new ChromeOptions();
    chromeOptions.setPageLoadStrategy(PageLoadStrategy.EAGER);
    chromeOptions.addArguments("--incognito");
    chromeOptions.addArguments("--blink-settings=imagesEnabled=false");
    chromeOptions.addArguments("--headless");
    chromeOptions.addArguments("--no-sandbox");
    chromeOptions.addArguments("--disable-gpu");
    // "local" is the marker for "connect directly, without a proxy".
    if (StringUtils.isNotBlank(nextProxy) && !nextProxy.equals("local")) {
        chromeOptions.addArguments("--proxy-server=" + nextProxy);
    }
    Map<String, Object> map = new HashMap<>();
    map.put("webrtc.ip_handling_policy", "disable_non_proxied_udp");
    map.put("webrtc.multiple_routes_enabled", false);
    map.put("webrtc.nonproxied_udp_enabled", false);
    chromeOptions.setExperimentalOption("prefs", map);
    Random random = new Random();
    // Chrome only honours the "--user-agent=" switch; the previous "User-Agent=" argument was
    // not a switch, so the randomly chosen user agent was never applied. The bound is
    // arr.length so the list can grow or shrink without touching this line.
    chromeOptions.addArguments("--user-agent=" + arr[random.nextInt(arr.length)]);
    ChromeDriver driver = new ChromeDriver(chromeOptions);
    driver.manage().window().maximize();
    return driver;
}
/**
 * Loads the area URLs from the database and submits one page-parsing task per area to the
 * worker pool, skipping areas whose URL is already a member of the {@code SKIP_URLS} Redis set.
 *
 * <p>Only rows with index 1 (inclusive) to 3500 (exclusive) are scheduled; both bounds are
 * clamped to the list size so a short or empty result no longer throws.
 *
 * @param ops Redis set operations used for the skip check and passed on to each task
 */
private void parsePagePre(SetOperations ops) {
    HashOperations<String, Object, Object> opsForHash = redisTemplate.opsForHash();
    List<BuildAreaUrlLj> buildAreaUrlLjs = buildAreaUrlLjMapper.selectList(null);
    if (buildAreaUrlLjs == null || buildAreaUrlLjs.isEmpty()) {
        return;
    }
    int size = buildAreaUrlLjs.size();
    // NOTE(review): the original window starts at index 1, so the first row is never crawled.
    // That is preserved here, but it looks like a possible off-by-one - confirm the intent.
    int from = Math.min(1, size);
    int to = Math.min(3500, size);
    List<BuildAreaUrlLj> buildAreaUrlLjs1 = buildAreaUrlLjs.subList(from, to);
    for (BuildAreaUrlLj buildAreaUrlLj : buildAreaUrlLjs1) {
        // isMember returns a boxed Boolean; compare null-safely instead of auto-unboxing.
        if (Boolean.TRUE.equals(ops.isMember(SKIP_URLS, buildAreaUrlLj.getAreaUrl()))) {
            System.out.println("跳过当前区域" + buildAreaUrlLj.getCityName() + "-" + buildAreaUrlLj.getCountyName());
            continue;
        }
        pagepoolExecutor.execute(() -> parsePage(ops, opsForHash, buildAreaUrlLj));
    }
}
/**
* 解析列表
* @param ops
* @param opsForHash
* @param buildAreaUrlLj
*/
private void parsePage(SetOperations ops, HashOperations<String, Object, Object> opsForHash, BuildAreaUrlLj buildAreaUrlLj) {
ChromeDriver driver = getChromeDriver();
try {
driver.get(buildAreaUrlLj.getAreaUrl());
String windowHandlePage = driver.getWindowHandle();
WebElement totalNumStr = validElement("//h2[@class='total fl']/span", driver);
if (null != totalNumStr) {
Integer total = Integer.valueOf(totalNumStr.getText());
// 有数据
if (total > 1) {
String pageData = driver.findElement(By.xpath("//div[@class='page-box house-lst-page-box']")).getAttribute("page-data");
Integer pageNumStr = Integer.valueOf(JSON.parseObject(pageData).getString("totalPage"));
System.out.println("当前区域页数" + pageNumStr + "---" + buildAreaUrlLj.getAreaUrl());
for (int x = 1; x <= pageNumStr; x++) {
List<WebElement> elements = driver.findElements(By.xpath("//ul[@class='listContent']/li/div[1]/div[1]/a"));
for (int i = 0; i < elements.size(); i++) {
WebElement item = elements.get(i);
String code = "";
Pattern compile1 = Pattern.compile("xiaoqu/(\\w+)/");
Matcher matcher1 = compile1.matcher(item.getAttribute("href"));
while (matcher1.find()) {
code = matcher1.group(1);
}
driver.executeScript("arguments[0].click();", item);
sleepAndCutoverNewPage(300, driver);
评论0