1.安装依赖以及相应驱动
pip install playwright
python -m playwright install
2.参考示例
# -*- coding: utf-8 -*-
# @Time : 2021/05/07 14:50
# @Author :
from playwright.sync_api import sync_playwright
def run(playwright):
    """Search Baidu for "jingdong" in a visible Chromium and open the JD.com result.

    Args:
        playwright: A started synchronous Playwright object, such as the one
            yielded by ``with sync_playwright() as playwright``.

    The browser is always closed, even when one of the page actions raises.
    """
    import time

    browser = playwright.chromium.launch(headless=False)
    try:
        # Open a new page (a fresh context is created implicitly).
        page = browser.new_page()
        page.goto("https://www.baidu.com/")
        page.click('input[name="wd"]')
        page.fill('input[name="wd"]', 'jingdong')
        page.click('text="京东"')
        # page.content() would return the current page HTML at this point.
        # NOTE(review): expect_navigation() waits for a navigation on the
        # original page while the click is expected to open a popup; confirm
        # the original page really navigates, otherwise this wait times out.
        with page.expect_navigation():
            with page.expect_popup() as popup_info:
                # normalize-space() strips leading/trailing whitespace and
                # line breaks from the link text before comparing.
                page.click("//a[normalize-space(.)='京东JD.COM官网 多快好省 只为品质生活']")
        # popup_info.value is the newly opened page, if it is needed.
        # Keep the windows open for a while so the result can be inspected.
        time.sleep(10)
    finally:
        browser.close()
# Script entry for the synchronous example: start the Playwright driver,
# hand it to run(), and let the context manager stop the driver afterwards.
with sync_playwright() as playwright:
    run(playwright)
参考:https://pypi.org/project/playwright/
参考:https://www.imooc.com/article/314264
3.playwright异步爬虫
# -*- coding: utf-8 -*-
# @Time : 2023/6/2 9:52
# @Author : Cocktail_py
import json
import asyncio
from playwright.async_api import async_playwright
'''
playwright==1.18.1
'''
async def on_request(request):
    """Handle a finished request; currently a deliberate no-op.

    Args:
        request: The request object passed by the page event.
    """
    # Example of what could be done here:
    #   all_headers_dict = await request.all_headers()
    #   print(dict(all_headers_dict))
    return None
async def on_response(response):
    """Print the JSON body of any response whose URL contains ``ajax-get-res-v5``.

    Args:
        response: Object exposing a ``url`` string and an awaitable ``json()``.

    Responses with other URLs are ignored.
    """
    if 'ajax-get-res-v5' not in response.url:
        return
    payload = await response.json()
    # Re-serialise so the body is printed as one JSON text line.
    print(json.dumps(payload))
async def start(kw='n200'):
    """Open a Baidu search for *kw* in Chromium and print how often a marker div matches.

    Args:
        kw: Search keyword, interpolated into the ``wd`` query parameter as-is.

    Prints ``kw``, the literal ``'style'`` and the number of elements matching
    the ``search-service-arrest-bc`` locator. The browser and the Playwright
    driver are shut down even if a step fails.
    """
    # Hide navigator.webdriver from page scripts.
    js = "\nObject.defineProperties(navigator, {webdriver:{get:()=>undefined}});\n"

    # The context manager stops the Playwright driver on exit.
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(
            headless=False,
            # Proxy settings; "xxxx" is a placeholder host to be replaced.
            proxy={
                "server": "http://xxxx:10010",
                "username": "",
                "password": "",
            },
            # Flags intended to reduce automation fingerprints.
            # NOTE(review): "--enable-automation=true" looks at odds with that
            # goal; confirm whether it should be dropped.
            args=[
                "--enable-automation=true",
                '--disable-blink-features=AutomationControlled',
            ],
        )
        try:
            # Create the page from this context so that ignore_https_errors,
            # the viewport and the user agent all apply to the same page.
            context = await browser.new_context(
                ignore_https_errors=True,
                viewport={'width': 500, 'height': 500},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
            )
            page = await context.new_page()
            await page.add_init_script(js)
            page.on("requestfinished", on_request)
            page.on('response', on_response)
            # 0 disables the default timeout for page operations.
            page.set_default_timeout(0)
            await page.goto(f"https://www.baidu.com/s?wd={kw}")
            # Wait until the page has finished loading.
            await page.wait_for_load_state('networkidle')
            # count() reports whether the element exists (1 if present, 0 if
            # not); get_attribute() on a missing element would block instead.
            count = await page.locator('//div[@class="search-service-arrest-bc" and @style="display: block;"]').count()
            print(kw, 'style', count)
        finally:
            # Closing the browser also closes its contexts and pages.
            await browser.close()
async def main():
    """Schedule one ``start`` task per keyword and wait until all have finished."""
    keywords = ["n200"]
    # Wrap each crawl in a task so they run concurrently on the event loop.
    pending = [asyncio.create_task(start(word)) for word in keywords]
    await asyncio.wait(pending)
if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards; it replaces the deprecated pattern of calling
    # asyncio.get_event_loop() when no loop is running.
    asyncio.run(main())
# playwright undetectable
# https://github.com/QIN2DIM/undetected-playwright/tree/main
# https://www.jianshu.com/p/ce0919d5b47f
# 参考:https://blog.csdn.net/qq_27371025/article/details/129766836
# 参考:https://blog.csdn.net/weixin_50829653/article/details/130683629
# 参考:https://playwright.dev/docs/intro
# https://github.com/microsoft/playwright/issues/12267
# https://yifei.me/note/849
# https://intoli.com/blog/making-chrome-headless-undetectable/