Python 微软开源自动化工具 Playwright

本文介绍了如何使用Playwright库进行浏览器自动化与异步爬虫开发,包括安装依赖、同步API示例、创建异步任务,以及在Chromium环境下设置代理和隐藏自动化特征。通过示例展示了如何模拟浏览器行为、抓取网页内容并进行网络请求分析。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

1.安装依赖以及相应驱动
pip install playwright
python -m playwright install
2.参考示例
# -*- coding: utf-8 -*-
# @Time    : 2021/05/07 14:50
# @Author  : 

from playwright.sync_api import sync_playwright


def run(playwright):
    """Drive a visible Chromium window through a Baidu search demo.

    Opens the Baidu page, types ``jingdong`` into the search box, clicks the
    matching text entry, then clicks the result link while waiting for a
    navigation and a popup. The browser is always closed, even when one of
    the steps raises.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
    """
    browser = playwright.chromium.launch(headless=False)
    try:
        page = browser.new_page()

        page.goto("https://www.baidu.com/")

        page.click('input[name="wd"]')
        page.fill('input[name="wd"]', 'jingdong')
        page.click('text="京东"')

        # page.content() would return the current page's HTML at this point.
        with page.expect_navigation():
            with page.expect_popup() as popup_info:
                # XPath normalize-space() strips leading/trailing whitespace
                # and line breaks from the link text before comparing.
                page.click("//a[normalize-space(.)='京东JD.COM官网 多快好省 只为品质生活']")
        # popup_info.value gives access to the popup (unused in this demo).

        # Keep the window open for 10 s so the result can be inspected.
        # wait_for_timeout is used instead of time.sleep so the Playwright
        # sync driver keeps processing browser events while waiting.
        page.wait_for_timeout(10000)
    finally:
        # Release the browser process even if a step above failed.
        browser.close()


# Start the sync Playwright driver, run the demo once, and let the context
# manager shut the driver down afterwards.
with sync_playwright() as pw:
    run(pw)

在这里插入图片描述

参考:https://pypi.org/project/playwright/
参考:https://www.imooc.com/article/314264

3.playwright异步爬虫
# -*- coding: utf-8 -*-
# @Time    : 2023/6/2 9:52
# @Author  : Cocktail_py
import json
import asyncio
from playwright.async_api import async_playwright

'''
playwright==1.18.1
'''

async def on_request(request):
    """Request event callback; intentionally does nothing.

    Args:
        request: The request object passed in by the event emitter.
    """
    # To dump the request headers instead, use:
    #     all_headers_dict = await request.all_headers()
    #     print(dict(all_headers_dict))
    return None
    
async def on_response(response):
    """Print the JSON body of responses whose URL contains ``ajax-get-res-v5``.

    Responses with any other URL are ignored.

    Args:
        response: Object exposing a ``url`` string and an awaitable ``json()``.
    """
    if 'ajax-get-res-v5' not in response.url:
        return
    payload = await response.json()
    # Re-serialise so the output is a single JSON line.
    print(json.dumps(payload))

async def start(kw='n200'):
    """Open a Baidu search for *kw* in proxied Chromium and report a marker.

    Launches a headed Chromium through the configured proxy, opens the search
    results page, waits for the network to go idle, then prints how many
    elements match the ``search-service-arrest-bc`` XPath below.

    The page is created from the explicitly configured context so that
    ``ignore_https_errors`` actually applies to it, and the browser and the
    Playwright driver are shut down even if a step raises.

    Args:
        kw: Search keyword; interpolated into the URL as-is (not URL-escaped).
    """
    # Hide the automation fingerprint: make navigator.webdriver undefined.
    # Alternatives seen in the wild: return false from the getter, or delete
    # the property from navigator.__proto__.
    js = """
                    Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
                    """

    # `async with` stops the Playwright driver process on exit.
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(
            headless=False,
            # Route all traffic through the proxy.
            proxy={
                "server": "http://xxxx:10010",
                "username": "",
                "password": "",
            },
            # Flags intended to reduce automation fingerprints.
            # NOTE(review): "--enable-automation=true" looks contrary to that
            # intent — confirm whether it should be dropped.
            args=[
                "--enable-automation=true",
                '--disable-blink-features=AutomationControlled',
            ],
        )
        try:
            # Viewport and user agent are set on the context, and the page is
            # created from this same context; browser.new_page() would create
            # a separate context that ignores these options.
            context = await browser.new_context(
                ignore_https_errors=True,
                viewport={'width': 500, 'height': 500},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
            )
            page = await context.new_page()
            await page.add_init_script(js)

            page.on("requestfinished", on_request)
            page.on('response', on_response)
            # 0 disables Playwright's default timeouts for this page.
            page.set_default_timeout(0)
            await page.goto(f"https://www.baidu.com/s?wd={kw}")
            # Wait until the page has finished loading (network idle).
            await page.wait_for_load_state('networkidle')
            # Existence check via count(): 1 when the element is present,
            # 0 when it is not. get_attribute() on a missing element would
            # block instead, so it is avoided here.
            count = await page.locator('//div[@class="search-service-arrest-bc" and @style="display: block;"]').count()
            print(kw, 'style', count)
        finally:
            # Closing the browser also closes its contexts and pages.
            await browser.close()

async def main():
    """Run one ``start`` crawl per keyword concurrently and surface failures.

    Raises:
        BaseException: The first exception raised by any crawl, re-raised
            only after every crawl has finished.
    """
    keywords = ["n200"]
    # return_exceptions=True lets every crawl run to completion before a
    # failure is reported; asyncio.wait() would leave task exceptions
    # unretrieved and main() would finish as if nothing went wrong.
    results = await asyncio.gather(
        *(start(kw) for kw in keywords),
        return_exceptions=True,
    )
    for result in results:
        if isinstance(result, BaseException):
            raise result


if __name__ == '__main__':
    # asyncio.run creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards; calling asyncio.get_event_loop() with no
    # running loop is deprecated in recent Python versions.
    asyncio.run(main())


# playwright undetectable
# https://github.com/QIN2DIM/undetected-playwright/tree/main

# https://www.jianshu.com/p/ce0919d5b47f
# 参考:https://blog.csdn.net/qq_27371025/article/details/129766836
# 参考:https://blog.csdn.net/weixin_50829653/article/details/130683629
# 参考:https://playwright.dev/docs/intro
# https://github.com/microsoft/playwright/issues/12267
# https://yifei.me/note/849
# https://intoli.com/blog/making-chrome-headless-undetectable/
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Cocktail_py

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值