【html格式化 re,bs4格式html,超牛逼!】

通过 bs4 和 re 格式化 HTML 代码,可以控制逐层展开(递归)的最大深度;
每个标签的所有属性(attrs)都合并到同一行中,避免多行属性造成排版混乱。

import re
from bs4 import BeautifulSoup, Tag, NavigableString

# Matches any run of whitespace (spaces, tabs, newlines, ...).
sub_place_pattern = re.compile(r"\s+")

# When true, a tag that has no child tags is kept on a single line.
KEEP_LAST_TAG_ONE_LINE = True
# Whitespace used for one level of indentation (two spaces).
INDENT_SPACE = "  "


def flatten_attributes(tag: Tag) -> str:
    """Build the opening tag for ``tag`` with every attribute on one line.

    List-valued attributes are joined with single spaces; any other value is
    stringified and stripped.  Runs of whitespace inside a value collapse to
    one space, and ``&``, ``<``, ``>`` and ``"`` are escaped so a value can
    neither terminate its own quotes nor be mistaken for markup.

    Args:
        tag: Element whose ``name`` and ``attrs`` are serialized.

    Returns:
        The opening tag, e.g. ``<div class="a b" id="x">``, or just
        ``<div>`` when the element carries no attributes.
    """
    # Imported locally: further down this file the module-level name ``html``
    # is bound to a sample document string, which would shadow the module.
    from html import escape

    rendered = []
    for attr, val in tag.attrs.items():
        val = ' '.join(val) if isinstance(val, list) else str(val).strip()
        val = sub_place_pattern.sub(' ', val)
        # quote=False keeps apostrophes readable; the double quote is escaped
        # separately because it is the delimiter used below.
        val = escape(val, quote=False).replace('"', '&quot;')
        rendered.append(f'{attr}="{val}"')

    if not rendered:
        # Avoid the stray space of "<name >" when there is nothing to list.
        return f"<{tag.name}>"
    return f"<{tag.name} {' '.join(rendered)}>"

def format_tag(tag: Tag, current_depth: int, max_depth: int, indent: str = INDENT_SPACE) -> str:
    """Render ``tag`` as HTML, expanding children onto their own lines.

    Children are expanded one per line until ``current_depth`` reaches
    ``max_depth``; from there on the whole subtree is collapsed onto a single
    line.  When ``KEEP_LAST_TAG_ONE_LINE`` is true, a tag without child tags
    is always kept on one line regardless of depth.

    Args:
        tag: Node to render.  Non-``Tag`` nodes are returned as one
            whitespace-collapsed line.
        current_depth: Nesting level of ``tag``; child lines are indented by
            ``current_depth + 1`` levels.
        max_depth: Depth at which expansion stops.
        indent: String used for one indentation level.

    Returns:
        The rendered markup.  The first line carries no leading indentation;
        the caller is expected to prefix it.
    """
    if not isinstance(tag, Tag):
        # output_ready() re-escapes entities and restores comment/doctype
        # delimiters, both of which a plain str() of the node would drop.
        raw = tag.output_ready() if isinstance(tag, NavigableString) else str(tag)
        return sub_place_pattern.sub(' ', raw).strip()

    # Leaf shortcut: no contents at all, or nothing but strings.
    has_child_tag = any(isinstance(child, Tag) for child in tag.contents)
    if KEEP_LAST_TAG_ONE_LINE and not has_child_tag:
        return sub_place_pattern.sub(' ', str(tag)).strip()

    start_tag = flatten_attributes(tag)
    end_tag = f"</{tag.name}>"

    if not tag.contents:
        # Void elements (<br>, <img>, ...) must not receive a closing tag.
        if tag.is_empty_element:
            return f"{start_tag[:-1].rstrip()}/>"
        return f"{start_tag}{end_tag}"

    if current_depth >= max_depth:
        # Depth limit reached: serialize the whole subtree on one line.
        inner = " ".join(
            c.output_ready().strip() if isinstance(c, NavigableString) else str(c)
            for c in tag.contents
        )
        return f"{start_tag} {sub_place_pattern.sub(' ', inner)} {end_tag}".strip()

    child_indent = indent * (current_depth + 1)
    child_lines = []
    for child in tag.contents:
        if isinstance(child, Tag):
            child_lines.append(child_indent + format_tag(child, current_depth + 1, max_depth, indent))
        elif isinstance(child, NavigableString):
            # Collapse internal newlines so one text node stays on one line.
            text = sub_place_pattern.sub(' ', child.output_ready()).strip()
            if text:
                child_lines.append(child_indent + text)

    formatted = [start_tag] + child_lines + [indent * current_depth + end_tag]
    return "\n".join(formatted)

def format_html_single_line_blocks(html_str: str, max_depth: int = 1) -> str:
    """Parse ``html_str`` and re-emit it with one tag per line.

    Every top-level node is rendered in document order.  Tags go through
    ``format_tag`` starting at depth 0; top-level doctype, comment and text
    nodes are kept as single lines instead of being dropped.

    Args:
        html_str: HTML source to format; parsed with ``html.parser``.
        max_depth: Depth down to which children are expanded onto their own
            lines; ``0`` keeps each top-level tag on a single line.

    Returns:
        The formatted markup, top-level blocks separated by newlines.  An
        input without any nodes yields an empty string.

    Note:
        The final regex passes strip spaces next to ``<`` and ``>``
        everywhere on a line, so a space between inline text and an adjacent
        tag on the same line is removed as well.
    """
    soup = BeautifulSoup(html_str, "html.parser")
    blocks = []
    for child in soup.contents:
        if isinstance(child, Tag):
            blocks.append(format_tag(child, 0, max_depth))
        elif isinstance(child, NavigableString):
            # output_ready() keeps the <!DOCTYPE ...> / <!-- ... --> wrappers
            # and re-escapes entities in plain text.
            text = sub_place_pattern.sub(' ', child.output_ready()).strip()
            if text:
                blocks.append(text)
    result = "\n".join(blocks)
    # Remove spaces just inside the angle brackets of a tag.
    result = re.sub(r' +>', ">", result)
    result = re.sub(r'< +', "<", result)
    # Remove spaces where adjacent tags (or text and a tag) meet on one line.
    result = re.sub(r'> +', ">", result)
    result = re.sub(r'(?<=\S) +<', "<", result)

    return result

测试

# Sample document for the demo below: multi-line attribute values, nested
# lists, self-closing elements, character entities and non-ASCII text.
html = '''
<section id="main" data-role="page"
         class="container-fluid"
         style="
           background: #f0f0f0;
           padding: 20px;
           font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
         ">

  <header class="header-area"
          style="
            text-align: center;
            border-bottom: 2px solid #ccc;
            margin-bottom: 15px;
          ">
    <h1 style="font-size: 32px; color: #336699;">🔥 多语言支持 & Complex 标题 &nbsp;💡</h1>
    <p style="
      font-size: 14px;
      color: gray;
      margin-top: -10px;
    ">Welcome to the <strong>HTML Formatter</strong> demo!</p>
  </header>

  <div class="row content-area" style="display: flex; flex-direction: row;">
    <div class="col left" style="width: 50%; padding: 10px;">
      <p data-id="left-paragraph" style="line-height: 1.5em; color: #444;">
        This is a <em>sample</em> paragraph with <a href="https://example.com" target="_blank">a link</a>
        and a <span style="color: red;">red text</span> inside.
      </p>

      <ul class="list-group"
          style="
            list-style: none;
            padding-left: 0;
          ">
        <li class="list-item">Item 1</li>
        <li class="list-item">Item 2 with <strong>bold</strong> text</li>
        <li class="list-item">
          Item 3 with nested list:
          <ul>
            <li>Subitem A</li>
            <li>Subitem B</li>
          </ul>
        </li>
      </ul>
    </div>

    <div class="col right" style="width: 50%; padding: 10px;">
      <img src="image.jpg" alt="Example Image"
           style="
             width: 100%;
             border-radius: 8px;
             box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
           " />
      <form method="POST" action="/submit" style="margin-top: 20px;">
        <input type="text" name="username" placeholder="Enter username"
               style="width: 100%; padding: 8px;" />
        <input type="submit" value="Submit" style="margin-top: 10px;" />
      </form>
    </div>
  </div>

  <footer style="
    margin-top: 40px;
    text-align: center;
    font-size: 12px;
    color: #999;
  ">
    &copy; 2025 Web Formatter Inc. All rights reserved.
  </footer>
</section>
'''

# Demo run. NOTE: the banner labels are nominal -- the calls actually pass
# max_depth=0 (each top-level tag collapsed onto one line) and
# max_depth=1000 (deep enough to expand every level of the sample).
print("---- depth = 1 ----")
print(format_html_single_line_blocks(html, max_depth=0))
print("---- depth = 2 ----")
print(format_html_single_line_blocks(html, max_depth=1000))
---- depth = 1 ----
<section id="main" data-role="page" class="container-fluid" style="background: #f0f0f0; padding: 20px; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;"><header class="header-area" style=" text-align: center; border-bottom: 2px solid #ccc; margin-bottom: 15px; "><h1 style="font-size: 32px; color: #336699;">🔥 多语言支持 &amp; Complex 标题 💡</h1><p style=" font-size: 14px; color: gray; margin-top: -10px; ">Welcome to the<strong>HTML Formatter</strong>demo!</p></header><div class="row content-area" style="display: flex; flex-direction: row;"><div class="col left" style="width: 50%; padding: 10px;"><p data-id="left-paragraph" style="line-height: 1.5em; color: #444;">This is a<em>sample</em>paragraph with<a href="https://example.com" target="_blank">a link</a>and a<span style="color: red;">red text</span>inside.</p><ul class="list-group" style=" list-style: none; padding-left: 0; "><li class="list-item">Item 1</li><li class="list-item">Item 2 with<strong>bold</strong>text</li><li class="list-item">Item 3 with nested list:<ul><li>Subitem A</li><li>Subitem B</li></ul></li></ul></div><div class="col right" style="width: 50%; padding: 10px;"><img alt="Example Image" src="image.jpg" style=" width: 100%; border-radius: 8px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); "/><form action="/submit" method="POST" style="margin-top: 20px;"><input name="username" placeholder="Enter username" style="width: 100%; padding: 8px;" type="text"/><input style="margin-top: 10px;" type="submit" value="Submit"/></form></div></div><footer style=" margin-top: 40px; text-align: center; font-size: 12px; color: #999; ">© 2025 Web Formatter Inc. All rights reserved.</footer></section>
---- depth = 2 ----
<section id="main" data-role="page" class="container-fluid" style="background: #f0f0f0; padding: 20px; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
  <header class="header-area" style="text-align: center; border-bottom: 2px solid #ccc; margin-bottom: 15px;">
    <h1 style="font-size: 32px; color: #336699;">🔥 多语言支持 &amp; Complex 标题 💡</h1>
    <p style="font-size: 14px; color: gray; margin-top: -10px;">
      Welcome to the
      <strong>HTML Formatter</strong>
      demo!
    </p>
  </header>
  <div class="row content-area" style="display: flex; flex-direction: row;">
    <div class="col left" style="width: 50%; padding: 10px;">
      <p data-id="left-paragraph" style="line-height: 1.5em; color: #444;">
        This is a
        <em>sample</em>
        paragraph with
        <a href="https://example.com" target="_blank">a link</a>
        and a
        <span style="color: red;">red text</span>
        inside.
      </p>
      <ul class="list-group" style="list-style: none; padding-left: 0;">
        <li class="list-item">Item 1</li>
        <li class="list-item">
          Item 2 with
          <strong>bold</strong>
          text
        </li>
        <li class="list-item">
          Item 3 with nested list:
          <ul>
            <li>Subitem A</li>
            <li>Subitem B</li>
          </ul>
        </li>
      </ul>
    </div>
    <div class="col right" style="width: 50%; padding: 10px;">
      <img alt="Example Image" src="image.jpg" style=" width: 100%; border-radius: 8px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); "/>
      <form method="POST" action="/submit" style="margin-top: 20px;">
        <input name="username" placeholder="Enter username" style="width: 100%; padding: 8px;" type="text"/>
        <input style="margin-top: 10px;" type="submit" value="Submit"/>
      </form>
    </div>
  </div>
  <footer style=" margin-top: 40px; text-align: center; font-size: 12px; color: #999; ">© 2025 Web Formatter Inc. All rights reserved.</footer>
</section>
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

放飞自我的Coder

你的鼓励很棒棒哦~

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值