活动介绍
file-type

XPath详解:XML文档的强大查询语言

下载需积分: 50 | 139KB | 更新于2024-07-29 | 157 浏览量 | 0 下载量 举报 收藏
download 立即下载
XPath参考手册.doc 是一本详细的指南,旨在教授读者如何在XML文档中高效地查找和操作数据。XPath是一种强大的语言,它允许用户通过元素和属性的路径表达式在XML文档中导航,类似于文件系统的目录结构。这门语言是W3C标准的一部分,与XSLT(可扩展样式表语言转换)密切相关,也是XQuery和XPointer的基础。 该手册分为几个核心部分: 1. **XPath简介**:介绍了XPath的基本概念,它作为一种查询语言,用于在XML文档中定位特定的信息,通过路径表达式实现节点的选择和操作。 2. **XPath节点**:详细解释了XPath中的各种节点类型,如元素、属性、文本、命名空间等,以及它们之间的关系,这对于理解XPath如何在文档中定位元素至关重要。 3. **XPath语法**:深入解析XPath的语法结构,包括选择器、定位步骤、过滤器和组合表达式等,这些都是编写有效XPath查询的基础。 4. **XPath轴**:讲解XPath轴的概念,轴定义了相对于当前(上下文)节点的节点集,如child(子节点)、descendant(后代节点)、ancestor(祖先节点)等。 5. **XPath运算符**:列出了XPath中的一系列运算符,如等于、大于、小于、包含等,这些运算符用于构建更复杂的查询条件。 6. **XPath实例**:通过实际的“books.xml”文档,展示如何应用XPath解决实际问题,帮助读者理解和掌握XPath在实际工作中的应用。 7. **XPath函数**:介绍了XPath 2.0、XQuery 1.0和XSLT 2.0中的内置函数,这些函数提供了丰富的数据处理和转换能力,是XPath功能的强大补充。 8. **前置知识**:建议读者在开始学习XPath前掌握HTML/XHTML和XML的基本知识,特别是XML命名空间,这对于理解XPath在XML环境中的作用至关重要。 9. **XPath在XSLT中的使用**:强调XPath在XSLT文档开发中的核心地位,指出没有XPath知识将难以创建有效的XSLT样式表。 通过阅读这本参考手册,读者不仅可以学习XPath的基本概念和语法,还能深入理解其在XML处理和转换过程中的重要作用,从而提升在Web开发和数据处理中的技能。后续的学习路径可能包括进一步探索XSLT和相关的XML技术。

相关推荐

filetype

"""Split a Word (.docx) document into one file per chapter.

Why the previous revision lost every picture and table in its output:

* Picture-only paragraphs have no text, so the "ignore empty paragraph"
  rule deleted them before any copying took place.
* Tables were "copied" by rebinding ``new_table._element``; that only
  changes a Python attribute, the XML tree of the target document still
  held the empty 1x1 placeholder table.
* Pictures were copied as bare ``wp:inline`` elements (without their
  ``w:drawing`` wrapper) and the ``r:embed`` relationship ids were never
  recreated in the target package, so the pictures pointed at nothing.
* The re-pack step removed the target's own ``word/media`` entries and
  injected the source media without relationships or content types.

This revision deep-copies the original ``w:p`` / ``w:tbl`` elements and
re-creates every relationship they reference in the target document.
"""

import copy
import os
import re
import shutil
import subprocess
import sys
import tempfile
import zipfile
from typing import Dict, List, Tuple

import tkinter as tk
from tkinter import filedialog, messagebox

from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls, qn
from docx.oxml.shape import CT_Picture
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.shared import Pt

# Clark-notation prefix shared by every relationship-id attribute
# (r:id, r:embed, r:link, r:pict, ...).
_REL_ATTR_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}"

# Elements that make a paragraph worth keeping even when it has no text.
_GRAPHIC_XPATH = './/w:drawing | .//w:pict | .//w:object'

_CHINESE_DIGITS = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
                   '六': 6, '七': 7, '八': 8, '九': 9}


class WordProcessor:
    """Clean a .docx file and split it into per-chapter documents."""

    def __init__(self):
        self.input_path = ""
        self.output_dir = ""
        self.full_titles = []
        self.current_title_path = []
        self.ignore_titles = ["目录", "目 录", "contents", "Contents"]
        self.chapter_counter = 0
        self.image_counter = 1
        self.media_files = {}     # raw bytes of word/media/* keyed by archive path
        self.base_filename = ""   # input file name without extension
        self.source_doc = None    # python-docx Document of the input file

    # ------------------------------------------------------------------
    # File / directory selection
    # ------------------------------------------------------------------
    def select_input_file(self):
        """Ask the user for the input .docx and load it.

        Returns the chosen path, or a falsy value when the dialog is
        cancelled.  Raises whatever ``zipfile`` / ``Document`` raise when
        the chosen file is not a readable .docx package.
        """
        root = tk.Tk()
        root.withdraw()
        try:
            file_path = filedialog.askopenfilename(
                title="选择要处理的Word文档",
                filetypes=[("Word文档", "*.docx"), ("所有文件", "*.*")]
            )
        finally:
            # The helper root is only needed while the dialog is open;
            # leaving it alive leaks one hidden Tk instance per click.
            root.destroy()
        if file_path:
            self.input_path = file_path
            self.base_filename = os.path.splitext(os.path.basename(file_path))[0]
            self._extract_media_files()
            self.source_doc = Document(file_path)
        return file_path

    def select_output_dir(self):
        """Ask the user for the output directory and remember it."""
        root = tk.Tk()
        root.withdraw()
        try:
            dir_path = filedialog.askdirectory(title="选择输出目录")
        finally:
            root.destroy()
        if dir_path:
            self.output_dir = dir_path
        return dir_path

    def _extract_media_files(self):
        """Read every ``word/media/*`` entry of the input file into memory."""
        self.media_files = {}
        with zipfile.ZipFile(self.input_path) as archive:
            for name in archive.namelist():
                if name.startswith('word/media/'):
                    self.media_files[name] = archive.read(name)

    # ------------------------------------------------------------------
    # Paragraph classification
    # ------------------------------------------------------------------
    @staticmethod
    def _has_graphics(paragraph):
        """Return True when the paragraph holds a picture or embedded object."""
        return bool(paragraph._element.xpath(_GRAPHIC_XPATH))

    def is_title(self, paragraph, level=1):
        """Return True when *paragraph* looks like a heading of *level*.

        Three heuristics are tried in order: the style name, then (level 1
        only) a bold 16pt first run, then (level 1 only) a "第X章" prefix.
        """
        style_name = paragraph.style.name or ""
        if style_name.startswith(f'Heading {level}'):
            return True
        if level != 1:
            return False
        if paragraph.runs:
            first_run = paragraph.runs[0]
            if first_run.bold and first_run.font.size == Pt(16):
                return True
        return bool(re.match(r'^第[一二三四五六七八九十]+章', paragraph.text.strip()))

    def should_ignore_paragraph(self, paragraph):
        """Return True for empty, table-of-contents or bare-number paragraphs.

        A paragraph that carries a picture/object is never ignored even
        though its text is empty; dropping those was what removed every
        image from the output.
        """
        if self._has_graphics(paragraph):
            return False
        text = paragraph.text.strip()
        return bool(not text
                    or text in self.ignore_titles
                    or re.match(r'^\d+$', text))

    def is_useless_image(self, paragraph):
        """Return True when the paragraph text mentions "logo"."""
        return "logo" in paragraph.text.lower()

    def is_useless_text(self, paragraph):
        """Return True when the whole paragraph is a standard number such as "GB/T 1234-2020"."""
        pattern = r'[A-Z]{1,5}(\s*[/-]\s*[A-Z]{1,5})*\s*\d+(-\d+)*'
        return bool(re.fullmatch(pattern, paragraph.text.strip()))

    # ------------------------------------------------------------------
    # Cleaning
    # ------------------------------------------------------------------
    def clean_document(self, doc):
        """Remove ignorable body paragraphs and standard numbers in headers/footers.

        The document is modified in place and also returned.
        """
        for paragraph in list(doc.paragraphs):
            if (self.should_ignore_paragraph(paragraph)
                    or self.is_useless_image(paragraph)
                    or self.is_useless_text(paragraph)):
                self._remove_element(paragraph._element)

        for section in doc.sections:
            for story in (section.header, section.footer):
                for paragraph in list(story.paragraphs):
                    if self.is_useless_text(paragraph):
                        self._remove_element(paragraph._element)
        return doc

    def _remove_element(self, element):
        """Detach *element* from its parent, if it has one."""
        if element is None:
            return
        parent = element.getparent()
        if parent is not None:
            parent.remove(element)

    # ------------------------------------------------------------------
    # Continuation tables
    # ------------------------------------------------------------------
    @staticmethod
    def _first_cell_text(table):
        """Return the stripped, lower-cased text of cell (0, 0), or "" if absent."""
        try:
            return table.rows[0].cells[0].text.strip().lower()
        except IndexError:
            return ""

    def process_tables(self, doc):
        """Merge every "续表"/"continued" table into the table before it.

        ``doc.tables`` is a snapshot list, so it is walked exactly once;
        the earlier index-juggling loop re-visited the already removed
        table forever.
        """
        previous = None
        for table in list(doc.tables):
            label = self._first_cell_text(table)
            is_continuation = "续表" in label or "continued" in label
            if is_continuation and previous is not None:
                self._merge_tables(previous, table)
                self._remove_element(table._element)
                continue
            previous = table
        return doc

    def _merge_tables(self, main_table, continued_table):
        """Append the rows of *continued_table* to *main_table*.

        Rows are deep-copied as XML so cell formatting, merged cells and
        pictures survive.  The first row is skipped when its first cell
        contains nothing but the continuation label.
        """
        rows = list(continued_table.rows)
        if not rows:
            return
        label = self._first_cell_text(continued_table)
        start_row = 1 if label in ["续表", "continued"] else 0
        for row in rows[start_row:]:
            main_table._tbl.append(copy.deepcopy(row._tr))

    # ------------------------------------------------------------------
    # Splitting
    # ------------------------------------------------------------------
    def split_by_chapters(self):
        """Split the source document at level-1 headings.

        Returns a list of ``(title, Document)`` tuples.  Content that
        precedes the first heading is not copied anywhere.
        """
        if self.source_doc is None:
            self.source_doc = Document(self.input_path)
        doc = self.process_tables(self.clean_document(self.source_doc))
        source_part = doc.part
        self.chapter_counter = 0

        # One pass to map XML elements to Paragraph proxies instead of a
        # linear search per element.
        paragraph_of = {p._element: p for p in doc.paragraphs}
        paragraph_tag = qn('w:p')
        table_tag = qn('w:tbl')

        chapters = []
        current_doc = None
        current_title = None

        for element in doc.element.body.iterchildren():
            if element.tag == paragraph_tag:
                paragraph = paragraph_of.get(element)
                if paragraph is None or self.should_ignore_paragraph(paragraph):
                    continue
                if self.is_title(paragraph, level=1):
                    if current_doc is not None:
                        chapters.append((current_title, current_doc))
                    current_title = self._format_chapter_title(paragraph.text)
                    current_doc = Document()
                    self._copy_core_styles(doc, current_doc)
                    current_doc.add_heading(current_title, level=1)
                    self.current_title_path = [current_title]
                    self.chapter_counter += 1
                    continue
                if current_doc is not None:
                    self._copy_paragraph(current_doc, paragraph)
            elif element.tag == table_tag:
                if current_doc is not None:
                    self._copy_table(current_doc, element, source_part)

        if current_doc is not None:
            chapters.append((current_title, current_doc))
        return chapters

    def _copy_core_styles(self, source_doc, target_doc):
        """Copy font name and size of the 'Normal' style to *target_doc*."""
        source_font = source_doc.styles['Normal'].font
        target_font = target_doc.styles['Normal'].font
        target_font.name = source_font.name
        target_font.size = source_font.size

    def _get_paragraph(self, doc, element):
        """Return the Paragraph of *doc* wrapping *element*, or None."""
        for paragraph in doc.paragraphs:
            if paragraph._element is element:
                return paragraph
        return None

    def _format_chapter_title(self, title):
        """Strip *title* and drop anything in front of a "第X章" marker."""
        title = title.strip()
        if not re.match(r'^第[一二三四五六七八九十]+章', title):
            match = re.search(r'(第[一二三四五六七八九十]+章\s*.+)', title)
            if match:
                title = match.group(1)
        return title

    # ------------------------------------------------------------------
    # Element copying
    # ------------------------------------------------------------------
    @staticmethod
    def _append_block(target_doc, element):
        """Insert *element* at the end of the body, before the final w:sectPr."""
        body = target_doc.element.body
        sect_pr = body.find(qn('w:sectPr'))
        if sect_pr is None:
            body.append(element)
        else:
            sect_pr.addprevious(element)

    @staticmethod
    def _remap_relationships(element, source_part, target_part):
        """Re-create in *target_part* every relationship *element* refers to.

        Copied XML still carries the relationship ids of the source
        package.  Each such id is looked up in the source part, an
        equivalent relationship is added to the target part, and the
        attribute is rewritten to the new id.  Ids unknown to the source
        part are left untouched.
        """
        if source_part is None or target_part is None or source_part is target_part:
            return
        for node in element.iter('*'):
            for attr_name, rel_id in list(node.attrib.items()):
                if not attr_name.startswith(_REL_ATTR_NS):
                    continue
                rel = source_part.rels.get(rel_id)
                if rel is None:
                    continue
                if rel.is_external:
                    new_id = target_part.relate_to(
                        rel.target_ref, rel.reltype, is_external=True)
                else:
                    # Relating the source's part object makes python-docx
                    # write that part (e.g. the image) into the new file.
                    new_id = target_part.relate_to(rel.target_part, rel.reltype)
                node.set(attr_name, new_id)

    def _copy_paragraph(self, target_doc, source_paragraph):
        """Append a full copy of *source_paragraph* (runs, formatting, pictures)."""
        clone = copy.deepcopy(source_paragraph._element)
        self._append_block(target_doc, clone)
        self._remap_relationships(clone, source_paragraph.part, target_doc.part)

    def _copy_run_image(self, new_run, source_run):
        """Copy every ``w:drawing`` of *source_run* into *new_run*."""
        if new_run._element is source_run._element:
            return  # copying a run onto itself would only duplicate pictures
        for drawing in source_run._element.xpath('.//w:drawing'):
            clone = copy.deepcopy(drawing)
            new_run._element.append(clone)
            self._remap_relationships(clone, source_run.part, new_run.part)

    def _copy_table(self, target_doc, table_element, source_part=None):
        """Append a full copy of the ``w:tbl`` *table_element* to *target_doc*.

        *source_part* is the part owning *table_element*; it defaults to
        the main part of ``self.source_doc`` when that document is loaded.
        """
        if source_part is None and self.source_doc is not None:
            source_part = self.source_doc.part
        clone = copy.deepcopy(table_element)
        self._append_block(target_doc, clone)
        self._remap_relationships(clone, source_part, target_doc.part)

    def _copy_image(self, target_doc, image_element, source_part=None):
        """Append a paragraph holding a copy of a stand-alone picture element.

        A bare ``wp:inline`` / ``wp:anchor`` is wrapped in ``w:drawing``
        first, because a run may not contain it directly.
        """
        if source_part is None and self.source_doc is not None:
            source_part = self.source_doc.part
        clone = copy.deepcopy(image_element)
        if clone.tag != qn('w:drawing'):
            wrapper = parse_xml('<w:drawing %s/>' % nsdecls('w'))
            wrapper.append(clone)
            clone = wrapper
        run = target_doc.add_paragraph().add_run()
        run._element.append(clone)
        self._remap_relationships(clone, source_part, target_doc.part)

    # ------------------------------------------------------------------
    # Saving
    # ------------------------------------------------------------------
    def save_chapters(self, chapters):
        """Write each chapter to ``<base>-<title>.docx`` in the output directory.

        Returns the list of written file paths.  Characters that are not
        allowed in file names are replaced by "_"; a repeated title gets a
        numeric suffix instead of overwriting the earlier file.
        """
        os.makedirs(self.output_dir, exist_ok=True)

        saved_files = []
        for title, chapter_doc in chapters:
            safe_title = re.sub(r'[\\/*?:"<>|\r\n\t]', "_", title)
            stem = f"{self.base_filename}-{safe_title}"
            filepath = os.path.join(self.output_dir, f"{stem}.docx")
            suffix = 2
            while filepath in saved_files:
                filepath = os.path.join(self.output_dir, f"{stem}({suffix}).docx")
                suffix += 1
            # Pictures are attached through real relationships, so a plain
            # save produces a complete package; no re-packing is needed.
            chapter_doc.save(filepath)
            saved_files.append(filepath)
        return saved_files

    def _repack_docx_with_images(self, src_path, dest_path):
        """Copy *src_path* to *dest_path* unchanged.

        Kept for callers of the old API.  Injecting the source media into
        the archive is no longer done: entries without a relationship and
        content type do not make pictures appear.
        """
        shutil.copyfile(src_path, dest_path)

    def _extract_chapter_number(self, title):
        """Return the chapter number in "第X章" as a decimal string, or None.

        Handles 一 to 九十九 (e.g. "十" -> "10", "二十三" -> "23").
        """
        match = re.search(r'第([一二三四五六七八九十]+)章', title)
        if not match:
            return None
        numeral = match.group(1)
        if '十' not in numeral:
            value = _CHINESE_DIGITS.get(numeral)
            return None if value is None else str(value)
        tens_text, _, ones_text = numeral.partition('十')
        tens = _CHINESE_DIGITS.get(tens_text) if tens_text else 1
        ones = _CHINESE_DIGITS.get(ones_text) if ones_text else 0
        if tens is None or ones is None:
            return None
        return str(tens * 10 + ones)

    def process_document(self):
        """Run the whole pipeline and return the list of written files.

        Raises FileNotFoundError for a missing input file, ValueError for
        a missing output directory and RuntimeError (chained to the
        original error) for any failure while processing.
        """
        if not self.input_path or not os.path.exists(self.input_path):
            raise FileNotFoundError("输入文件路径无效或文件不存在")
        if not self.output_dir:
            raise ValueError("输出目录未指定")

        try:
            # Always reload: the path may have been typed in by hand, and a
            # previously loaded document has already been cleaned in place.
            self.base_filename = os.path.splitext(
                os.path.basename(self.input_path))[0]
            self.source_doc = Document(self.input_path)
            chapters = self.split_by_chapters()
            return self.save_chapters(chapters)
        except Exception as e:
            raise RuntimeError(f"处理文档时出错: {str(e)}") from e


def _open_directory(path):
    """Open *path* in the platform's file manager without going through a shell."""
    if sys.platform.startswith('win'):
        os.startfile(path)
    elif sys.platform == 'darwin':
        subprocess.Popen(['open', path])
    else:
        subprocess.Popen(['xdg-open', path])


def main():
    """Build and run the tkinter front end."""
    try:
        processor = WordProcessor()

        root = tk.Tk()
        root.title("Word文档处理工具 v3.2")
        root.geometry("650x450")

        tk.Label(root, text="Word文档高级处理工具", font=("Arial", 16)).pack(pady=10)

        def browse_into(entry, chooser):
            """Run *chooser* and put its result into *entry*, replacing old text."""
            try:
                chosen = chooser()
            except Exception as exc:
                messagebox.showerror("错误", f"无法打开所选文件: {str(exc)}")
                return
            if chosen:
                entry.delete(0, tk.END)
                entry.insert(0, chosen)

        # Input file row
        input_frame = tk.Frame(root)
        input_frame.pack(pady=5, fill=tk.X, padx=20)
        tk.Label(input_frame, text="输入文件:").pack(side=tk.LEFT)
        input_entry = tk.Entry(input_frame, width=45)
        input_entry.pack(side=tk.LEFT, padx=5, expand=True, fill=tk.X)
        tk.Button(input_frame, text="浏览...",
                  command=lambda: browse_into(input_entry, processor.select_input_file)
                  ).pack(side=tk.LEFT)

        # Output directory row
        output_frame = tk.Frame(root)
        output_frame.pack(pady=5, fill=tk.X, padx=20)
        tk.Label(output_frame, text="输出目录:").pack(side=tk.LEFT)
        output_entry = tk.Entry(output_frame, width=45)
        output_entry.pack(side=tk.LEFT, padx=5, expand=True, fill=tk.X)
        tk.Button(output_frame, text="浏览...",
                  command=lambda: browse_into(output_entry, processor.select_output_dir)
                  ).pack(side=tk.LEFT)

        def on_process():
            processor.input_path = input_entry.get().strip()
            processor.output_dir = output_entry.get().strip()

            if not processor.input_path:
                messagebox.showerror("错误", "请先选择输入文件")
                return
            if not processor.output_dir:
                messagebox.showerror("错误", "请先选择输出目录")
                return

            try:
                saved_files = processor.process_document()
            except Exception as e:
                messagebox.showerror("错误", f"处理失败: {str(e)}")
                return

            if not saved_files:
                messagebox.showwarning("提示", "未识别到任何章节标题,未生成文件")
                return

            messagebox.showinfo(
                "成功",
                f"处理完成! 共生成 {len(saved_files)} 个子文档。\n"
                f"输出目录: {processor.output_dir}\n"
                f"第一个文件: {os.path.basename(saved_files[0])}")

            try:
                _open_directory(processor.output_dir)
            except OSError:
                # Failing to open the folder must not be reported as a
                # processing failure; the files are already written.
                pass

        process_btn = tk.Button(root, text="开始处理", command=on_process,
                                height=2, width=20, bg="#4CAF50", fg="white")
        process_btn.pack(pady=20)

        # Help text
        info_frame = tk.Frame(root, borderwidth=1, relief="solid", padx=10, pady=10)
        info_frame.pack(pady=10, padx=20, fill=tk.BOTH, expand=True)
        tk.Label(info_frame, text="功能说明:", font=("Arial", 10, "bold")).pack(anchor="w")
        tk.Label(info_frame,
                 text="1. 按章节拆分文档,保留原格式\n"
                      "2. 完整保留所有表格和图片\n"
                      "3. 自动处理续表合并\n"
                      "4. 清理无用内容(logo、标准号等)\n"
                      "5. 生成文件名格式: 原文件名-章节标题",
                 justify=tk.LEFT, anchor="w").pack(fill=tk.X)
        tk.Label(info_frame,
                 text="输出示例:\n"
                      "高速公路清障施救标准化手册0901终-第一章 总则.docx\n"
                      "高速公路清障施救标准化手册0901终-第二章 清障施救标准.docx",
                 justify=tk.LEFT, anchor="w", fg="blue").pack(fill=tk.X, pady=(5, 0))

        root.mainloop()
    except Exception as e:
        messagebox.showerror("系统错误", f"程序发生错误: {str(e)}")


if __name__ == "__main__":
    main()

# Question posted with the original code (translated): "When I run the code
# above, all pictures and tables are missing from the split documents. What
# causes this and how should I change it?"  The causes and the changes made
# are summarised in the module docstring at the top of this script.

shihuaguo
  • 粉丝: 6
上传资源 快速赚钱