import os
import sys
import win32com.client
import time
from pathlib import Path
def _find_convertible_files(source_dir, extensions):
    """Return paths under *source_dir* (recursively) whose suffix is in *extensions*."""
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(source_dir)
        for name in names
        if os.path.splitext(name)[1].lower() in extensions
    ]


def _unique_docx_path(file_path):
    """Return a ``.docx`` path next to *file_path* that does not exist yet.

    The plain ``<stem>.docx`` name is preferred. If it is taken, a Unix
    timestamp is appended, and if that is taken too (e.g. ``a.doc`` and
    ``a.wps`` handled within the same second) a running counter is added.
    """
    stem = os.path.splitext(file_path)[0]
    candidate = stem + ".docx"
    if not os.path.exists(candidate):
        return candidate
    timestamp = int(time.time())
    candidate = f"{stem}_{timestamp}.docx"
    counter = 1
    while os.path.exists(candidate):
        candidate = f"{stem}_{timestamp}_{counter}.docx"
        counter += 1
    return candidate


def _save_as_docx(word, file_path):
    """Open *file_path* in the given Word instance, save it as .docx, return the new path."""
    output_path = _unique_docx_path(file_path)
    doc = word.Documents.Open(file_path)
    try:
        # 16 is passed as the FileFormat argument (Word's default .docx format).
        doc.SaveAs(output_path, 16)
    finally:
        # Always release the document, even when SaveAs fails; 0 is passed as
        # SaveChanges so Word does not try to write back to the source file.
        doc.Close(0)
    return output_path


def convert_to_docx(source_dir):
    """Convert every .wps and .doc file under *source_dir* to .docx via Word.

    The directory is searched recursively. Each converted file is written next
    to its source; existing files are never overwritten (see
    ``_unique_docx_path``). Progress and a final summary are printed to stdout.
    A failure on one file is recorded and reported, and does not stop the run.

    Args:
        source_dir: Directory to search. May be relative; it is made absolute
            before any path is handed to Word.

    Returns:
        None.
    """
    file_extensions = ('.wps', '.doc')
    success_count = 0
    failed_files = []

    print(f"开始搜索目录: {source_dir}")
    # Word runs as a separate process, so hand it absolute paths rather than
    # relying on it sharing this script's working directory.
    all_files = _find_convertible_files(os.path.abspath(source_dir), file_extensions)
    total_files = len(all_files)
    print(f"找到 {total_files} 个文件需要转换")

    # Only start Word when there is actually something to convert.
    if all_files:
        word = win32com.client.Dispatch("Word.Application")
        try:
            word.Visible = False
            word.DisplayAlerts = 0
            for i, file_path in enumerate(all_files, 1):
                try:
                    print(f"[{i}/{total_files}] 正在处理: {file_path}")
                    output_path = _save_as_docx(word, file_path)
                    print(f"✓ 成功转换: {output_path}")
                    success_count += 1
                except Exception as e:
                    # Best-effort batch: record the failure and keep going.
                    print(f"✗ 转换失败: {file_path}")
                    print(f" 错误信息: {str(e)}")
                    failed_files.append((file_path, str(e)))
        finally:
            # Never leave a hidden Word process behind.
            word.Quit()

    print("\n====== 转换完成 ======")
    print(f"总文件数: {total_files}")
    print(f"成功转换: {success_count}")
    print(f"转换失败: {len(failed_files)}")
    if failed_files:
        print("\n失败文件列表:")
        for file_path, error in failed_files:
            print(f"- {file_path} (错误: {error})")
if __name__ == "__main__":
    # Script entry point: convert the directory given as the first
    # command-line argument, or the default directory when none is given.
    default_dir = r"C:\Users\SugarPPig\Desktop\DeepSeek\RAG"
    source_dir = sys.argv[1] if len(sys.argv) > 1 else default_dir
    # isdir (not exists): a regular file would pass an existence check and
    # then be silently walked as an empty tree.
    if not os.path.isdir(source_dir):
        print(f"错误: 目录不存在 - {source_dir}")
        sys.exit(1)
    print("开始将WPS和DOC文件转换为DOCX格式...")
    convert_to_docx(source_dir)
# Format-conversion script to run before uploading files to a RAG knowledge base.
# First published 2025-04-30 10:42:46.