import os
from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, as_completed, wait
from itertools import islice
from multiprocessing import Pool

import pandas as pd
from tqdm import tqdm

# NOTE(review): PdfReader, pdfplumber and pdf2csv are used below but are not
# imported in the visible part of this file -- confirm where they come from.
def mainmain(num: float) -> list:
    """Return ``[num, num + 1, num / 2]`` for one input number.

    Args:
        num: The number to expand.

    Returns:
        A three-element list: the number itself, the number plus one, and
        the number divided by two (true division, so always a float).
    """
    return [num, num + 1, num / 2]
# Demo: expand the numbers 1..100 in parallel and show the result as a table.
#
# The work is guarded by ``__main__`` because multiprocessing may re-import
# this module in every worker (the "spawn" start method); without the guard
# each worker would try to build its own pool.
#
# NOTE(review): ``mainmain`` is defined a second time further down this file
# with a different signature. Under "spawn", workers look the name up after
# importing the whole module -- confirm both snippets are meant to share a file.
if __name__ == "__main__":
    num_list = list(range(1, 101))

    # Process the numbers in parallel with a multiprocessing Pool.
    with Pool(processes=15) as pool:
        # imap yields results in input order as they become available, so the
        # progress bar follows completed work. Wrapping the *input* of
        # pool.map only tracked how fast items were handed to the pool.
        results = list(
            tqdm(
                pool.imap(mainmain, num_list),
                total=len(num_list),
                desc="Processing",
            )
        )

    # Convert the results into a DataFrame.
    df = pd.DataFrame(results, columns=['num', 'num_plus_1', 'num_half'])

    # Print the DataFrame.
    print(df)
def mainmain(file):
    """Convert one PDF into an ``.xlsx`` file stored next to it.

    The output path is the input path with its extension replaced by
    ``.xlsx``. Only the extension is swapped; a plain
    ``str.replace("pdf", "xlsx")`` would also rewrite "pdf" inside directory
    or file names. Inputs whose extension is not ``.pdf`` (case-insensitive)
    are ignored, and a PDF whose ``.xlsx`` already exists is skipped so the
    job can be re-run.

    Any exception is appended to ``error_log.txt`` in the current working
    directory as ``<file>\\t<error>`` instead of being raised, so one bad
    file does not abort a whole batch.

    Args:
        file: Path of the PDF to convert.

    Returns:
        None.
    """
    try:
        root, ext = os.path.splitext(file)
        if ext.lower() != ".pdf":
            return
        xlsx_file = root + ".xlsx"
        if not os.path.exists(xlsx_file):
            # pdf2csv must return an object exposing ``to_excel``.
            df = pdf2csv(file)
            df.to_excel(xlsx_file, index=False)
    except Exception as e:
        # 'a' appends and creates the file when it does not exist yet.
        # An explicit encoding keeps non-ASCII file names from raising inside
        # the error handler on platforms with a narrow default encoding.
        with open('error_log.txt', 'a', encoding='utf-8') as f:
            f.write(f"{file}\t{e}\n")
# Batch driver: convert every PDF found directly inside ./data/ in parallel.
if __name__ == "__main__":
    pdf_path = './data/'
    # Only regular files with a .pdf extension are dispatched. The converter
    # writes its .xlsx output into the same directory, so on a re-run those
    # outputs (and any other stray files) would otherwise be sent to the
    # workers as well.
    file_list = [
        os.path.join(pdf_path, f)
        for f in os.listdir(pdf_path)
        if f.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(pdf_path, f))
    ]
    with Pool(processes=15) as pool:
        results = pool.map(mainmain, file_list)
# Decrypt PDF (brute-force the password)
def try_decrypt_password(pdf_path, password):
    """Try a single candidate password against a PDF.

    The file is opened and parsed again on every call, so the function needs
    no shared state.

    Args:
        pdf_path: Path of the PDF file to test.
        password: Candidate password.

    Returns:
        ``password`` when ``reader.decrypt(password)`` returns a truthy
        value, otherwise ``None``.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        if reader.decrypt(password):
            return password  # decryption succeeded
    return None  # decryption failed
def pdf_decrypt(pdf_path):
    """Brute-force a six-digit numeric PDF password with a process pool.

    Every candidate from "000000" to "999999" is passed to
    ``try_decrypt_password`` in a worker process. Candidates are generated
    lazily and only a small window of tasks is in flight at any time, so the
    search does not hold one million Future objects in memory and stops
    promptly once a password is found, an error occurs, or the user
    interrupts it.

    Args:
        pdf_path: Path of the encrypted PDF.

    Returns:
        The first password reported as working, or the message string
        "没有找到有效密码" when no candidate worked or the search was
        interrupted. Callers must compare against that string; ``None`` is
        never returned.

    Raises:
        Any exception raised by ``try_decrypt_password`` in a worker
        (for example when the file cannot be opened).
    """
    max_workers = 20
    total = 1000000
    # Candidate range 000000 .. 999999, produced on demand.
    candidates = (str(i).zfill(6) for i in range(total))
    # A few queued tasks per worker keep the pool busy without a huge backlog.
    window = max_workers * 4

    # Plain local: only the parent process reads the workers' results.
    found_password = None
    pending = set()
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        try:
            with tqdm(total=total) as progress:
                for candidate in islice(candidates, window):
                    pending.add(
                        executor.submit(try_decrypt_password, pdf_path, candidate)
                    )
                while pending and found_password is None:
                    done, pending = wait(pending, return_when=FIRST_COMPLETED)
                    for future in done:
                        progress.update(1)
                        result = future.result()
                        if result is not None:
                            found_password = result
                            print(f"解密成功的密码是: {result}")
                            break
                    else:
                        # No hit in this round: refill the window with as
                        # many new candidates as tasks just finished.
                        for candidate in islice(candidates, len(done)):
                            pending.add(
                                executor.submit(
                                    try_decrypt_password, pdf_path, candidate
                                )
                            )
        except KeyboardInterrupt:
            print("\n用户中断了进程")
        finally:
            # Drop queued work so leaving the executor only waits for the
            # tasks that are already running.
            for future in pending:
                future.cancel()
    return found_password if found_password is not None else "没有找到有效密码"
# Script entry: recover the password of one PDF and print its first page.
#
# Guarded by ``__main__`` because pdf_decrypt starts worker processes; when
# workers re-import this module (the "spawn" start method) an unguarded call
# would start the brute-force search again inside every worker.
if __name__ == "__main__":
    pdf_path = "xxx.pdf"
    password = pdf_decrypt(pdf_path)
    print(password)
    # pdf_decrypt signals failure with this message string rather than None,
    # so it must not be handed to pdfplumber as if it were a password.
    if password != "没有找到有效密码":
        with pdfplumber.open(pdf_path, password=password) as pdf:
            first_page = pdf.pages[0]
            text = first_page.extract_text()
            print(text)