Python 之大文件分块读取并多线程批量处理及结果保存-CSDN博客

本文链接：https://blog.csdn.net/TomorrowAndTuture/article/details/149907761

本文对日常工作中大文件的读取、并发处理及结果保存做一个简单的备忘记录。

文件示例

import os
import threading
import csv
import traceback
import datetime
import logging
from logging import handlers


def _logging(**kwargs):
    level = kwargs.pop('level', logging.DEBUG)
    filename = kwargs.pop('filename', 'default.log')
    datefmt = kwargs.pop('datefmt', '%Y-%m-%d %H:%M:%S')
    format = kwargs.pop('format', '[%(asctime)s,%(msecs)d][%(module)s][%(levelname)s] %(lineno)d - %(message)s')
    log = logging.getLogger(filename)
    format_str = logging.Formatter(format, datefmt)

    th = handlers.TimedRotatingFileHandler(filename=filename, when='MIDNIGHT', backupCount=30, encoding="utf-8")
    th.setFormatter(format_str)
    th.setLevel(level)

    log.addHandler(th)
    log.setLevel(level)
    return log


# Module-wide logger; its file handler for "api.log" is configured by _logging().
logger = _logging(filename="api.log")

# The "in" and "out" directories sit next to this script and are created
# at import time when they do not exist yet.
root_dir = os.path.dirname(os.path.abspath(__file__))
in_path = os.path.join(root_dir, "in")
out_path = os.path.join(root_dir, "out")
os.makedirs(in_path, exist_ok=True)
os.makedirs(out_path, exist_ok=True)

thread_count = 10  # number of worker threads
process_count = 0  # progress counter: rows processed so far
lines_to_save = 100  # how many rows a thread collects between saves to the file
chunk_size = 10000  # number of lines per chunk when the file is read in chunks
file_delimiter = '\001'  # field delimiter used in the files

lock = threading.RLock()  # re-entrant lock guarding file writes and the progress counter


def test_write_input():
    """Write two sample rows into ``input.csv`` inside the input directory.

    The rows are handed to ``write_content``, so they are appended to the
    file. The ``code`` column carries the address text that the processing
    step later reads as its second field.
    """
    samples = (
        ("111", "陕西省西安市雁塔区"),
        ("222", "四川省成都市武侯区"),
    )
    rows = [{'u_id': uid, 'code': address} for uid, address in samples]
    write_content(rows, os.path.join(in_path, "input.csv"))


def read_in_pages(file, size=None):
    """Lazily yield the lines of *file* in lists of at most *size* lines.

    Args:
        file: Path of a UTF-8 text file.
        size: Number of lines per yielded list. ``None`` (the default) uses
            the module-level ``chunk_size``.

    Yields:
        Lists of raw lines (line endings kept). Every list holds exactly
        *size* lines except possibly the last one, which holds the rest.

    If the file cannot be opened or decoded, the error is logged and the
    iteration simply ends. Lines collected for the unfinished chunk are not
    yielded.
    """
    if size is None:
        size = chunk_size
    batch = []
    try:
        with open(file, encoding='utf-8') as f:
            for line in f:
                batch.append(line)
                if len(batch) == size:
                    yield batch
                    batch = []
            if batch:
                yield batch
    except (OSError, UnicodeError):
        # A generator's return value is invisible to a ``for`` loop, so the
        # failure is reported through the log rather than swallowed silently.
        logger.error(f"文件 {file} 读取报错")
        logger.error(traceback.format_exc())


def write_content(data, csv_file):
    """Append *data* to *csv_file* as delimiter-separated rows (no header row).

    Args:
        data: Iterable of dicts with the keys ``u_id`` and ``code``. An empty
            iterable still opens the file in append mode, which creates it
            when it does not exist.
        csv_file: Path of the file to append to.

    The write happens while holding the module-level lock, so rows written
    by different threads do not interleave. Errors are logged and not
    re-raised; the completion message is logged only when the write
    actually succeeded.
    """
    header = ['u_id', 'code']
    try:
        # Several threads write to the same file; serialise the writes.
        with lock:
            logger.info("处理结果保存中...")
            with open(csv_file, 'a', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, header, delimiter=file_delimiter)
                writer.writerows(data)
    except Exception:
        logger.error(f"文件 {csv_file} 保存报错")
        logger.error(traceback.format_exc())
    else:
        # Success path only: a failed write must not be reported as saved.
        logger.info("文件保存完毕！")


def split_list(lst, n):
    """Split *lst* into exactly *n* consecutive sublists of near-equal size.

    The first ``len(lst) % n`` sublists receive one extra element, so sizes
    differ by at most one and empty sublists appear only when ``lst`` has
    fewer than *n* elements. Concatenating the result gives back ``lst``.

    Example: ``split_list([1, 2, 3, 4, 5, 6], 3)`` returns
    ``[[1, 2], [3, 4], [5, 6]]``.

    Args:
        lst: The sequence to split (anything supporting ``len`` and slicing).
        n: Number of sublists to produce; must be positive.

    Returns:
        A list of *n* slices of ``lst``.

    Raises:
        ValueError: If *n* is not positive.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    base, extra = divmod(len(lst), n)
    result = []
    start = 0
    for i in range(n):
        # The remainder is spread over the leading sublists, one element each.
        end = start + base + (1 if i < extra else 0)
        result.append(lst[start:end])
        start = end
    return result


def call_model(address):
    """Process a single address and return its code.

    This is the customisation point for per-row processing; the placeholder
    implementation returns the first six characters of *address*.

    Args:
        address: The address text of one row.

    Returns:
        The computed code, or ``""`` when processing raised an ``Exception``
        (the traceback is logged). ``KeyboardInterrupt`` and ``SystemExit``
        are not intercepted.
    """
    try:
        # do something with address, and return address code
        return address[:6]
    except Exception:
        # No ``return`` inside ``finally`` here: that would also discard
        # KeyboardInterrupt/SystemExit and hide them from the caller.
        logger.error(traceback.format_exc())
        return ""


def batch_call_model(contents, total_count, csv_file):
    """Process a batch of raw lines and append the results to *csv_file*.

    Runs as the target of a worker thread. Each line is expected to hold
    exactly two fields separated by ``file_delimiter``: a user id and an
    address. The address is stripped, passed to ``call_model``, and the
    resulting ``{"u_id", "code"}`` row is buffered; the buffer is flushed
    through ``write_content`` every ``lines_to_save`` rows and once more at
    the end if anything is left.

    Args:
        contents: Iterable of raw text lines.
        total_count: Denominator shown in the progress log message.
        csv_file: Path of the output file.

    Lines that do not split into exactly two fields (blank lines, for
    example) are logged and skipped so that one bad line cannot abort the
    whole batch. They still count towards the shared progress counter.
    """
    global process_count
    output_data = []
    for content in contents:
        with lock:
            # The counter is shared by all worker threads.
            process_count += 1
            logger.info(f"数据处理进度：{process_count}/{total_count}")
        # Split according to the actual file format to obtain the fields.
        fields = content.split(file_delimiter)
        if len(fields) != 2:
            logger.error(f"数据格式错误，已跳过：{content!r}")
            continue
        u_id, address = fields
        code = call_model(address.strip())
        output_data.append({"u_id": u_id, "code": code})
        if len(output_data) >= lines_to_save:
            write_content(output_data, csv_file)
            output_data = []
    if output_data:
        write_content(output_data, csv_file)


def process():
    """Run the whole job: read the input file in chunks and process each chunk with threads.

    Steps:
        1. Reset the shared progress counter.
        2. Pick the first regular file found in ``in_path``; log and return
           when there is none.
        3. Remove a previous ``output.csv`` in ``out_path`` and create an
           empty one.
        4. For every chunk produced by ``read_in_pages``, split it into
           ``thread_count`` parts, run ``batch_call_model`` on each non-empty
           part in its own thread and wait for all of them before reading
           the next chunk.

    The total passed to the workers is the number of lines read so far, so
    the denominator of the progress message grows with every chunk.

    Any exception is logged with its traceback and not re-raised; the end
    marker is always logged.
    """
    global process_count
    logger.info("=============== start process ==================")
    try:
        # Reset the shared progress counter for this run.
        process_count = 0
        # Sub-directories cannot be read as input, so only regular files count.
        file_list = [
            name for name in os.listdir(in_path)
            if os.path.isfile(os.path.join(in_path, name))
        ]
        if not file_list:
            logger.info("输入文件夹为空，任务结束！")
            return
        # Only one input file is expected, hence the first entry.
        file_path = os.path.join(in_path, file_list[0])
        iter_result = read_in_pages(file_path)

        # Start from a clean output file.
        csv_file = os.path.join(out_path, "output.csv")
        if os.path.exists(csv_file):
            logger.info(f"{csv_file} 文件删除！")
            try:
                os.remove(csv_file)
            except OSError:
                # Keep going, but make it visible that the old rows remain
                # and the new results will be appended after them.
                logger.error(f"{csv_file} 文件删除失败，新结果将追加到旧文件之后")
                logger.error(traceback.format_exc())
        write_content([], csv_file)

        total_count = 0
        start = datetime.datetime.now()
        logger.info(f"start: {start}")
        for chunk_no, item in enumerate(iter_result, start=1):
            total_count += len(item)
            logger.info(f"当前处理第 {chunk_no} 个分片， 分片大小为 {len(item)}，截止目前分片数据总数 {total_count} 条")
            # One thread per non-empty part; empty parts have nothing to do.
            thread_list = [
                threading.Thread(
                    target=batch_call_model,
                    args=(contents, total_count, csv_file),
                    daemon=True,
                )
                for contents in split_list(item, thread_count)
                if contents
            ]
            for t in thread_list:
                t.start()
            # Finish the current chunk completely before reading the next one.
            for t in thread_list:
                t.join()
        end = datetime.datetime.now()
        logger.info(f"end: {end}")
        logger.info(f"total time cost: {end - start}")
    except Exception:
        logger.error(traceback.format_exc())
    finally:
        logger.info("=============== end process ==================")


# Script entry point: append the sample rows to the input file, then run the job.
if __name__ == '__main__':
    test_write_input()
    process()