import os, time import pandas as pd def run(): MAX_ROWS_PER_SHEET = 100000 MAX_ROWS_PER_FILE = 1000000 start_time = time.time() dir = r'/Users/yizhuobai/Downloads/excel' DFs = [] file_num = 0 for root, dirs, files in os.walk(dir): files.sort() for file in files: if file.endswith('.xlsx'): file_num += 1 file_path = os.path.join(root, file) df = pd.read_excel(file_path, engine='openpyxl') DFs.append(df) alldata = pd.concat(DFs) alldata['发送时间'] = pd.to_datetime(alldata['发送时间']) print('文件读取完成,读取{} 开始排序'.format(file_num)) alldata = alldata.sort_values(by='发送时间') print('文件读取完成,读取{}'.format(file_num)) current_file_index = 1 current_sheet_index = 1 current_row_count = 0 current_file_row_count = 0 writer = pd.ExcelWriter(f'/Users/yizhuobai/Downloads/merge_{current_file_index}.xlsx', engine='openpyxl') for i in range(len(alldata)): if current_file_row_count >= MAX_ROWS_PER_FILE: writer.close() print('文件已满{}'.format(current_file_index)) current_file_index += 1 writer = pd.ExcelWriter(f'/Users/yizhuobai/Downloads/merge_{current_file_index}.xlsx', engine='openpyxl') current_sheet_index = 1 current_row_count = 0 current_file_row_count = 0 if current_row_count >= MAX_ROWS_PER_SHEET: print('文件 {} sheet 已满{}'.format(current_file_index, current_sheet_index)) current_sheet_index += 1 current_row_count = 0 if current_row_count == 0: print('文件 {} sheet {} 行 {}'.format(current_file_index, current_sheet_index, current_row_count)) alldata.iloc[[i]].to_excel(writer, sheet_name=f'Sheet{current_sheet_index}', index=False, header=True) else: print('文件 {} sheet {} 行 {}'.format(current_file_index, current_sheet_index, current_row_count)) alldata.iloc[[i]].to_excel(writer, sheet_name=f'Sheet{current_sheet_index}', index=False, header=False, startrow=current_row_count) current_row_count += 1 current_file_row_count += 1 writer.close() end_time = time.time() times = round(end_time - start_time, 2) print('合并完成,耗时{}秒'.format(times)) if __name__ == '__main__': run()
|