批量提取word文档标题
25-04-08 02:28
1124
0
"""GUI tool that batch-extracts heading text from Word (.docx) documents.

The user picks a folder and a maximum heading level; every .docx file found
under that folder (recursively) is scanned for paragraphs whose style name is
"Heading N", and the headings are shown grouped by file and by level. The
result can be saved to a UTF-8 text file.
"""

import os
import sys
from collections import defaultdict

from docx import Document
from PyQt5.QtWidgets import (QApplication, QMainWindow, QVBoxLayout, QHBoxLayout,
                             QWidget, QLabel, QPushButton, QTextEdit, QFileDialog,
                             QSpinBox, QGroupBox, QProgressBar)
from PyQt5.QtCore import Qt
from PyQt5.QtGui import QFont, QIcon


def _find_docx_files(folder):
    """Return the paths of all .docx files below *folder* (recursive).

    The extension check is case-insensitive, and Word's temporary lock files
    (names starting with "~$") are skipped because they are not valid
    documents and would only produce spurious errors.
    """
    paths = []
    for root, _, files in os.walk(folder):
        for filename in files:
            if filename.startswith('~$'):
                continue
            if filename.lower().endswith('.docx'):
                paths.append(os.path.join(root, filename))
    return paths


def _heading_level(style_name):
    """Return the numeric level of a "Heading N" style name, else None."""
    if not style_name or not style_name.startswith('Heading'):
        return None
    try:
        return int(style_name.split()[1])
    except (IndexError, ValueError):
        # e.g. a style called just "Heading" or "Heading Foo".
        return None


def _collect_headings(file_path, max_level):
    """Read one document and group its heading texts by level.

    Returns a ``defaultdict(list)`` mapping level -> list of heading texts,
    containing only levels <= *max_level*. Any exception raised while opening
    or parsing the document propagates to the caller.
    """
    headings = defaultdict(list)
    for paragraph in Document(file_path).paragraphs:
        style = paragraph.style
        level = _heading_level(style.name if style is not None else None)
        if level is not None and level <= max_level:
            headings[level].append(paragraph.text)
    return headings


class TitleExtractorApp(QMainWindow):
    """Main window of the Word heading extraction tool.

    Attributes set up by ``init_ui``:
        selected_folder: path chosen by the user ("" until one is chosen).
        extracted_data: maps a document's path relative to the selected
            folder to a dict of heading level -> list of heading texts.
        extraction_errors: messages for documents that failed to load
            during the last extraction run.
    """

    def __init__(self):
        super().__init__()
        self.setWindowTitle("Word文档标题提取工具")
        # Provide an icon file next to the script, or delete this line.
        self.setWindowIcon(QIcon('icon.png'))
        self.setGeometry(100, 100, 800, 600)

        # Main window style sheet.
        self.setStyleSheet("""
            QMainWindow {
                background-color: #f5f5f5;
            }
            QGroupBox {
                border: 1px solid #ccc;
                border-radius: 5px;
                margin-top: 10px;
                padding-top: 15px;
                font-size: 14px;
            }
            QGroupBox::title {
                subcontrol-origin: margin;
                left: 10px;
                padding: 0 3px;
            }
            QPushButton {
                background-color: #4CAF50;
                border: none;
                color: white;
                padding: 8px 16px;
                text-align: center;
                text-decoration: none;
                font-size: 14px;
                margin: 4px 2px;
                border-radius: 4px;
            }
            QPushButton:hover {
                background-color: #45a049;
            }
            QPushButton:pressed {
                background-color: #3e8e41;
            }
            QTextEdit {
                border: 1px solid #ccc;
                border-radius: 4px;
                padding: 8px;
                font-family: 'Segoe UI', Arial, sans-serif;
            }
            QSpinBox {
                padding: 5px;
                font-size: 14px;
            }
        """)

        self.init_ui()

    def init_ui(self):
        """Build all widgets, wire up the buttons and reset the state."""
        # Main layout
        main_widget = QWidget()
        main_layout = QVBoxLayout()

        # Title banner
        title_label = QLabel("Word文档标题提取工具")
        title_label.setFont(QFont('Arial', 16, QFont.Bold))
        title_label.setAlignment(Qt.AlignCenter)
        title_label.setStyleSheet("color: #333; margin-bottom: 20px;")

        # Settings group
        settings_group = QGroupBox("提取设置")
        settings_layout = QHBoxLayout()

        # Folder selection
        folder_layout = QVBoxLayout()
        self.folder_label = QLabel("未选择文件夹")
        self.folder_label.setStyleSheet("color: #666;")
        browse_button = QPushButton("选择文件夹")
        browse_button.clicked.connect(self.select_folder)
        folder_layout.addWidget(QLabel("文档文件夹:"))
        folder_layout.addWidget(self.folder_label)
        folder_layout.addWidget(browse_button)

        # Maximum heading level
        level_layout = QVBoxLayout()
        level_label = QLabel("最大标题级别:")
        self.level_spin = QSpinBox()
        self.level_spin.setRange(1, 6)
        self.level_spin.setValue(3)
        level_layout.addWidget(level_label)
        level_layout.addWidget(self.level_spin)

        # Progress bar
        self.progress_bar = QProgressBar()
        self.progress_bar.setRange(0, 100)
        self.progress_bar.setValue(0)
        self.progress_bar.setTextVisible(False)

        # Assemble the settings group (70/30 horizontal stretch)
        settings_layout.addLayout(folder_layout, 70)
        settings_layout.addLayout(level_layout, 30)
        settings_group.setLayout(settings_layout)

        # Action buttons
        button_layout = QHBoxLayout()
        extract_button = QPushButton("提取标题")
        extract_button.clicked.connect(self.extract_titles)
        extract_button.setStyleSheet("background-color: #2196F3;")
        save_button = QPushButton("保存结果")
        save_button.clicked.connect(self.save_results)
        save_button.setStyleSheet("background-color: #FF9800;")
        clear_button = QPushButton("清空结果")
        clear_button.clicked.connect(self.clear_results)
        clear_button.setStyleSheet("background-color: #f44336;")
        button_layout.addWidget(extract_button)
        button_layout.addWidget(save_button)
        button_layout.addWidget(clear_button)

        # Result display
        result_group = QGroupBox("提取结果")
        result_layout = QVBoxLayout()
        self.result_text = QTextEdit()
        self.result_text.setReadOnly(True)
        result_layout.addWidget(self.result_text)
        result_group.setLayout(result_layout)

        # Assemble the main layout
        main_layout.addWidget(title_label)
        main_layout.addWidget(settings_group)
        main_layout.addWidget(self.progress_bar)
        main_layout.addLayout(button_layout)
        main_layout.addWidget(result_group)
        main_widget.setLayout(main_layout)
        self.setCentralWidget(main_widget)

        # Status bar
        self.statusBar().showMessage("准备就绪")

        # State
        self.selected_folder = ""
        self.extracted_data = {}
        self.extraction_errors = []

    def select_folder(self):
        """Ask the user for a folder and remember it if one was chosen."""
        folder = QFileDialog.getExistingDirectory(self, "选择包含Word文档的文件夹")
        if folder:
            self.selected_folder = folder
            self.folder_label.setText(folder)
            self.statusBar().showMessage(f"已选择文件夹: {folder}")

    def extract_titles(self):
        """Scan every .docx under the selected folder and show its headings.

        Documents that cannot be opened are recorded in
        ``extraction_errors`` and reported after the results instead of
        aborting the run.
        """
        if not self.selected_folder:
            self.statusBar().showMessage("请先选择文件夹!", 3000)
            return

        max_level = self.level_spin.value()
        self.result_text.clear()
        self.extracted_data = {}
        self.extraction_errors = []
        self.progress_bar.setValue(0)

        # Walk the tree once so the count and the processed list always agree.
        docx_paths = _find_docx_files(self.selected_folder)
        total_files = len(docx_paths)
        if total_files == 0:
            self.statusBar().showMessage("所选文件夹中没有找到Word文档!", 3000)
            return

        processed_files = 0
        for index, file_path in enumerate(docx_paths, 1):
            # Key by path relative to the chosen folder: bare filenames
            # collide when subfolders contain same-named documents.
            display_name = os.path.relpath(file_path, self.selected_folder)
            try:
                file_data = _collect_headings(file_path, max_level)
            except Exception as e:
                # Best-effort batch: one corrupt document must not stop the rest.
                self.extraction_errors.append(
                    f"处理文件 {display_name} 时出错: {str(e)}\n")
            else:
                processed_files += 1
                if file_data:
                    self.extracted_data[display_name] = file_data
            # Advance on every attempted file, failed or not.
            self.progress_bar.setValue(int((index / total_files) * 100))

        # Show results
        self.display_results()
        self.progress_bar.setValue(100)
        self.statusBar().showMessage(f"提取完成!共处理 {processed_files} 个文件", 5000)

    def display_results(self):
        """Render ``extracted_data`` followed by any recorded load errors."""
        self.result_text.clear()

        if not self.extracted_data:
            self.result_text.append("没有提取到任何标题数据")
        else:
            for filename, levels in self.extracted_data.items():
                self.result_text.append(f"=== {filename} ===")
                for level in sorted(levels.keys()):
                    self.result_text.append(f"\n[标题 {level}]")
                    for i, title in enumerate(levels[level], 1):
                        self.result_text.append(f"{i}. {title}")
                self.result_text.append("\n")

        # Errors are written after the clear() above so they stay visible.
        for message in self.extraction_errors:
            self.result_text.append(message)

    def save_results(self):
        """Write the extracted headings to a UTF-8 text file chosen by the user."""
        if not self.extracted_data:
            self.statusBar().showMessage("没有可保存的数据!", 3000)
            return

        file_path, _ = QFileDialog.getSaveFileName(self, "保存结果", "", "文本文件 (*.txt)")
        if file_path:
            try:
                with open(file_path, 'w', encoding='utf-8') as f:
                    for filename, levels in self.extracted_data.items():
                        f.write(f"=== {filename} ===\n")
                        for level in sorted(levels.keys()):
                            f.write(f"\n[标题 {level}]\n")
                            for i, title in enumerate(levels[level], 1):
                                f.write(f"{i}. {title}\n")
                        f.write("\n")
                self.statusBar().showMessage(f"结果已保存到: {file_path}", 5000)
            except Exception as e:
                self.statusBar().showMessage(f"保存失败: {str(e)}", 5000)

    def clear_results(self):
        """Drop all extracted data and errors and reset the display."""
        self.result_text.clear()
        self.extracted_data = {}
        self.extraction_errors = []
        self.progress_bar.setValue(0)
        self.statusBar().showMessage("已清空结果", 3000)


if __name__ == "__main__":
    app = QApplication(sys.argv)

    # Application-wide font
    font = QFont()
    font.setFamily("Segoe UI")
    font.setPointSize(10)
    app.setFont(font)

    window = TitleExtractorApp()
    window.show()
    sys.exit(app.exec_())
-
电商经验 - 讨好快递小哥
分享给做电商朋友的一个经验,坐标浙江义乌,每天来收件的快递小哥,我时不时给一包烟,每次来我只要在都会给红牛!只要他收的快递哪一家爆... 807 0 24-11-18 -
考研政治历年真题
1994年政治考研真题(文科)及参考答案.doc1994年政治考研真题(理科)及参考答案.doc1995年政治考研真题(文科)及参考答案.doc1995年政治考研真... 1149 0 24-08-20 -
Wise Care 365中文破解版 v6.7.2.646 精简优化版 (绿色单文件)
WiseCare365 是由WiseCleaner开发的一款用来管理,维护,配置以及解决电脑故障的适用于Windows操作系统的电脑的免费软件。其实Wise Care... 1446 0 24-07-10 -
3D小人素材ppt
648 0 21-07-12 -
档案管理表格 - 企业管理表格
备考表.doc档案索引图表.doc归档案卷目录.doc档案目录卡.doc档案明细表.doc档案内容登记簿.doc(机密)文件保管备查簿.doc作废档案焚毁清册... 1025 0 24-07-30 -
普通话水平测试专用教材音频
01 声母01 试卷一.mp302 试卷二.mp302 韵母03 声调03 试卷三.mp304 试卷四.mp304 轻声05 儿化05 试卷五.mp306 朗读短文06 试卷... 1165 0 24-08-20 -
杨任东竹石体
1027 0 24-07-24 -
社区论坛小程序源码
这是一款社区论坛小程序源码,内含强大的功能,支持多种多样的发帖模式,比如:发图文,发语音,发涂鸦,发视频等,另外也可以设置为只能会员才可... 1156 0 24-06-19
发表我的评论
共0条评论
- 这篇文章还没有收到评论,赶紧来抢沙发吧~