Python从PPT文件中提取所有文字到Word

阅读 107

2023-02-19

需求

将PPTX文件里面的所有文字提取到一个新的以docx结尾的Word文件中。(python-pptx只支持.pptx格式,旧版.ppt文件需要先用PowerPoint另存为.pptx。)

安装Python库

(1)基于Python 3

(2) 运行下方代码安装需要用到的库

pip install python-pptx
pip install python-docx

执行下方Python代码

import collections
import collections.abc
import sys
import os
from pptx import Presentation
import docx

def _paragraph_texts(text_frame):
    """Yield the text of every non-empty paragraph in *text_frame*."""
    for paragraph in text_frame.paragraphs:
        # Runs are formatting fragments of a single paragraph; join them so
        # a paragraph with mixed formatting is not split into several lines.
        text = "".join(run.text for run in paragraph.runs)
        if text:
            yield text


def _shape_texts(shape):
    """Yield paragraph texts from *shape*, descending into groups and tables."""
    # Imported locally so this block does not rely on extra file-level imports.
    from pptx.shapes.group import GroupShape

    if isinstance(shape, GroupShape):
        # A group shape holds nested shapes rather than text of its own.
        for child in shape.shapes:
            yield from _shape_texts(child)
    elif getattr(shape, "has_table", False):
        # Table text lives in each cell's text frame, not on the shape itself.
        for row in shape.table.rows:
            for cell in row.cells:
                yield from _paragraph_texts(cell.text_frame)
    elif shape.has_text_frame:
        yield from _paragraph_texts(shape.text_frame)


def extract_text(input_file, output_file):
    """Copy the text of a .pptx presentation into a new .docx document.

    Slides are visited in order; within each slide the shapes are visited in
    the order python-pptx returns them. Each non-empty slide paragraph
    (including paragraphs inside table cells and grouped shapes) becomes one
    paragraph in the Word document.

    Args:
        input_file: Path of the .pptx file to read.
        output_file: Path of the .docx file to write.
    """
    prs = Presentation(input_file)
    doc = docx.Document()
    for slide in prs.slides:
        for shape in slide.shapes:
            for text in _shape_texts(shape):
                doc.add_paragraph(text)
    doc.save(output_file)

if __name__ == "__main__":
    # Command-line entry point: python main.py <input_file> <output_file>
    # sys.exit(message) writes the message to stderr and exits with status 1,
    # so shells and calling scripts can detect the failure.
    if len(sys.argv) != 3:
        sys.exit("Usage: python main.py <input_file> <output_file>")
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    # Compare extensions case-insensitively so e.g. "SLIDES.PPTX" is accepted.
    if not input_file.lower().endswith(".pptx") or not output_file.lower().endswith(".docx"):
        sys.exit("Error: input file must be .pptx and output file must be .docx")
    if not os.path.exists(input_file):
        sys.exit("Error: input file does not exist")
    extract_text(input_file, output_file)
    print("Text extracted successfully!")






精彩评论(0)

0 0 举报