Python提取docx文档中例题、插图、表格清单

2016-07-21 Python小屋 Python小屋

from docx import Document

import re


# Collected caption paragraphs, grouped by category:
#   'li'  - example captions  (paragraphs starting with "例<n>-<m> ")
#   'fig' - figure captions   (paragraphs starting with "图<n>-<m> ")
#   'tab' - table captions    (paragraphs starting with "表<n>-<m> ")
# 'tuozhan' is kept for compatibility; nothing in this section fills it.
result = {'li': [], 'fig': [], 'tab': [], 'tuozhan': []}

doc = Document(r'C:\test.docx')

# Raw strings keep "\d" from being parsed as an (invalid) string escape,
# which newer Python versions warn about. Compiling once up front also
# avoids re-resolving the pattern for every paragraph.
caption_patterns = (
    ('li', re.compile(r'例\d+-\d+ ')),
    ('fig', re.compile(r'图\d+-\d+ ')),
    ('tab', re.compile(r'表\d+-\d+ ')),
)

for p in doc.paragraphs:
    t = p.text  # text of the current paragraph

    # A caption is recognised only at the very start of the paragraph
    # (match, not search); the first matching category wins.
    for key, pattern in caption_patterns:
        if pattern.match(t):
            result[key].append(t)
            break


# Print the collected captions: one section per category (examples,
# figures, tables, in that order), each preceded by a rule of 30 '='.
separator = '=' * 30

for category in ('li', 'fig', 'tab'):
    print(separator)
    for caption in result[category]:
        print(caption)