工作中遇到的问题:需要从PDF中提取表格数据,并数据落到MySQL库中。具体情况如下图所示:
目录页:

代码数据页:

使用python实现代码如下(水平有限,此代码仅供自用):
import pdfplumber
pdf = r'XXX.pdf'
wookroot = pdfplumber.open(pdf)
pages = wookroot.pages
table_text = ''
for page in pages[5:9]:
text = page.extract_text()
table_text += text
post_start = table_text.rfind(7 代码集)
post_end = table_text.rfind(附录1)
table_text = table_text[post_start:post_end]
tableName_list = []
for _ in range(111):
tableName = []
post1 = table_text.rfind(()
post2 = table_text.rfind())
post3 = table_text.rfind( )
tableName.append(table_text[post3 + 1:post1])
tableName.append(table_text[post1 + 1:post2])
tableName_list.append(tableName)
table_text = table_text[:post3 - 1]
tableName_list.sort()
print(tableName_list)
print(len(tableName_list))
value_list = []
for page in pages[112:166]:
tables = page.extract_tables()
for ele in tables:
for elele in ele:
value_list.append(elele)
value_list = value_list[10:]
print((value_list))
xh = []
for i, value in enumerate(value_list):
values =[]
if value == ['代码', '名称']:
xh.append(i)
print(len(xh))
value_list_new = []
for i in range(111):
if i != 110:
value_list_new.append(value_list[xh[i]:xh[i + 1]])
else:
value_list_new.append(value_list[xh[i]:])
print(len(value_list_new))
print(value_list_new)
file = open(output.sql, w)
for i, value in enumerate(value_list_new):
for j in value:
if j == ['代码', '名称']:
print(-- %s.dm_%s %s % ((i + 1), tableName_list[i][0].lower(), tableName_list[i][1]))
file.write(-- %s.dm_%s %s\r\n % ((i + 1), tableName_list[i][0].lower(), tableName_list[i][1]))
print(create table dm_%s(dm varchar(200) comment '代码', mc varchar(200) comment '名称') comment='%s'; \
% (tableName_list[i][0].lower(), tableName_list[i][1]))
file.write(drop table if exists dm_%s;\r\n % tableName_list[i][0].lower())
file.write(create table dm_%s(dm varchar(200) comment '代码', mc varchar(200) comment '名称') comment='%s';\r\n\
% (tableName_list[i][0].lower(), tableName_list[i][1]))
else:
print(insert into dm_%s values('%s','%s'); % (tableName_list[i][0].lower(), j[0], j[1]))
file.write(insert into dm_%s(dm, mc) values('%s','%s');\r\n % (tableName_list[i][0].lower(), j[0], j[1]))
file.close()
wookroot.close()