Aside
After being tormented by PDF parsing for more than two years, I finally found a decent solution today: the pdfplumber package, plus a little wrapping of my own. The results I used to get from pdfminer were mediocre, with extraction quality of the unbearable kind. After writing up this parsing method I practically wept with relief, so here it is for everyone.
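The core of the approach is just a handful of pdfplumber calls; before the full wrapper below, here is a minimal sketch (the file name is only a placeholder):

import pdfplumber

# Minimal pdfplumber usage: walk the pages and pull out text and tables.
# 'example.pdf' stands in for any local PDF.
with pdfplumber.open('example.pdf') as pdf:
    for page in pdf.pages:
        print(page.extract_text())      # plain text of the page (None for empty pages)
        print(page.extract_tables())    # list of tables, each a list of rows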
Notes
pdfplumber is a re-wrapping of pdfminer. It is best to install pdfminer3k first and then pdfplumber; otherwise the code is prone to errors.
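In practice that just means installing the two packages in that order, e.g. running pip install pdfminer3k and then pip install pdfplumber (adjust as needed for your own environment).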
Source code
# -*- coding: utf-8 -*-
# @Author: 二师父
# @Vtime: 2020-09-04
# @Describe: Parse PDF documents
# @Blog: https://blog.csdn.net/weixin_39181440/article/details/108404354
# @AT: THS
# @Tips: pdfplumber is a re-wrapping of pdfminer; install pdfminer3k first and then pdfplumber, otherwise the code is prone to errors
import pdfplumber
import re, time, os
# <--------------------------------------------------------------------------
# Legacy PDF parser
def parsepdf_old(path_or_url, proxies=None):
    # Lazy imports, only needed by this legacy pdfminer-based parser
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice
    '''
    Hopelessly weak compared with the upgraded version; if you are interested,
    similar pdfminer-based implementations are easy to find online
    '''
    return 'hahaha'
# -------------------------------------------------------------------------->
# <--------------------------------------------------------------------------
# Parse a PDF document
def parsepdf(path_or_url, mode=1, url_params=None, proxies=None, save_as=None):
    '''
    <Syntax>
        path_or_url: path or URL of the PDF document
        mode: parsing mode,
              [1, '1', 'text']            return the document text       -> str
              [2, '2', 'table']           return the table data          -> list
              [3, '3', 'text_and_table']  return both text and tables    -> tuple
        url_params: extra keyword arguments passed to requests when reading an online PDF, type <- dict
        proxies: proxies passed to requests when reading an online PDF
        save_as: when reading an online PDF, save a local copy under this name for later use
    </Syntax>
    '''
    url_mode = False
    # Decide whether this is a local file or an online document
    if re.search(r'''(?x)\A([a-z][a-z0-9+\-.]*)://([a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])''', path_or_url):
        url_mode = True
    else:
        pdf_path = path_or_url
    if url_mode:
        import requests
        pdf_url = path_or_url
        headers_d = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko'}
        if not url_params:
            url_params = {}
        url_params.setdefault('data', None)
        url_params.setdefault('params', None)
        url_params['proxies'] = proxies
        # Fall back to the default headers unless the caller supplied their own
        if not url_params.get('headers'):
            url_params['headers'] = headers_d
        # POST when a body or query parameters were supplied, otherwise GET
        if url_params['data'] or url_params['params']:
            response = requests.post(pdf_url, **url_params)
        else:
            response = requests.get(pdf_url, **url_params)
        # Write the download to a temporary file before parsing
        pdf_path = save_as if save_as else f'~temp{time.time()}~.pdf'
        with open(pdf_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
        pdf_path = os.path.abspath(pdf_path)
    # Parse the PDF document with pdfplumber
    pdf_text = ''
    pdf_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if str(mode).lower() in ['1', 'text', '3', 'text_and_table']:
                # extract_text() may return None on empty pages
                pdf_text += page.extract_text() or ''
            if str(mode).lower() in ['2', 'table', '3', 'text_and_table']:
                pdf_tables += page.extract_tables()
    # Remove the temporary PDF file
    if url_mode and not save_as:
        try:
            os.remove(pdf_path)
        except Exception:
            pass
    if str(mode).lower() in ['1', 'text']:
        return pdf_text
    elif str(mode).lower() in ['2', 'table']:
        return pdf_tables
    elif str(mode).lower() in ['3', 'text_and_table']:
        return pdf_text, pdf_tables
# -------------------------------------------------------------------------->
if __name__ == '__main__':
    # path = 'http://www.srcb.com/res_base/srcb_com_www/upload/fund/image/2020_3/9_3/52kwkemfrs0y.pdf'
    path = 'http://ewealth.abchina.com/fs/Information/OtherAnnouncements/202009/P020200902507638382691.pdf'
    # a = parsepdf(path, mode=1, save_as='123.pdf')
    # path = '123.pdf'
    a = parsepdf(path, mode='TABLE')
    import pandas as pd
    print(pd.DataFrame(a[0]))
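For reference, the other modes are called the same way. A short sketch, assuming the parsepdf function and the path variable defined above; the local file name is only a placeholder:

# Text only, from a local file ('some_report.pdf' is a placeholder)
text = parsepdf('some_report.pdf', mode='text')
# Text and tables together from the online document, keeping a local copy as 'report.pdf'
text, tables = parsepdf(path, mode='text_and_table', save_as='report.pdf')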
Parsing results