Python解析PDF文件的简单方法
Python •
做Java Web系统的时候,经常需要处理一些PDF文件,用Java来读取和解析感觉比较麻烦,所以就想用Python来处理。Python的代码量果然比Java少了很多。
还是直接上代码吧:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
import io
def pdf2txt(path: str) -> str:
    """Extract the text of a PDF file using pdfminer's layout analysis.

    Each page is run through a ``PDFPageAggregator``; the text of every
    layout object that exposes ``get_text`` is collected and concatenated
    in the order the pages and layout objects are yielded.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The concatenated text of the document (empty string if no layout
        object carries text).

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    chunks = []
    with open(path, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: {!r}'.format(path)
            )
        pdfrm = PDFResourceManager()
        device = PDFPageAggregator(pdfrm, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(pdfrm, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                # get_result() returns the layout of the page just processed.
                layout = device.get_result()
                # Only some layout objects expose get_text(); skip the rest.
                chunks.extend(
                    item.get_text()
                    for item in layout
                    if hasattr(item, "get_text")
                )
        finally:
            # Release the device even if parsing a page raises.
            device.close()
    return "".join(chunks)
# Demo run: extract the text of a sample PDF and print it to stdout.
# The path is a hard-coded Windows location; change it to a local file.
print(
    pdf2txt(path='C:\\Users\\Administrator\\Desktop\\test.pdf')
)