I recently encountered a similar problem, although my pdf file had a slightly simpler structure.
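PDFMiner uses classes called "devices" to parse the pages of a pdf file. The basic device class is PDFPageAggregator, which simply parses the text boxes in the file. The converter classes, e.g. TextConverter, XMLConverter and HTMLConverter, also write the result to a file (or to a string stream), and do some more elaborate parsing of the contents.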
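The problem with TextConverter (and PDFPageAggregator) is that they do not recurse deeply enough into the structure of the document to extract the individual columns properly. The two other converters need some information about the structure of the document for display purposes, so they collect more detailed data. In your example pdf, the simplistic devices parse (roughly) only the entire text box containing the columns, which makes it impossible (or at least very difficult) to separate the individual rows. The solution that worked well for me was to create a new class that inherits from PDFPageAggregator (or, alternatively, to fork the distribution and change the class directly),
so that it keeps track of the individual text lines, ordered by their y-coordinate.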
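The key (in my case, at least) is to override the receive_layout method, which is called once for every processed page. The method walks the layout tree of the page recursively and records each text line it finds. In my case, it looks something like this: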
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTPage, LTChar, LTAnno, LAParams, LTTextBox, LTTextLine
class PDFPageDetailedAggregator(PDFPageAggregator):
    """Layout device that records every text line of every processed page.

    Each time a page has been laid out, the page's text lines are appended
    to ``self.rows`` as ``(page_number, x0, y0, x1, y1, text)`` tuples, where
    the four coordinates are the line's bounding box and ``text`` is the
    line's text with all runs of whitespace collapsed to single spaces.
    ``self.rows`` is kept sorted by page number and, within a page, by
    descending ``y0``.
    """

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        """Initialise the base aggregator and the (initially empty) row list.

        The arguments are passed to ``PDFPageAggregator.__init__`` unchanged.
        """
        PDFPageAggregator.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        # Accumulated (page_number, x0, y0, x1, y1, text) tuples for all
        # pages processed so far.
        self.rows = []
        # Zero-based index of the next page to be received; independent of
        # the ``pageno`` handed to the base class.
        self.page_number = 0

    def _collect_rows(self, item, page_number):
        """Recursively append the text lines found under *item* to ``self.rows``.

        Only ``LTPage`` and ``LTTextBox`` containers are descended into; any
        other layout item that is not an ``LTTextLine`` is ignored.
        """
        if isinstance(item, (LTPage, LTTextBox)):
            for child in item:
                self._collect_rows(child, page_number)
        elif isinstance(item, LTTextLine):
            # A line's children are individual characters/annotations, so
            # there is nothing below this level worth recursing into.
            raw_text = ''.join(
                child.get_text()
                for child in item
                if isinstance(child, (LTChar, LTAnno))
            )
            # split()/join() both trims the text and collapses inner
            # whitespace (including the trailing newline annotation).
            text = ' '.join(raw_text.split())
            if text:
                x0, y0, x1, y1 = item.bbox
                self.rows.append((page_number, x0, y0, x1, y1, text))

    def receive_layout(self, ltpage):
        """Record the text lines of *ltpage* and store it as the current result.

        Rows are tagged with the current ``self.page_number``, which is then
        incremented. ``self.rows`` is rebound to a freshly sorted list.
        """
        self._collect_rows(ltpage, self.page_number)
        self.page_number += 1
        # Sort by page, then by descending y0 of the bounding box
        # (presumably top of the page first, as PDF y-coordinates grow
        # upwards -- confirm against the pdfminer layout docs).
        self.rows = sorted(self.rows, key=lambda row: (row[0], -row[2]))
        self.result = ltpage
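For every LTTextLine element, the device stores the page number, the coordinates of the bounding box, and the text of the line. You would use it something like this: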
from pprint import pprint
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
# Run every page of 'pdf_doc.pdf' through the detailed aggregator and print
# the collected (page_number, x0, y0, x1, y1, text) rows.
# The file must stay open while pages are processed, so all parsing happens
# inside the ``with`` block, which also guarantees the handle is closed.
with open('pdf_doc.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    # The password is a constructor argument: PDFDocument.initialize() was
    # removed from PDFMiner (20131113 and later, including pdfminer.six).
    doc = PDFDocument(parser, password='password')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        # Processing a page makes the device append that page's text lines
        # to device.rows; the per-page layout object itself is not needed.
        interpreter.process_page(page)
pprint(device.rows)
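The variable device.rows contains a list of all text lines, ordered by page number and y-coordinate. You can loop over the text lines and group the lines that share the same y-coordinate to form the rows, store the column data, etc.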
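I tried to parse your pdf file with the code above, and the columns are mostly parsed correctly. However, some of the columns are so close to each other that the default PDFMiner heuristics fail to separate them into their own elements. You can probably get around this by tweaking the word margin parameter (the -W flag of the command-line tool pdf2txt.py). In any case, you may want to read through the (poorly documented) PDFMiner API, as well as browse the source code of PDFMiner, which you can obtain from github. (Alas, I cannot paste the link because I do not have enough rep points :'<, but you can hopefully Google your way to the correct repo.)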