前言
刚刚用的是pdfminer3k的版本,解析某个pdf的时候出现了一个问题,如图:
于是继续深入排查,发现 pdfminer3k 这个包已经很久没有更新了,于是又找到了 pdfminer.six 这个包,并基于它重新编写代码。
- 注意,要把pdfminer3k卸载了才能用pdfminer.six,总之两个不兼容。
找到了一个代码:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of the PDF stored at *path*.

    Every page returned by ``PDFPage.get_pages`` is fed through a
    ``TextConverter`` that writes into an in-memory buffer.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The content of the output buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() rather than file(): file() only exists on Python 2, and the
        # with-statement closes the handle even if a page fails to parse.
        with open(path, 'rb') as fp:
            # An empty page set and maxpages=0 are the "no restriction"
            # values passed by the original snippet.
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        # Runs after the return value has been captured, and on errors too.
        device.close()
        retstr.close()
再把它整合到上一篇的 pdf2txt.py 中,代码如下:
# encoding: utf-8
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os
from os import path
def readPDF(path):
    """Extract the text of the PDF stored at *path*.

    Every page returned by ``PDFPage.get_pages`` is fed through a
    ``TextConverter`` that writes into an in-memory ``StringIO`` buffer.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The text accumulated in the buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    # The with-statements and the finally clause release the buffer, the
    # converter and the file handle even when a page fails to parse.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # An empty page set and maxpages=0 are the "no restriction"
                # values used by the original code.
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    interpreter.process_page(page)
            return retstr.getvalue()
        finally:
            device.close()
def saveTxt(txt, filename):
    """Write *txt* to a text file named after *filename*.

    The output path is *filename* with its last three characters replaced by
    ``txt`` (``report.pdf`` -> ``report.txt``).

    Args:
        txt: Text to write.
        filename: Path of the source file; its last three characters are
            treated as the extension.
    """
    target = filename[:-3] + 'txt'
    # Explicit UTF-8: with the platform default encoding, write() raises
    # UnicodeEncodeError for characters the locale codec cannot represent.
    with open(target, "w", encoding="utf-8") as f:
        print('openTxt:' + target)
        f.write(txt)
def traversal(rootdir):
    """Convert every PDF found below *rootdir* into a text file beside it.

    Walks the directory tree, reads each file whose path ends in ``pdf`` or
    ``PDF`` with ``readPDF``, deletes a fixed set of characters from the
    extracted text and hands the result to ``saveTxt``.

    Args:
        rootdir: Directory at which the walk starts.
    """
    # Characters deleted from the extracted text before saving.
    # NOTE: U+00A0 is the no-break space, so deleting it removes spacing.
    unwanted_chars = (u'\xa9', u'\xa0', u'\xad', u'\u037e')
    for folder, _subdirs, names in os.walk(rootdir):
        for name in names:
            full_path = os.path.join(folder, name)
            if not full_path.endswith(('pdf', 'PDF')):
                continue
            cleaned = readPDF(full_path)
            for char in unwanted_chars:
                cleaned = cleaned.replace(char, u'')
            saveTxt(cleaned, full_path)
if __name__ == '__main__':
    # Script entry point: process the tree rooted at the current directory.
    traversal('./')
新问题 空格解析不出来
突然发现空格解析不出来,继续深入查找,找到一篇帖子,用了他的代码,记录下。
(Stack Overflow 上的帖子,排在最前面的回答不一定是最好的,还得看回答的日期。)
最新更新:空格解析不出来,是因为把方块(\xa0)替换成 ''(空字符串)了,应该替换成空格。
# encoding: utf-8
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os
from os import path
def readPDF(path):
    """Extract the text of the PDF stored at *path*.

    Every page returned by ``PDFPage.get_pages`` is fed through a
    ``TextConverter`` that writes into an in-memory ``StringIO`` buffer.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The text accumulated in the buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    # The with-statements and the finally clause release the buffer, the
    # converter and the file handle even when a page fails to parse.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # An empty page set and maxpages=0 are the "no restriction"
                # values used by the original code.
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    interpreter.process_page(page)
            return retstr.getvalue()
        finally:
            device.close()
def convert_pdf_to_txt(path, pages=None):
    """Extract text from the PDF at *path*, optionally from selected pages.

    Args:
        path: Filesystem path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``. ``None`` or an empty iterable passes an
            empty set, i.e. no page restriction.
            NOTE(review): pdfminer page numbers are presumably zero-based;
            confirm against the library documentation.

    Returns:
        The text collected by the converter.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    # Context managers / finally release the buffer, the converter and the
    # input file even when a page fails to parse.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the with-block closes it.
        return output.getvalue()
def saveTxt(txt, filename):
    """Write *txt* to a text file named after *filename*.

    The output path is *filename* with its last three characters replaced by
    ``txt`` (``report.pdf`` -> ``report.txt``).

    Args:
        txt: Text to write.
        filename: Path of the source file; its last three characters are
            treated as the extension.
    """
    target = filename[:-3] + 'txt'
    # Explicit UTF-8: with the platform default encoding, write() raises
    # UnicodeEncodeError for characters the locale codec cannot represent.
    with open(target, "w", encoding="utf-8") as f:
        print('openTxt:' + target)
        f.write(txt)
def traversal(rootdir):
    """Convert every ``.pdf`` file found below *rootdir* into a text file.

    Walks the directory tree, extracts the text of each PDF with
    ``convert_pdf_to_txt`` (``readPDF`` was used here previously), cleans up
    a few special characters and hands the result to ``saveTxt``.

    Args:
        rootdir: Directory at which the walk starts.
    """
    # U+00A0 (no-break space) is turned into an ordinary space; deleting it,
    # as the code used to do, made the spaces vanish from the output.
    # U+00A9, U+00AD and U+037E are still removed.
    cleanup = {0xa9: None, 0xa0: u' ', 0xad: None, 0x37e: None}
    for parent, _dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            # Require a real ".pdf" extension in any letter case, so that
            # "x.Pdf" is picked up and an extension-less name such as
            # "mypdf" is not passed to the PDF parser.
            if not filename.lower().endswith('.pdf'):
                continue
            filenameFull = os.path.join(parent, filename)
            txt = convert_pdf_to_txt(filenameFull)
            saveTxt(txt.translate(cleanup), filenameFull)
if __name__ == '__main__':
    # Script entry point: process the tree rooted at the current directory.
    traversal('./')