前言

刚刚用的是pdfminer3k的版本,解析某个pdf的时候出现了一个问题,如图:
pdfminer3k的问题

于是继续深入,发现pdfminer3k这个包已经很久没有更新了,于是又找到了pdfminer.six这个包,并基于它重新编写代码。

  • 注意,要把pdfminer3k卸载了才能用pdfminer.six,总之两个不兼容。

找到了一个代码:

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

def convert_pdf_to_txt(path):
    """Return the text that TextConverter collects from the PDF at *path*.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` whose device writes into an in-memory buffer.

    Args:
        path: Filesystem path of the PDF file; it is opened in binary mode.

    Returns:
        The contents of the in-memory buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin, and the with
        # block closes the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            # Empty page set + maxpages=0: presumably "all pages" -- confirm
            # against the pdfminer.six documentation.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the finally clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

再把它整合到上一篇的pdf2txt.py中。代码如下:

# encoding: utf-8
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os 
from os import path

def readPDF(path):
    """Return the text that TextConverter collects from the PDF at *path*.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` whose device writes into an in-memory buffer.

    Args:
        path: Filesystem path of the PDF file; it is opened in binary mode.

    Returns:
        The contents of the in-memory buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with block closes the file even when a page fails to parse.
        with open(path, 'rb') as fp:
            # Empty page set + maxpages=0: presumably "all pages" -- confirm
            # against the pdfminer.six documentation.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the finally clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()



def saveTxt(txt, filename):
    """Write *txt* to a text file named after *filename*.

    The output path is *filename* with its last three characters replaced by
    ``txt`` (``report.pdf`` -> ``report.txt``). The path is printed before
    the text is written.

    Args:
        txt: Text to write.
        filename: Path of the source file the text was extracted from.
    """
    target = filename[:-3] + 'txt'
    # Pin the encoding: the platform default (for example a legacy Windows
    # code page) raises UnicodeEncodeError on characters it cannot represent.
    with open(target, "w", encoding='utf-8') as f:
        print('openTxt:' + target)
        f.write(txt)


def traversal(rootdir):
    """Walk *rootdir* recursively and convert each PDF found to a text file.

    Every file whose full path ends in ``pdf`` or ``PDF`` is read with
    readPDF(). The characters U+00A9, U+00A0, U+00AD and U+037E are removed
    from the extracted text before it is passed to saveTxt().
    """
    for folder, _subdirs, names in os.walk(rootdir):
        for name in names:
            filenameFull = os.path.join(folder, name)
            if not filenameFull.endswith(('pdf', 'PDF')):
                continue
            cleaned = readPDF(filenameFull)
            # Strip the four characters one after another.
            for unwanted in (u'\xa9', u'\xa0', u'\xad', u'\u037e'):
                cleaned = cleaned.replace(unwanted, u'')
            saveTxt(cleaned, filenameFull)

if __name__ == "__main__":
    # Script entry point: convert the PDFs found under the current directory.
    rootdir = './'
    traversal(rootdir=rootdir)

新问题 空格解析不出来

突然发现空格解析不出来,继续深入查找,找到一篇帖子,用了他的代码,记录下。
(在stackoverflow的帖子里,得票最高的回答不一定就是最好的,还得看回答的日期。)

最新更新:空格解析不出来,是因为把方块字符替换成了空字符串'',应该替换成空格。

# encoding: utf-8
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os 
from os import path

def readPDF(path):
    """Return the text that TextConverter collects from the PDF at *path*.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` whose device writes into an in-memory buffer.

    Args:
        path: Filesystem path of the PDF file; it is opened in binary mode.

    Returns:
        The contents of the in-memory buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with block closes the file even when a page fails to parse.
        with open(path, 'rb') as fp:
            # Empty page set + maxpages=0: presumably "all pages" -- confirm
            # against the pdfminer.six documentation.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the finally clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()



def convert_pdf_to_txt(path, pages=None):
    """Return the text that TextConverter collects from the PDF at *path*.

    Args:
        path: Filesystem path of the PDF file; it is opened in binary mode.
        pages: Optional iterable of page numbers, passed to
            ``PDFPage.get_pages`` as a set. A falsy value (``None``, empty)
            results in an empty set -- presumably meaning "all pages";
            confirm against the pdfminer.six documentation.

    Returns:
        The contents of the in-memory buffer after the pages were processed.
    """
    pagenums = set(pages) if pages else set()
    # The buffer is closed on exit from the with block, including on errors.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # The file is closed even when a page fails to parse.
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # As in the original, the buffer is read after the converter closes.
        return output.getvalue()

def saveTxt(txt, filename):
    """Write *txt* to a text file named after *filename*.

    The output path is *filename* with its last three characters replaced by
    ``txt`` (``report.pdf`` -> ``report.txt``). The path is printed before
    the text is written.

    Args:
        txt: Text to write.
        filename: Path of the source file the text was extracted from.
    """
    target = filename[:-3] + 'txt'
    # Pin the encoding: the platform default (for example a legacy Windows
    # code page) raises UnicodeEncodeError on characters it cannot represent.
    with open(target, "w", encoding='utf-8') as f:
        print('openTxt:' + target)
        f.write(txt)


def traversal(rootdir):
    """Walk *rootdir* recursively and convert each PDF found to a text file.

    Files whose name ends in ``.pdf`` (case-insensitive) are read with
    convert_pdf_to_txt(); the cleaned text is passed to saveTxt() together
    with the PDF's full path.

    Args:
        rootdir: Directory whose tree is searched for PDF files.
    """
    # U+00A0 (no-break space) becomes a regular space; deleting it outright
    # was what made the spaces disappear from the output. The other three
    # characters (U+00A9, U+00AD, U+037E) are still removed.
    cleanup = {0xA9: None, 0xA0: u' ', 0xAD: None, 0x37E: None}
    for parent, _dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            # Require a real ".pdf" extension so names such as "notapdf"
            # are no longer treated as PDFs.
            if not filename.lower().endswith('.pdf'):
                continue
            filenameFull = os.path.join(parent, filename)
            txt = convert_pdf_to_txt(filenameFull)
            saveTxt(txt.translate(cleanup), filenameFull)

if __name__ == "__main__":
    # Script entry point: convert the PDFs found under the current directory.
    rootdir = './'
    traversal(rootdir=rootdir)

参考链接

Extracting text from a PDF file using PDFMiner in python?

How do I use pdfminer as a library