今天心血来潮,研究了一下如何用python从pdf中提取文字并保存为txt,其中涉及到遍历目录下的所有文件,觉得挺有用的,记录一下。

依赖模块

  • pdfminer3k
    pip install pdfminer3k

这个包是python3专用的。一开始找到的是pdfminer,但是其官网说只支持python2。

上网查阅了一些资料,还有个叫pypdf的包,详见下文参考文章部分。

代码

# encoding: utf-8
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open

import os
import os.path


def readPDF(pdfFile):
    """Extract the text content of a PDF and return it as one string.

    Args:
        pdfFile: An open file object for the PDF (the caller in this file
            opens it in ``'rb'`` mode). It is not closed here; the caller
            keeps ownership of it.

    Returns:
        The text that the pdfminer ``TextConverter`` wrote while the
        document was processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter even when parsing fails so it is not leaked.
            device.close()
        # Read the buffer only after the device is closed, as the original did.
        return retstr.getvalue()
    finally:
        retstr.close()


def saveTxt(txt, filename):
    """Write ``txt`` to a ``.txt`` file that sits next to ``filename``.

    The output path is ``filename`` with its extension replaced by ``.txt``
    (for example ``report.pdf`` becomes ``report.txt``). An existing file at
    that path is overwritten.

    Args:
        txt: The text to write.
        filename: Path of the source file; only used to derive the output
            path.
    """
    # splitext copes with extensions of any length, unlike slicing off the
    # last three characters of the name.
    txtPath = os.path.splitext(filename)[0] + '.txt'
    # Explicit UTF-8 so characters that the platform default encoding cannot
    # represent do not raise UnicodeEncodeError.
    with open(txtPath, "w", encoding='utf-8') as f:
        print('openTxt:' + txtPath)
        f.write(txt)


def traversal(rootdir):
    """Walk ``rootdir`` recursively and convert every PDF found to text.

    For each directory visited, the directory, its sub-directories and its
    files are printed. Every file whose name ends in ``.pdf`` (case
    insensitive) is read with ``readPDF`` and the result is passed to
    ``saveTxt`` together with the PDF's full path.

    Args:
        rootdir: Directory at which the walk starts.
    """
    for parent, dirnames, filenames in os.walk(rootdir):
        print('directory is:' + parent)
        for dirname in dirnames:
            print('dirname is:' + dirname)
            print('parent folder is:' + parent)

        for filename in filenames:
            filenameFull = os.path.join(parent, filename)
            print('parent folder is:' + parent)
            print('filename with full path:' + filenameFull)
            # Require a real ".pdf" extension: a bare endswith('pdf') would
            # also match names such as "notpdf" and would miss ".PDF".
            isPdf = filename.lower().endswith('.pdf')
            print(isPdf)
            if not isPdf:
                continue
            # The with-block closes the PDF handle even if extraction fails.
            with open(filenameFull, 'rb') as pdfFile:
                txt = readPDF(pdfFile)
            # U+00A9 (copyright sign) is stripped before saving; presumably a
            # workaround for output encodings that cannot represent it.
            saveTxt(txt.replace(u'\xa9', u''), filenameFull)

# Script entry point: convert every PDF found under the current directory.
if __name__ == '__main__':
    traversal('./')

参考文章