Full version of the code: it extracts text from PDFs, cleans out the useless text, counts the words, and so on. There is still room for optimization; I'll refactor it when I have time, and later it could be integrated into a single framework. Noting it down here for now.

# encoding: utf-8

"""
    author: Gupern 
    purpose: extract text from pdf, clean useless text and count words.
    usage: 
        0. windows + python3.6.2
        1. pip install pdfminer.six
        2. put this py script and all pdf files in one dirctory
        3. run 'python pdf2txtNew.py'
        4. it will generate foo.txt, fooClean.txt according to foo.pdf
            foo.txt: text from pdf.
            fooClean.txt: clean text from foo.txt
    possible problems:
        1. there are four type in your test pdfs, and I write 4 solution for them.
           So if you have more types, it may not work.
        2. I manipulate text(pdf) according to the characteristics I found from 
           test pdfs you gave me, so it may not work because the rule I use may not 
           work for all this kind of pdfs.
    if you have any problem, please contact me, my oicq is four five three one seven 
    two one 0 nine.
"""
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os 
import re 
import collections


def convert_pdf_to_txt(path, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(path, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()
    return text
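
# usage note (hypothetical file name): the optional `pages` argument takes an
# iterable of zero-based page indices; with no argument every page is extracted.
# A minimal sketch:
#     first_two_pages = convert_pdf_to_txt('foo.pdf', pages=[0, 1])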


def saveTxt(txt, filename):
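    # note: filename[:-3] + 'txt' assumes a three-letter extension (.pdf / .PDF)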
    with open(filename[:-3]+'txt', "w") as f:
        print('openTxt:' + filename[:-3]+'txt')
        f.write(txt)


# write the total word count into the first line of the cleaned file
def add_words_count(cleanFilenameFull):
    with open(cleanFilenameFull) as f:
        word_counts = collections.Counter(re.findall(r'\w+', f.read()))
    count_all_words = sum(word_counts.values())
    print(count_all_words)
    line_prepender(cleanFilenameFull, 'count_all_words is: ' + str(count_all_words))


# prepend a line to the top of the file
def line_prepender(filename, line):
    with open(filename, 'r+') as f:
        content = f.read()        
        f.seek(0, 0)        
        f.write(line.rstrip('\r\n') + '\n' + content)


def extractTextFromPDF(rootdir):
    for parent, dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            filenameFull = os.path.join(parent, filename)
            if (filenameFull.endswith('pdf') or filenameFull.endswith('PDF')):
                # txt = readPDF(filenameFull)
                txt = convert_pdf_to_txt(filenameFull)           
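                # replace a few troublesome characters from the extraction with spaces:
                # \xa9 (copyright sign), \xa0 (no-break space),
                # \xad (soft hyphen) and \u037e (Greek question mark)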
                saveTxt(txt.replace(u'\xa9', u' ').replace(u'\xa0',u' ').replace(u'\xad',u' ').replace(u'\u037e',u' '), filenameFull)


def cleanDataAndCountWords(rootdir):
    for parent, dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            filenameFull = os.path.join(parent, filename)
            if (filenameFull.endswith('txt')):
                with open(filenameFull,'r') as f:
                    for line in f.readlines():
                        # marker for type 1
                        if 'D I S C L A I M E R' in line:
                            print(filenameFull + ' is type 1\n')
                            process_type_1(filenameFull)
                            break
                        # marker for type 2
                        if 'CCBN, Inc' in line:
                            print(filenameFull + ' is type 2\n')
                            process_type_2(filenameFull)
                            break
                        # marker for type 3
                        if 'Seeking Alpha' in line:
                            print(filenameFull + ' is type 3\n')
                            process_type_3(filenameFull)
                            break
                        # marker for type 4
                        if 'Factiva' in line:
                            print(filenameFull + ' is type 4\n')
                            process_type_4(filenameFull)
                            break


def process_type_1(filenameFull):
    print('this is processing type 1' + filenameFull)
    begin = False 
    end = False
    delete = False 
    deleteLine = 0
    cleanFilenameFull = filenameFull[:-4] + 'Clean.txt'
    cleanFile = open(cleanFilenameFull, 'w+')
    with open(filenameFull) as f:
        for line in f.readlines():
            # if 'Q U E S T I O N S   A N D   A N S W E R S' in line:
            if not begin and 'P R E S E N T A T I O N' in line:
                print('beginning...')
                begin = True
            if not end and 'D I S C L A I M E R' in line:
                print('ending...')
                end = True
                break
            if begin and 'Thomson Financial' in line:
                delete = True 
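            # the repeated 'Thomson Financial' page header/footer block is assumed
            # to span 14 lines in the test pdfs, so skip that many lines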
            if delete and deleteLine < 14:
                deleteLine += 1
                continue
            elif deleteLine >= 14:
                delete = False 
                deleteLine = 0
            if (begin and not end):
                cleanFile.write(line)
    cleanFile.close()
    add_words_count(cleanFilenameFull)


def process_type_2(filenameFull):
    print('this is processing type 2' + filenameFull)
    begin = False 
    end = False
    delete = False 
    cleanFilenameFull = filenameFull[:-4] + 'Clean.txt'
    cleanFile = open(cleanFilenameFull, 'w+')
    with open(filenameFull) as f:
        for line in f.readlines():
            if not begin and 'OPERATOR:' in line:
                print('beginning...')
                begin = True
            if not end and 'Thomson Financial reserves the right to make changes to documents' in line:
                print('ending...')
                end = True
                break
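            # drop the repeated page-header lines: the 'Fair Disclosure' banner,
            # 'Page' numbers and the weekday/date lines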
            if begin and ('Fair Disclosure' in line or 'Page' in line or 'Thursday' in line or 'Wednesday' in line or 'Friday' in line or 'Monday' in line or 'Tuesday' in line):
                delete = True 
            if delete:
                if ('Fair Disclosure' in line or 'Page' in line or 'Thursday' in line or 'Wednesday' in line or 'Friday' in line or 'Monday' in line or 'Tuesday' in line):
                    delete = False
                continue
            if (begin and not end):
                cleanFile.write(line)

    cleanFile.close()
    add_words_count(cleanFilenameFull)


def process_type_3(filenameFull):
    print('this is processing type 3' + filenameFull)
    begin = False 
    end = False
    cleanFilenameFull = filenameFull[:-4] + 'Clean.txt'
    cleanFile = open(cleanFilenameFull, 'w+')
    with open(filenameFull) as f:
        for line in f.readlines():
            # if 'Q U E S T I O N S   A N D   A N S W E R S' in line:
            if not begin and 'Operator' in line:
                print('beginning...')
                begin = True
            if not end and 'Copyright policy' in line:
                print('ending...')
                end = True
                break
            # skip the Seeking Alpha single-page URL lines
            if 'http://' in line and '?part=single' in line:
                continue
            # skip date stamps like 12/31/2017 or 12/31
            if re.match(r'\d+/\d+/\d+', line):
                continue
            elif re.match(r'\d+/\d+', line):
                continue
            if '| Seeking Alpha' in line:
                continue
            if (begin and not end):
                cleanFile.write(line)
    cleanFile.close()
    add_words_count(cleanFilenameFull)

def process_type_4(filenameFull):
    print('this is processing type 4' + filenameFull)
    begin = False 
    end = False
    delete = False 
    deleteLine = 0
    cleanFilenameFull = filenameFull[:-4] + 'Clean.txt'
    cleanFile = open(cleanFilenameFull, 'w+')
    with open(filenameFull) as f:
        for line in f.readlines():
            # if 'Q U E S T I O N S   A N D   A N S W E R S' in line:
            if not begin and 'OPERATOR:' in line:
                print('beginning...')
                begin = True
            if not end and 'Thomson Financial reserves the right to make changes to documents' in line:
                print('ending...')
                end = True
                break
            if (begin and not end):
                cleanFile.write(line)
    cleanFile.close()
    add_words_count(cleanFilenameFull)
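
# note: to handle a new transcript layout, add another marker check in
# cleanDataAndCountWords and a process_type_5 following the same begin/end
# pattern; a minimal sketch with a hypothetical marker string:
#     if 'SOME NEW MARKER' in line:
#         print(filenameFull + ' is type 5\n')
#         process_type_5(filenameFull)
#         break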

if __name__=='__main__':
    rootdir = './'
    extractTextFromPDF(rootdir)
    cleanDataAndCountWords(rootdir)

The earlier draft of the script is kept below for reference: it still has the readPDF() helper with explicit password/maxpages/caching arguments, stub versions of process_type_2/3/4, and a standalone word/line/character counter (count_txt_words) that the final version above does not use.

# encoding: utf-8
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os 
import re 
import collections
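
# readPDF is the extraction helper from this earlier draft; it passes the
# password, maxpages and caching options of PDFPage.get_pages explicitly,
# but is otherwise equivalent to convert_pdf_to_txt below.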

def readPDF(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text



def convert_pdf_to_txt(path, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(path, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()
    return text

def saveTxt(txt, filename):
    with open(filename[:-3]+'txt', "w") as f:
        print('openTxt:' + filename[:-3]+'txt')
        f.write(txt)


def traversal(rootdir):
    for parent, dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            filenameFull = os.path.join(parent, filename)
            if (filenameFull.endswith('pdf') or filenameFull.endswith('PDF')):
                # txt = readPDF(filenameFull)
                txt = convert_pdf_to_txt(filenameFull)           
                saveTxt(txt.replace(u'\xa9', u' ').replace(u'\xa0',u' ').replace(u'\xad',u' ').replace(u'\u037e',u' '), filenameFull)


def process_txt(rootdir):
    for parent, dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            filenameFull = os.path.join(parent, filename)
            if (filenameFull.endswith('txt')):
                with open(filenameFull,'r') as f:
                    for line in f.readlines():
                        # print(line)
                        # useless fields for type 1
                        # end marker, which doubles as the marker for type 1
                        if 'D I S C L A I M E R' in line:
                            print(filenameFull + ' is type 1\n')
                            process_type_1(filenameFull)
                            break
                        ## marker for type 2
                        #if 'CCBN, Inc' in line:
                        #    print(filenameFull + ' is type 2\n')
                        #    process_type_2(filenameFull)
                        #    break
                        ## marker for type 3
                        #if 'Seeking Alpha' in line:
                        #    print(filenameFull + ' is type 3\n')
                        #    process_type_3(filenameFull)
                        #    break
                        #if 'Factiva' in line:
                        #    print(filenameFull + ' is type 4\n')
                        #    process_type_4(filenameFull)
                        #    break

def process_type_1(filenameFull):
    print('this is processing type 1' + filenameFull)
    begin = False 
    end = False
    delete = False 
    deleteLine = 0
    cleanFilenameFull = filenameFull[:-4] + 'Clean.txt'
    cleanFile = open(cleanFilenameFull, 'w+')
    with open(filenameFull) as f:
        for line in f.readlines():
            # only one beginning; should the presentation section be included or not?
            # if 'Q U E S T I O N S   A N D   A N S W E R S' in line:
            if 'P R E S E N T A T I O N' in line:
                print('beginning...')
                begin = True
            # only one ending
            if 'D I S C L A I M E R' in line:
                print('ending...')
                end = True
                break
            if begin and 'Thomson Financial' in line:
                #print('deleting...')
                delete = True 
            if delete and deleteLine < 14:
                deleteLine += 1
                #print(deleteLine,line)
                continue
            elif deleteLine >= 14:
                delete = False 
                deleteLine = 0
            # print line 
            if (begin and not end):
                # print(line)
                cleanFile.write(line)
    cleanFile.close()
    add_words_count(cleanFilenameFull)

# write the total word count into the first line of the cleaned file
def add_words_count(cleanFilenameFull):
    with open(cleanFilenameFull) as f:
        word_counts = collections.Counter(re.findall(r'\w+', f.read()))
    count_all_words = sum(word_counts.values())
    print(count_all_words)
    line_prepender(cleanFilenameFull, 'count_all_words is: ' + str(count_all_words))


# prepend a line to the top of the file
def line_prepender(filename, line):
    with open(filename, 'r+') as f:
        content = f.read()        
        f.seek(0, 0)        
        f.write(line.rstrip('\r\n') + '\n' + content)



# stubs in this draft; the full implementations live in the final version above
def process_type_2(filenameFull):
    print('this is processing type 2' + filenameFull)
def process_type_3(filenameFull):
    print('this is processing type 3' + filenameFull)
def process_type_4(filenameFull):
    print('this is processing type 4' + filenameFull)

def count_txt_words(file_name):
    lines_count = 0
    words_count = 0
    chars_count = 0
    words_dict  = {}
    lines_list  = []
    with open(file_name, 'r') as f:
        for line in f:
            lines_count = lines_count + 1
            chars_count = chars_count + len(line)
            match = re.findall(r'[^a-zA-Z0-9]+', line)
            for i in match:
                # keep only English words and digits; replace everything else with spaces
                line = line.replace(i, ' ')
            lines_list = line.split()
            words_count = words_count + len(lines_list)
            for i in lines_list:
                if i not in words_dict:
                    words_dict[i] = 1
                else:
                    words_dict[i] = words_dict[i] + 1

    print('words_count is', words_count)
    print('distinct words_count is', len(words_dict))
    print('lines_count is', lines_count)
    print('chars_count is', chars_count)

    for k,v in words_dict.items():
        print(k,v)
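
# usage sketch (hypothetical file name produced by the main script):
#     count_txt_words('fooClean.txt')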

if __name__=='__main__':
    rootdir = './'
#    traversal(rootdir)
    process_txt(rootdir)
