您的位置:首页 > 技术中心 > 其他 >

如何使用Python批量处理PDF文件并计算自定义关键词的出现次数?

时间:2023-04-20 20:16

    函数模块介绍

    具体的代码可见全部代码部分,这部分只介绍思路和相应的函数模块

    对文件进行批量重命名

    因为文件名是中文,且与最后的结果无关,所以批量重命名为数字
    注意如果不是第一次运行,即已经命名完成,就在主函数内把这个函数注释掉就好了

    def rename(path='dealPdf'):
        """Rename every regular file in ``path`` to ``1.pdf``, ``2.pdf``, ...

        Sub-directories are left untouched and do not consume a number, so the
        resulting numbers are contiguous starting at 1.  Files whose stem is
        already a number are ordered numerically and come first; the remaining
        files follow in alphabetical order.  Re-running on a folder that only
        holds ``1.pdf`` .. ``N.pdf`` therefore keeps every file's number.

        Args:
            path: Folder holding the files to rename; defaults to ``'dealPdf'``.

        Raises:
            OSError: If the folder cannot be listed or a rename fails.
        """
        import uuid  # local import: only this helper needs it

        def order(name):
            # Numeric stems sort by value ("2" before "10"), others by name.
            stem = os.path.splitext(name)[0]
            if stem.isdecimal():
                return (0, int(stem), name)
            return (1, 0, name)

        names = sorted(
            (name for name in os.listdir(path)
             if not os.path.isdir(os.path.join(path, name))),
            key=order,
        )

        # Phase 1: park every file under a unique temporary name, so that a
        # final name such as "2.pdf" can never overwrite a file that is still
        # waiting to be renamed.
        token = uuid.uuid4().hex
        parked_paths = []
        for name in names:
            parked = os.path.join(path, f'.{token}-{len(parked_paths)}.tmp')
            os.rename(os.path.join(path, name), parked)
            parked_paths.append(parked)

        # Phase 2: hand out the final, contiguous numbers.
        for number, parked in enumerate(parked_paths, start=1):
            os.rename(parked, os.path.join(path, f'{number}.pdf'))

    将PDF转化为txt

    PDF是无法直接进行文本分析的,所以需要将文字转成txt文件(PDF中图内的文字无法提取)

    # Convert one PDF file into a txt file
    def pdf_to_txt(dealPdf, index):
        """Extract the text of one PDF into ``dealTxt/<index>.txt``.

        Only ``LTTextBoxHorizontal`` layout elements are written, in the order
        the layout analysis yields them.

        Args:
            dealPdf: Path of the PDF file to read.
            index: Number used to name the output text file.

        Raises:
            PDFTextExtractionNotAllowed: If the document does not allow text
                extraction; no text file is written in that case.
        """
        # Keep the root logger at ERROR so that warnings are not shown.
        logging.getLogger().setLevel(logging.ERROR)

        os.makedirs('dealTxt', exist_ok=True)
        txt_filename = os.path.join('dealTxt', str(index) + '.txt')

        # Device and interpreter share one resource manager, as in the
        # pdfminer usage examples.
        resources = PDFResourceManager()
        device = PDFPageAggregator(resources, laparams=LAParams())
        interpreter = PDFPageInterpreter(resources, device)

        # The PDF stays open while pages are parsed and is closed afterwards.
        with open(dealPdf, 'rb') as pdf_file:
            doc = PDFDocument(PDFParser(pdf_file))
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            with open(txt_filename, 'w', encoding="utf-8") as fw:
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    # The aggregator returns the LTPage built for this page;
                    # it holds the layout objects found on the page.
                    layout = device.get_result()
                    for element in layout:
                        if isinstance(element, LTTextBoxHorizontal):
                            fw.write(element.get_text())

    删除txt中的换行符

    因为PDF导出的txt会用换行符换行,为了避免词语因此拆开,所以删除所有的换行符

    # Remove the line breaks from a txt file
    def delete_huanhangfu(dealTxt, index):
        """Copy ``dealTxt`` to ``outPutTxt/<index>.txt`` without line breaks.

        The text exported from a PDF is wrapped with newlines; removing them
        keeps a word that was split across two lines in one piece.

        Args:
            dealTxt: Path of the UTF-8 text file to read.
            index: Number used to name the output text file.
        """
        os.makedirs('outPutTxt', exist_ok=True)
        outPutTxt = os.path.join('outPutTxt', str(index) + '.txt')

        with open(dealTxt, 'r', encoding="utf-8") as f:
            text = f.read()

        # In text mode every line ending is read as '\n', so dropping that one
        # character removes all line breaks.
        with open(outPutTxt, 'w', encoding="utf-8") as fw:
            fw.write(text.replace('\n', ''))

    添加自定义词语

    此处可以根据自己的需要自定义:wordsByMyself是全局变量(不是函数参数),word_by_myself()会把其中的每个词通过jieba.add_word加入分词词典,具体代码见全部代码部分

    分词与词频统计

    调用jieba进行分词,读取停用词表去掉停用词(此步其实可以省略,对最终结果影响不大),将词语和出现次数合成为键值对,输出关键词出现次数

    # Segment the text and count the custom keywords
    def cut_and_count(outPutTxt):
        """Print how often each word of ``wordsByMyself`` occurs in a text file.

        The text is segmented with ``jieba.lcut``.  Tokens listed in
        ``stopwords.txt``, the two punctuation tokens below and all
        single-character tokens are ignored, so a keyword that is a stop word
        or a single character is always reported as 0.  One line
        ``<keyword>:<count>`` is printed per entry of ``wordsByMyself``.

        Args:
            outPutTxt: Path of the UTF-8 text file to analyse.
        """
        # Step 1: read the document and segment it with jieba.
        with open(outPutTxt, encoding='utf-8') as f:
            words = jieba.lcut(f.read())

        # Step 2: load the stop words; the file is closed right after reading.
        with open('stopwords.txt', encoding='utf-8') as stop_file:
            stopwords = {line.rstrip() for line in stop_file}
        punctuation = ("。", ",")

        # Step 3: count only the custom keywords, starting each one at 0.
        counts = dict.fromkeys(wordsByMyself, 0)
        for word in words:
            if word in stopwords or word in punctuation:
                continue
            if len(word) == 1:  # single-character tokens are not counted
                continue
            if word in counts:
                counts[word] += 1

        for keyword in wordsByMyself:
            print(keyword + ':' + str(counts[keyword]))

    主函数

    通过for循环进行批量操作

    if __name__ == "__main__":
        # rename()  # only needed on the first run, before the PDFs are numbered

        # Registering the custom words does not depend on the file being
        # processed, so it is done once instead of once per file.
        word_by_myself()

        for i in range(1, fileNum + 1):
            pdf_path = os.path.join('dealPdf', str(i) + '.pdf')
            txt_path = os.path.join('dealTxt', str(i) + '.txt')
            clean_path = os.path.join('outPutTxt', str(i) + '.txt')

            pdf_to_txt(pdf_path, i)  # convert the PDF into a txt file
            # Remove line breaks so that words are not split across lines.
            delete_huanhangfu(txt_path, i)
            print(f'----------result {i}----------')
            cut_and_count(clean_path)  # segment the text and count keywords

    本地文件结构

    python怎么批量处理PDF文档输出自定义关键词的出现次数

    全部代码

    import logging
    import os
    import uuid

    import jieba
    import jieba.analyse
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LTTextBoxHorizontal, LAParams
    from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed

    wordsByMyself = ['社会责任', '义务', '上市', '公司']  # custom keywords (global)
    fileNum = 16  # total number of files to process


    # Rename all files in the folder so they can be processed by number
    def rename(path='dealPdf'):
        """Rename every regular file in ``path`` to ``1.pdf``, ``2.pdf``, ...

        Sub-directories are left untouched and do not consume a number, so the
        resulting numbers are contiguous starting at 1.  Files whose stem is
        already a number are ordered numerically and come first; the remaining
        files follow in alphabetical order.  Re-running on a folder that only
        holds ``1.pdf`` .. ``N.pdf`` therefore keeps every file's number.

        Args:
            path: Folder holding the files to rename; defaults to ``'dealPdf'``.

        Raises:
            OSError: If the folder cannot be listed or a rename fails.
        """
        def order(name):
            # Numeric stems sort by value ("2" before "10"), others by name.
            stem = os.path.splitext(name)[0]
            if stem.isdecimal():
                return (0, int(stem), name)
            return (1, 0, name)

        names = sorted(
            (name for name in os.listdir(path)
             if not os.path.isdir(os.path.join(path, name))),
            key=order,
        )

        # Phase 1: park every file under a unique temporary name, so that a
        # final name such as "2.pdf" can never overwrite a file that is still
        # waiting to be renamed.
        token = uuid.uuid4().hex
        parked_paths = []
        for name in names:
            parked = os.path.join(path, f'.{token}-{len(parked_paths)}.tmp')
            os.rename(os.path.join(path, name), parked)
            parked_paths.append(parked)

        # Phase 2: hand out the final, contiguous numbers.
        for number, parked in enumerate(parked_paths, start=1):
            os.rename(parked, os.path.join(path, f'{number}.pdf'))


    # Convert one PDF file into a txt file
    def pdf_to_txt(dealPdf, index):
        """Extract the text of one PDF into ``dealTxt/<index>.txt``.

        Only ``LTTextBoxHorizontal`` layout elements are written, in the order
        the layout analysis yields them.

        Args:
            dealPdf: Path of the PDF file to read.
            index: Number used to name the output text file.

        Raises:
            PDFTextExtractionNotAllowed: If the document does not allow text
                extraction; no text file is written in that case.
        """
        # Keep the root logger at ERROR so that warnings are not shown.
        logging.getLogger().setLevel(logging.ERROR)

        os.makedirs('dealTxt', exist_ok=True)
        txt_filename = os.path.join('dealTxt', str(index) + '.txt')

        # Device and interpreter share one resource manager, as in the
        # pdfminer usage examples.
        resources = PDFResourceManager()
        device = PDFPageAggregator(resources, laparams=LAParams())
        interpreter = PDFPageInterpreter(resources, device)

        # The PDF stays open while pages are parsed and is closed afterwards.
        with open(dealPdf, 'rb') as pdf_file:
            doc = PDFDocument(PDFParser(pdf_file))
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            with open(txt_filename, 'w', encoding="utf-8") as fw:
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    # The aggregator returns the LTPage built for this page;
                    # it holds the layout objects found on the page.
                    layout = device.get_result()
                    for element in layout:
                        if isinstance(element, LTTextBoxHorizontal):
                            fw.write(element.get_text())


    # Remove the line breaks from a txt file
    def delete_huanhangfu(dealTxt, index):
        """Copy ``dealTxt`` to ``outPutTxt/<index>.txt`` without line breaks.

        The text exported from a PDF is wrapped with newlines; removing them
        keeps a word that was split across two lines in one piece.

        Args:
            dealTxt: Path of the UTF-8 text file to read.
            index: Number used to name the output text file.
        """
        os.makedirs('outPutTxt', exist_ok=True)
        outPutTxt = os.path.join('outPutTxt', str(index) + '.txt')

        with open(dealTxt, 'r', encoding="utf-8") as f:
            text = f.read()

        # In text mode every line ending is read as '\n', so dropping that one
        # character removes all line breaks.
        with open(outPutTxt, 'w', encoding="utf-8") as fw:
            fw.write(text.replace('\n', ''))


    # Add the custom words to jieba
    def word_by_myself():
        """Register every entry of ``wordsByMyself`` with ``jieba.add_word``."""
        for word in wordsByMyself:
            jieba.add_word(word)


    # Segment the text and count the custom keywords
    def cut_and_count(outPutTxt):
        """Print how often each word of ``wordsByMyself`` occurs in a text file.

        The text is segmented with ``jieba.lcut``.  Tokens listed in
        ``stopwords.txt``, the two punctuation tokens below and all
        single-character tokens are ignored, so a keyword that is a stop word
        or a single character is always reported as 0.  One line
        ``<keyword>:<count>`` is printed per entry of ``wordsByMyself``.

        Args:
            outPutTxt: Path of the UTF-8 text file to analyse.
        """
        # Step 1: read the document and segment it with jieba.
        with open(outPutTxt, encoding='utf-8') as f:
            words = jieba.lcut(f.read())

        # Step 2: load the stop words; the file is closed right after reading.
        with open('stopwords.txt', encoding='utf-8') as stop_file:
            stopwords = {line.rstrip() for line in stop_file}
        punctuation = ("。", ",")

        # Step 3: count only the custom keywords, starting each one at 0.
        counts = dict.fromkeys(wordsByMyself, 0)
        for word in words:
            if word in stopwords or word in punctuation:
                continue
            if len(word) == 1:  # single-character tokens are not counted
                continue
            if word in counts:
                counts[word] += 1

        for keyword in wordsByMyself:
            print(keyword + ':' + str(counts[keyword]))


    # Main entry point
    if __name__ == "__main__":
        rename()

        # Registering the custom words does not depend on the file being
        # processed, so it is done once instead of once per file.
        word_by_myself()

        for i in range(1, fileNum + 1):
            pdf_path = os.path.join('dealPdf', str(i) + '.pdf')
            txt_path = os.path.join('dealTxt', str(i) + '.txt')
            clean_path = os.path.join('outPutTxt', str(i) + '.txt')

            pdf_to_txt(pdf_path, i)  # convert the PDF into a txt file
            # Remove line breaks so that words are not split across lines.
            delete_huanhangfu(txt_path, i)
            print(f'----------result {i}----------')
            cut_and_count(clean_path)  # segment the text and count keywords

    结果预览

    python怎么批量处理PDF文档输出自定义关键词的出现次数

    以上就是“如何使用Python批量处理PDF文件并计算自定义关键词的出现次数”的详细内容,更多请关注Gxl网其它相关文章!

    热门排行

    今日推荐

    热门手游