Python 3 读取 PDF 文档,输出文本内容(txt)
"""Extract text from PDF reports and print selected figures found in it.

For every ``.pdf`` file in a directory the script converts the document to
plain text with pdfminer, strips the line breaks and searches the text for
two figures (gross domestic product and general public budget revenue).
"""
import os
import re
from io import StringIO, open  # io.open is the builtin open; import kept from the original script
from urllib.request import urlopen  # kept from the original script (remote-PDF variant)

# Directory scanned by default. The original literal contained the invalid
# escape "\政"; the backslash is doubled here so the runtime value is the same
# but no SyntaxWarning is raised on Python 3.12+.
FILES_DIR = "D:\\0.shenma\\01.聊城资料\\政府工作报告\\2019政府工作报告全文"

# Text between "生产总值" (optionally followed by "完成") and the next "亿元".
GDP_PATTERN = re.compile(r"生产总值(?:完成)?(.+?)亿元")
# Text between "公共预算收入" (optionally followed by "完成") and the next comma.
# NOTE(review): the original only stopped at an ASCII comma; the full-width
# comma is accepted too on the assumption that the reports use Chinese
# punctuation - confirm against real input.
BUDGET_REVENUE_PATTERN = re.compile(r"公共预算收入(?:完成)?(.+?)[,,]")

# (label printed next to the value, pattern that captures the value)
INDICATORS = (
    ("生产总值完成", GDP_PATTERN),
    ("一般公共预算收入完成", BUDGET_REVENUE_PATTERN),
)


def readPDF(pdfFile) -> str:
    """Return the text that pdfminer extracts from *pdfFile*.

    Args:
        pdfFile: Binary file object positioned at the start of a PDF document.
            The caller owns it; this function does not close it.

    Returns:
        The extracted text as produced by pdfminer's ``TextConverter``.
    """
    # Imported lazily so the text helpers below stay importable (and testable)
    # on machines where pdfminer is not installed.
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf

    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter even when extraction fails.
            device.close()
        return retstr.getvalue()


def extract_first(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ``None``."""
    match = pattern.search(text)
    return None if match is None else match.group(1)


def report_rows(file_name, text):
    """Build the rows that are printed for one document.

    Line breaks are removed from *text* before searching, so a figure that is
    wrapped across lines is still found.

    Args:
        file_name: Name shown at the start of every row.
        text: Text extracted from the document.

    Returns:
        A list of tuples, each holding the arguments for one ``print`` call.
        Reporting stops at the first indicator that cannot be found, which is
        recorded as a single ``"no data"`` row.
    """
    flat_text = text.replace("\n", "")
    rows = []
    for label, pattern in INDICATORS:
        value = extract_first(pattern, flat_text)
        if value is None:
            rows.append((file_name, "--", "no data"))
            break
        rows.append((file_name, "--", label, "--", value))
    return rows


def main(files_dir: str = FILES_DIR, *, save_text: bool = False) -> None:
    """Print the extracted figures for every PDF found in *files_dir*.

    Args:
        files_dir: Directory whose ``.pdf`` files are processed.
        save_text: When true, also write the extracted text (line breaks
            removed) next to each PDF as ``<name>.txt`` in UTF-8.
    """
    # List the directory directly instead of changing the working directory.
    files = os.listdir(files_dir)
    print(files)
    for file_name in files:
        if not file_name.lower().endswith(".pdf"):
            continue
        path = os.path.join(files_dir, file_name)
        with open(path, "rb") as pdf_file:
            text = readPDF(pdf_file)
        for row in report_rows(file_name, text):
            print(*row)
        if save_text:
            with open(path + ".txt", "w", encoding="utf-8") as fh:
                fh.write(text.replace("\n", ""))


if __name__ == '__main__':
    main()
【转自】:
仅做记录,供查。