|
|
发表于 2018-11-13 14:38:15
|
显示全部楼层
- from pdfminer.pdfinterp import PDFResourceManager,process_pdf
- from pdfminer.converter import TextConverter
- from pdfminer.layout import LAParams
- from io import StringIO
- from io import open
def readPdf(pdf_file):
    """Extract the text of a PDF and return it as a single string.

    Args:
        pdf_file: File object for the PDF, handed to ``process_pdf`` as
            ``fp`` (the script below opens it in ``'rb'`` mode). The
            caller stays responsible for closing it.

    Returns:
        The text that ``TextConverter`` wrote into the in-memory buffer
        while the document was processed.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The with-block and try/finally release the buffer and the converter
    # even when process_pdf raises part-way through the document.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)
        finally:
            device.close()
        # Read the buffer before the with-block closes it.
        return retstr.getvalue()
if __name__ == '__main__':
    import re
    from collections import Counter

    # Open the PDF in a with-block so the handle is closed even if text
    # extraction fails.
    with open(r'c:\temp\ilovethisgame.pdf', 'rb') as pdffile:
        s = readPdf(pdffile)

    # \w+ matches runs of word characters.
    # NOTE: IGNORECASE does not fold case in the matches, so "The" and
    # "the" are still tallied as separate words.
    pattern = re.compile(r'\w+', re.IGNORECASE)
    words = pattern.findall(s)
    words_count = Counter(words)
    print(words_count)
复制代码
把代码中我的文件路径 c:\temp\ilovethisgame.pdf 换成你自己的 PDF 文件路径试试看。 |
|