File size: 729 Bytes
6c945f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# -*-coding:utf-8 -*-
import PyPDF2
from build_index.parser.base import BaseParser


class PDFParser(BaseParser):
    """Parser that extracts the plain text of a PDF file using PyPDF2."""

    def header_remove(self):
        """Remove the page header of a research report.

        Placeholder: not implemented yet, currently a no-op.
        """
        pass

    def footnote_remove(self):
        """Remove the page footer of a research report.

        Placeholder: not implemented yet, currently a no-op.
        """
        pass

    def parse_file(self, file):
        """Extract the text of every page of a PDF file.

        Args:
            file: Path of the PDF to read; it is opened in binary mode.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is the extracted text of
            all pages, in page order, joined with newlines. ``metadata`` is
            a dict with the keys ``'source'`` (the ``file`` argument as
            given) and ``'pages'`` (the number of pages read).
        """
        with open(file, "rb") as fp:
            # Extract inside the ``with`` block, while the underlying
            # stream is still open.
            page_texts = [
                page.extract_text() for page in PyPDF2.PdfReader(fp).pages
            ]
        text = '\n'.join(page_texts)
        # One entry was collected per page, so the list length is the
        # page count.
        metadata = {'source': file, 'pages': len(page_texts)}
        return text, metadata