File size: 3,693 Bytes
1f516b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from PyPDF2 import PdfReader, PdfWriter
import pdfminer.high_level
import pdfminer.layout
from operator import itemgetter
import os
import pdftotext
from chemrxnextractor import RxnExtractor

class ChemRxnExtractor(object):
    """Run a ``RxnExtractor`` model over the text of a PDF, page by page.

    The PDF is read with ``pdftotext``; each page is split into paragraphs
    (on blank lines) and each paragraph into sentences, which are then handed
    to ``RxnExtractor.get_reactions``.
    """

    def __init__(self, pdf, pn, model_dir, device):
        """Load the extraction model and, if a path is given, the PDF text.

        Args:
            pdf: Path of the PDF file to read. ``None`` or an empty string
                means "no document yet"; use ``set_pdf_file`` later.
            pn: Number of leading pages to process, or ``None`` for all pages.
            model_dir: Directory that contains the ``cre_models_v0.1`` folder.
            device: The string ``'cuda'`` enables CUDA; anything else disables it.
        """
        self.pdf_file = pdf
        self.pages = pn
        self.model_dir = os.path.join(model_dir, "cre_models_v0.1") # directory saving both prod and role models
        # Remembered so that set_model_dir() can rebuild the extractor with
        # the same device choice.
        self.use_cuda = (device == 'cuda')
        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=self.use_cuda)
        self.text_file = "info.txt"
        self.pdf_text = ""
        # Truthiness check instead of len(): also tolerates pdf=None.
        if self.pdf_file:
            with open(self.pdf_file, "rb") as f:
                self.pdf_text = pdftotext.PDF(f)

    def set_pdf_file(self, pdf):
        """Point the extractor at another PDF and load its text."""
        self.pdf_file = pdf
        with open(self.pdf_file, "rb") as f:
            self.pdf_text = pdftotext.PDF(f)

    def set_pages(self, pn):
        """Set how many leading pages to process (``None`` means all)."""
        self.pages = pn

    def set_model_dir(self, md):
        """Switch to the model stored in ``md`` and rebuild the extractor.

        Unlike the constructor, ``md`` is used as given (no
        ``cre_models_v0.1`` component is appended). The device choice made at
        construction time is preserved.
        """
        self.model_dir = md
        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=self.use_cuda)

    def set_text_file(self, tf):
        """Set the ``text_file`` attribute."""
        self.text_file = tf

    def extract_reactions_from_text(self):
        """Extract reactions from the configured pages of the loaded PDF.

        Returns:
            The list produced by ``extract_all`` for ``self.pages`` pages, or
            for every page when ``self.pages`` is ``None``.
        """
        if self.pages is None:
            return self.extract_all(len(self.pdf_text))
        return self.extract_all(self.pages)

    def extract_all(self, pages):
        """Extract reactions from the first ``pages`` pages.

        Args:
            pages: Number of leading pages to process, or ``None`` for all.

        Returns:
            A list with one ``{'page': int, 'reactions': list}`` dict per
            processed page, in page order.
        """
        results = []
        for data in self.get_paragraphs_from_pdf(pages):
            # Flatten the page's paragraphs into one list of sentences.
            sentences = [sent for paragraph in data['paragraphs'] for sent in paragraph]
            results.append(self.get_reactions(sentences, page_number=data['page']))
        return results

    def get_reactions(self, sents, page_number=None):
        """Run the model on ``sents`` and keep only entries with reactions.

        Args:
            sents: List of sentence strings.
            page_number: Value stored under ``'page'`` in the result.

        Returns:
            ``{'page': page_number, 'reactions': [...]}`` where the list holds
            the model's entries whose ``'reactions'`` field is non-empty.
        """
        rxns = self.rxn_extractor.get_reactions(sents)
        found = [r for r in rxns if len(r['reactions']) != 0]
        return {'page': page_number, 'reactions': found}

    def get_paragraphs_from_pdf(self, pages):
        """Split the first ``pages`` pages into paragraphs of sentences.

        Args:
            pages: Number of leading pages to process, or ``None`` for all.
                Values larger than the document are clamped to its length.

        Returns:
            A list with one dict per page:
            ``{'paragraphs': [[sentence, ...], ...], 'page': n}`` where ``n``
            is the 1-based page number and every sentence ends with ``"\\n"``.
        """
        total_pages = len(self.pdf_text)
        if pages is None:
            pages = total_pages
        # Never index past the end of the document.
        pages = min(pages, total_pages)

        result = []
        for page in range(pages):
            content = self.pdf_text[page]
            page_paragraphs = []
            for chunk in content.split("\n\n"):
                # Chunks carrying a form feed are skipped.
                if '\x0c' in chunk:
                    continue
                paragraph = []
                text = chunk.replace("\n", " ")
                # Re-join words that were hyphenated across a line break.
                text = text.replace("- ", "-")
                curind = 0  # start index of the sentence being collected
                i = 0
                while i < len(text):
                    if text[i] == '.':
                        # A period ends a sentence when it does not follow a
                        # digit (e.g. not the "." in "3.5"), or when it is
                        # followed by whitespace.
                        after_non_digit = i != 0 and not text[i-1].isdigit()
                        before_space = i != len(text) - 1 and (text[i+1] == " " or text[i+1] == "\n")
                        if after_non_digit or before_space:
                            paragraph.append(text[curind:i+1] + "\n")
                            # Skip whatever is glued to the period, up to the
                            # next space; the next sentence starts after it.
                            while i < len(text) and text[i] != " ":
                                i += 1
                            curind = i + 1
                    i += 1
                if curind != i:
                    # Trailing text that was not closed by a period.
                    if text[i - 1] == " ":
                        if i != 1:
                            i -= 1
                        else:
                            # The chunk is a single space: nothing to keep.
                            # Skip only this chunk; the rest of the page must
                            # still be processed.
                            continue
                    if text[i - 1] != '.':
                        paragraph.append(text[curind:i] + ".\n")
                    else:
                        paragraph.append(text[curind:i] + "\n")
                page_paragraphs.append(paragraph)

            result.append({
                'paragraphs': page_paragraphs,
                'page': page + 1
            })
        return result