Spaces:
Sleeping
Sleeping
Update chemietoolkit/chemrxnextractor.py
Browse files
chemietoolkit/chemrxnextractor.py
CHANGED
@@ -1,107 +0,0 @@
|
|
1 |
-
from PyPDF2 import PdfReader, PdfWriter
|
2 |
-
import pdfminer.high_level
|
3 |
-
import pdfminer.layout
|
4 |
-
from operator import itemgetter
|
5 |
-
import os
|
6 |
-
import pdftotext
|
7 |
-
from chemrxnextractor import RxnExtractor
|
8 |
-
|
9 |
-
class ChemRxnExtractor(object):
    """Extract chemical reactions from the text layer of a PDF.

    The PDF is read with ``pdftotext``; each page is split into paragraphs
    and sentences, and the sentences are handed to a ``RxnExtractor``
    model.  Results are reported per page.
    """

    def __init__(self, pdf, pn, model_dir, device):
        """Load the reaction model and, if a path is given, the PDF text.

        Args:
            pdf: Path to the PDF file.  An empty value skips loading, leaving
                ``pdf_text`` empty until ``set_pdf_file`` is called.
            pn: Number of leading pages to process, or ``None`` for all.
            model_dir: Directory that contains the ``cre_models_v0.1``
                sub-directory (which holds both the prod and role models).
            device: ``'cuda'`` enables CUDA; any other value disables it.
        """
        self.pdf_file = pdf
        self.pages = pn
        # directory saving both prod and role models
        self.model_dir = os.path.join(model_dir, "cre_models_v0.1")
        # Remembered so that set_model_dir() can rebuild the extractor on
        # the same device instead of falling back to the library default.
        self.use_cuda = (device == 'cuda')
        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=self.use_cuda)
        self.text_file = "info.txt"
        self.pdf_text = ""
        if self.pdf_file:
            with open(self.pdf_file, "rb") as f:
                self.pdf_text = pdftotext.PDF(f)

    def set_pdf_file(self, pdf):
        """Switch to another PDF and reload its text."""
        self.pdf_file = pdf
        with open(self.pdf_file, "rb") as f:
            self.pdf_text = pdftotext.PDF(f)

    def set_pages(self, pn):
        """Set how many leading pages to process (``None`` means all)."""
        self.pages = pn

    def set_model_dir(self, md):
        """Point at a new model directory and reload the extractor.

        Unlike ``__init__``, ``md`` is used as given (no ``cre_models_v0.1``
        suffix is appended).  The CUDA choice made at construction is kept.
        """
        self.model_dir = md
        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=self.use_cuda)

    def set_text_file(self, tf):
        """Set the ``text_file`` attribute."""
        self.text_file = tf

    def extract_reactions_from_text(self):
        """Run extraction on the configured pages (all pages if unset).

        Returns:
            A list with one ``{'page': ..., 'reactions': ...}`` dict per page.
        """
        if self.pages is None:
            return self.extract_all(len(self.pdf_text))
        return self.extract_all(self.pages)

    def extract_all(self, pages):
        """Extract reactions from the first ``pages`` pages.

        Args:
            pages: Number of leading pages to process, or ``None`` for all.

        Returns:
            A list with one ``{'page': ..., 'reactions': ...}`` dict per page.
        """
        results = []
        for page_data in self.get_paragraphs_from_pdf(pages):
            # The model receives one flat list of sentences per page.
            sentences = [
                sent
                for paragraph in page_data['paragraphs']
                for sent in paragraph
            ]
            results.append(
                self.get_reactions(sentences, page_number=page_data['page'])
            )
        return results

    def get_reactions(self, sents, page_number=None):
        """Run the model on ``sents`` and keep entries that found reactions.

        Args:
            sents: List of sentence strings.
            page_number: Page label copied into the result.

        Returns:
            ``{'page': page_number, 'reactions': [...]}`` where the list holds
            only those model outputs whose ``'reactions'`` entry is non-empty.
        """
        rxns = self.rxn_extractor.get_reactions(sents)
        found = [r for r in rxns if len(r['reactions']) != 0]
        return {'page': page_number, 'reactions': found}

    def get_paragraphs_from_pdf(self, pages):
        """Split the first ``pages`` pages into paragraphs of sentences.

        Paragraphs are separated by blank lines (``"\\n\\n"``).  Chunks that
        contain a form feed are skipped.  Within a paragraph, line breaks
        become spaces and ``"- "`` is collapsed to ``"-"`` to re-join words
        hyphenated across lines.

        Args:
            pages: Number of leading pages to process, or ``None`` for all.
                Values beyond the document length are clamped.

        Returns:
            A list of ``{'paragraphs': [[sentence, ...], ...], 'page': n}``
            dicts, with ``n`` counting from 1.
        """
        available = len(self.pdf_text)
        page_count = available if pages is None else min(pages, available)

        result = []
        for page_number, index in enumerate(range(page_count), start=1):
            content = self.pdf_text[index]
            paragraphs = []
            for chunk in content.split("\n\n"):
                if '\x0c' in chunk:
                    continue
                text = chunk.replace("\n", " ").replace("- ", "-")
                paragraphs.append(self._split_sentences(text))
            result.append({
                'paragraphs': paragraphs,
                'page': page_number
            })
        return result

    @staticmethod
    def _split_sentences(text):
        """Split one paragraph into sentences, each ending in ``".\\n"``.

        A period ends a sentence when it follows a non-digit character or is
        followed by whitespace, so decimals such as ``3.5`` stay intact.
        After a split, everything up to the next space is skipped.  Trailing
        text without a final period gets one appended; whitespace-only
        remainders are dropped.
        """
        sentences = []
        length = len(text)
        start = 0
        i = 0
        while i < length:
            if text[i] == '.':
                after_non_digit = i != 0 and not text[i - 1].isdigit()
                before_space = i != length - 1 and text[i + 1] in (" ", "\n")
                if after_non_digit or before_space:
                    sentences.append(text[start:i + 1] + "\n")
                    # Advance to the space that follows the period; the next
                    # sentence starts just after it.
                    while i < length and text[i] != " ":
                        i += 1
                    start = i + 1
            i += 1

        # Whatever is left after the last split is the final sentence.
        tail = text[start:].rstrip()
        if tail:
            sentences.append(tail + ("\n" if tail.endswith(".") else ".\n"))
        return sentences
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|