CYF200127 committed on
Commit
5d415f0
·
verified ·
1 Parent(s): 9b764c8

Update chemietoolkit/chemrxnextractor.py

Browse files
Files changed (1) hide show
  1. chemietoolkit/chemrxnextractor.py +0 -107
chemietoolkit/chemrxnextractor.py CHANGED
@@ -1,107 +0,0 @@
1
- from PyPDF2 import PdfReader, PdfWriter
2
- import pdfminer.high_level
3
- import pdfminer.layout
4
- from operator import itemgetter
5
- import os
6
- import pdftotext
7
- from chemrxnextractor import RxnExtractor
8
-
9
class ChemRxnExtractor(object):
    """Extract reactions from the text of a PDF using ``RxnExtractor``.

    The PDF is read with ``pdftotext``; each page is split into paragraphs
    and sentences, and the sentences are passed to the reaction extractor.
    """

    def __init__(self, pdf, pn, model_dir, device):
        """Load the extraction model and, if a path is given, the PDF text.

        Args:
            pdf: Path to the PDF file. An empty (or ``None``) value defers
                loading until ``set_pdf_file`` is called.
            pn: Number of leading pages to process, or ``None`` for all.
            model_dir: Directory that contains the ``cre_models_v0.1`` folder.
            device: The model is asked to use CUDA only when this equals
                ``'cuda'``.
        """
        self.pdf_file = pdf
        self.pages = pn
        # directory saving both prod and role models
        self.model_dir = os.path.join(model_dir, "cre_models_v0.1")
        # Remembered so that set_model_dir() can rebuild the extractor with
        # the same device choice.
        self.use_cuda = (device == 'cuda')
        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=self.use_cuda)
        self.text_file = "info.txt"
        self.pdf_text = ""
        # Truthiness check also tolerates pdf=None instead of raising on len().
        if self.pdf_file:
            with open(self.pdf_file, "rb") as f:
                self.pdf_text = pdftotext.PDF(f)

    def set_pdf_file(self, pdf):
        """Switch to another PDF file and re-read its text."""
        self.pdf_file = pdf
        with open(self.pdf_file, "rb") as f:
            self.pdf_text = pdftotext.PDF(f)

    def set_pages(self, pn):
        """Set the number of leading pages to process (``None`` for all)."""
        self.pages = pn

    def set_model_dir(self, md):
        """Point at another model directory and rebuild the extractor.

        Unlike ``__init__``, ``md`` is used as given (no sub-folder is
        appended). The device choice made at construction time is preserved.
        """
        self.model_dir = md
        use_cuda = getattr(self, "use_cuda", False)
        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=use_cuda)

    def set_text_file(self, tf):
        """Set the name stored in ``self.text_file``."""
        self.text_file = tf

    def extract_reactions_from_text(self):
        """Run extraction over ``self.pages`` pages (all pages when ``None``).

        Returns:
            A list with one ``{'page': ..., 'reactions': ...}`` dict per page.
        """
        if self.pages is None:
            return self.extract_all(len(self.pdf_text))
        return self.extract_all(self.pages)

    def extract_all(self, pages):
        """Extract reactions from the first ``pages`` pages.

        Args:
            pages: Number of leading pages to process, or ``None`` for all.

        Returns:
            A list of per-page dicts as produced by ``get_reactions``.
        """
        results = []
        for data in self.get_paragraphs_from_pdf(pages):
            # Flatten the page's paragraphs into one list of sentences.
            sentences = [
                sent for paragraph in data['paragraphs'] for sent in paragraph
            ]
            results.append(
                self.get_reactions(sentences, page_number=data['page'])
            )
        return results

    def get_reactions(self, sents, page_number=None):
        """Run the extractor on ``sents`` and keep entries that found reactions.

        Args:
            sents: Sentences to pass to ``self.rxn_extractor.get_reactions``.
            page_number: Value stored under the ``'page'`` key of the result.

        Returns:
            ``{'page': page_number, 'reactions': [...]}`` where the list holds
            only extractor results whose own ``'reactions'`` entry is
            non-empty.
        """
        rxns = self.rxn_extractor.get_reactions(sents)
        found = [r for r in rxns if len(r['reactions']) != 0]
        return {'page': page_number, 'reactions': found}

    @staticmethod
    def _split_sentences(text):
        """Split one paragraph (already on a single line) into sentences.

        A period ends a sentence when it is not preceded by a digit or when
        it is followed by whitespace, so decimals such as ``5.5`` stay
        intact. Every returned sentence ends with ``"\\n"``; a final
        fragment without a period gets one appended.
        """
        sentences = []
        length = len(text)
        start = 0
        i = 0
        while i < length:
            if text[i] == '.':
                after_non_digit = i != 0 and not text[i - 1].isdigit()
                before_space = i != length - 1 and text[i + 1] in (" ", "\n")
                if after_non_digit or before_space:
                    sentences.append(text[start:i + 1] + "\n")
                    # Jump to the next space; characters between the period
                    # and that space are not included in any sentence.
                    while i < length and text[i] != " ":
                        i += 1
                    start = i + 1
            i += 1

        # Whatever remains after the last split is the final sentence.
        # Stripping trailing whitespace and ignoring an empty remainder
        # avoids emitting bogus sentences such as ".\n" for blank tails.
        tail = text[start:].rstrip()
        if tail:
            if not tail.endswith('.'):
                tail += "."
            sentences.append(tail + "\n")
        return sentences

    def get_paragraphs_from_pdf(self, pages):
        """Split the first ``pages`` pages of the PDF text into sentences.

        Paragraphs are separated by blank lines. Paragraphs containing a
        form-feed character are skipped. Within a paragraph, newlines become
        spaces and ``"- "`` is collapsed to ``"-"`` to re-join words
        hyphenated across lines.

        Args:
            pages: Number of leading pages to process, or ``None`` for all.
                Values larger than the document are clamped to its length.

        Returns:
            A list of ``{'paragraphs': [[sentence, ...], ...], 'page': n}``
            dicts, with ``n`` starting at 1.
        """
        total = len(self.pdf_text)
        # Clamp so that asking for more pages than exist does not raise.
        page_count = total if pages is None else min(pages, total)

        result = []
        for index in range(page_count):
            content = self.pdf_text[index]
            paragraphs = []
            for chunk in content.split("\n\n"):
                if '\x0c' in chunk:
                    continue
                text = chunk.replace("\n", " ").replace("- ", "-")
                # Blank paragraphs yield an empty list rather than aborting
                # the rest of the page.
                paragraphs.append(self._split_sentences(text))
            result.append({
                'paragraphs': paragraphs,
                'page': index + 1,
            })
        return result