Spaces:
Sleeping
Sleeping
File size: 3,693 Bytes
1f516b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
from PyPDF2 import PdfReader, PdfWriter
import pdfminer.high_level
import pdfminer.layout
from operator import itemgetter
import os
import pdftotext
from chemrxnextractor import RxnExtractor
class ChemRxnExtractor(object):
    """Extract chemical reactions from the text of a PDF, page by page.

    The PDF is read with ``pdftotext``; each page is split into paragraphs
    and sentences, and the sentences are handed to a ``RxnExtractor``.
    """

    def __init__(self, pdf, pn, model_dir, device):
        """Load the reaction models and, if a path is given, the PDF text.

        Args:
            pdf: Path of the PDF file. A falsy value (``""`` or ``None``)
                skips loading; ``pdf_text`` then stays an empty string.
            pn: Number of leading pages to process, or ``None`` for all.
            model_dir: Parent directory containing ``cre_models_v0.1``.
            device: ``'cuda'`` enables CUDA; any other value disables it.
        """
        self.pdf_file = pdf
        self.pages = pn
        # Directory holding both the product and the role models.
        self.model_dir = os.path.join(model_dir, "cre_models_v0.1")
        # Remembered so that set_model_dir() can rebuild the extractor with
        # the same device choice.
        self.use_cuda = (device == 'cuda')
        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=self.use_cuda)
        self.text_file = "info.txt"
        self.pdf_text = ""
        if self.pdf_file:
            self.pdf_text = self._load_pdf(self.pdf_file)

    @staticmethod
    def _load_pdf(path):
        """Open ``path`` in binary mode and return its ``pdftotext.PDF``."""
        with open(path, "rb") as f:
            return pdftotext.PDF(f)

    def set_pdf_file(self, pdf):
        """Switch to another PDF file and reload its text."""
        self.pdf_file = pdf
        self.pdf_text = self._load_pdf(self.pdf_file)

    def set_pages(self, pn):
        """Set the number of leading pages to process (``None`` = all)."""
        self.pages = pn

    def set_model_dir(self, md):
        """Point at another model directory and rebuild the extractor.

        Unlike ``__init__``, ``md`` is used as given; ``cre_models_v0.1`` is
        not appended. The device choice made at construction is kept.
        """
        self.model_dir = md
        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=self.use_cuda)

    def set_text_file(self, tf):
        """Set the ``text_file`` attribute."""
        self.text_file = tf

    def extract_reactions_from_text(self):
        """Extract reactions from the configured pages of the loaded PDF.

        Returns:
            The list produced by ``extract_all`` for ``self.pages``.
        """
        # get_paragraphs_from_pdf() already maps None to "every page".
        return self.extract_all(self.pages)

    def extract_all(self, pages):
        """Run reaction extraction on the first ``pages`` pages.

        Args:
            pages: Number of leading pages, or ``None`` for all pages.

        Returns:
            One ``get_reactions`` result dict per processed page.
        """
        results = []
        for data in self.get_paragraphs_from_pdf(pages):
            # Flatten the page's paragraphs into a single sentence list.
            sentences = [sent for paragraph in data['paragraphs'] for sent in paragraph]
            results.append(self.get_reactions(sentences, page_number=data['page']))
        return results

    def get_reactions(self, sents, page_number=None):
        """Run the extractor on ``sents`` and keep entries with reactions.

        Args:
            sents: Sentences to pass to ``rxn_extractor.get_reactions``.
            page_number: Page label stored in the result.

        Returns:
            ``{'page': page_number, 'reactions': [...]}`` where the list
            holds only the extractor outputs whose ``'reactions'`` entry is
            non-empty.
        """
        rxns = self.rxn_extractor.get_reactions(sents)
        return {
            'page': page_number,
            'reactions': [r for r in rxns if len(r['reactions']) != 0],
        }

    def get_paragraphs_from_pdf(self, pages):
        """Split the first ``pages`` pages into paragraphs of sentences.

        Paragraphs are separated by blank lines. Chunks containing a form
        feed and whitespace-only chunks are skipped. Inside a paragraph,
        newlines become spaces and ``"- "`` is collapsed to ``"-"`` so words
        hyphenated across a line break are rejoined.

        Args:
            pages: Number of leading pages, or ``None`` for all pages. A
                value beyond the document length is clamped to it.

        Returns:
            A list of ``{'paragraphs': [[sentence, ...], ...], 'page': n}``
            dicts with 1-based page numbers; every sentence ends in ``"\\n"``.
        """
        available = len(self.pdf_text)
        if pages is None:
            pages = available
        # Clamp so an over-large page count cannot index past the document.
        pages = min(pages, available)

        result = []
        for page_index in range(pages):
            content = self.pdf_text[page_index]
            paragraphs = []
            for chunk in content.split("\n\n"):
                if '\x0c' in chunk:
                    continue
                text = chunk.replace("\n", " ").replace("- ", "-")
                if text == " ":
                    # Skip only this blank paragraph; the remaining
                    # paragraphs on the page must still be processed.
                    continue
                paragraphs.append(self._split_sentences(text))
            result.append({
                'paragraphs': paragraphs,
                'page': page_index + 1,
            })
        return result

    @staticmethod
    def _split_sentences(text):
        """Split one paragraph into sentences, each terminated by ``"\\n"``.

        A ``'.'`` ends a sentence when the preceding character is not a
        digit, or when it is followed by a space or newline; this keeps
        decimals such as ``3.5`` intact. Trailing text without a final
        period gets one appended.
        """
        sentences = []
        length = len(text)
        start = 0
        i = 0
        while i < length:
            if text[i] == '.':
                prev_not_digit = i != 0 and not text[i - 1].isdigit()
                next_is_space = i != length - 1 and text[i + 1] in (" ", "\n")
                if prev_not_digit or next_is_space:
                    sentences.append(text[start:i + 1] + "\n")
                    # Jump to the next space so characters glued to the
                    # period are not carried into the next sentence.
                    while i < length and text[i] != " ":
                        i += 1
                    start = i + 1
            i += 1

        if start != i:
            # Unterminated remainder; here i == len(text).
            if text[i - 1] == " ":
                if i == 1:
                    # The whole paragraph is a single space: no sentence.
                    return sentences
                i -= 1  # drop the trailing space
            if text[i - 1] != '.':
                sentences.append(text[start:i] + ".\n")
            else:
                sentences.append(text[start:i] + "\n")
        return sentences
|