Spaces:
Sleeping
Sleeping
Update chemietoolkit/interface.py
Browse files- chemietoolkit/interface.py +2 -101
chemietoolkit/interface.py
CHANGED
@@ -8,7 +8,6 @@ from huggingface_hub import hf_hub_download, snapshot_download
|
|
8 |
from molscribe import MolScribe
|
9 |
from rxnscribe import RxnScribe, MolDetect
|
10 |
from chemiener import ChemNER
|
11 |
-
from .chemrxnextractor import ChemRxnExtractor
|
12 |
from .tableextractor import TableExtractor
|
13 |
from .utils import *
|
14 |
|
@@ -23,7 +22,6 @@ class ChemIEToolkit:
|
|
23 |
self._rxnscribe = None
|
24 |
self._pdfparser = None
|
25 |
self._moldet = None
|
26 |
-
self._chemrxnextractor = None
|
27 |
self._chemner = None
|
28 |
self._coref = None
|
29 |
|
@@ -116,22 +114,8 @@ class ChemIEToolkit:
|
|
116 |
self._coref = MolDetect(ckpt_path, device=self.device, coref=True)
|
117 |
|
118 |
|
119 |
-
@property
|
120 |
-
def chemrxnextractor(self):
|
121 |
-
if self._chemrxnextractor is None:
|
122 |
-
self.init_chemrxnextractor()
|
123 |
-
return self._chemrxnextractor
|
124 |
|
125 |
-
|
126 |
-
def init_chemrxnextractor(self, ckpt_path=None):
|
127 |
-
"""
|
128 |
-
Set model to custom checkpoint
|
129 |
-
Parameters:
|
130 |
-
ckpt_path: path to checkpoint to use, if None then will use default
|
131 |
-
"""
|
132 |
-
if ckpt_path is None:
|
133 |
-
ckpt_path = snapshot_download(repo_id="amberwang/chemrxnextractor-training-modules")
|
134 |
-
self._chemrxnextractor = ChemRxnExtractor("", None, ckpt_path, self.device.type)
|
135 |
|
136 |
|
137 |
@property
|
@@ -505,85 +489,7 @@ class ChemIEToolkit:
|
|
505 |
results.append(data)
|
506 |
return results
|
507 |
|
508 |
-
def extract_molecules_from_text_in_pdf(self, pdf, batch_size=16, num_pages=None):
|
509 |
-
"""
|
510 |
-
Get molecules in text of given pdf
|
511 |
-
|
512 |
-
Parameters:
|
513 |
-
pdf: path to pdf, or byte file
|
514 |
-
batch_size: batch size for inference in all models
|
515 |
-
num_pages: process only first `num_pages` pages, if `None` then process all
|
516 |
-
Returns:
|
517 |
-
list of sentences and found molecules in the following format
|
518 |
-
[
|
519 |
-
{
|
520 |
-
'molecules': [
|
521 |
-
{ # first paragraph
|
522 |
-
'text': str,
|
523 |
-
'labels': [
|
524 |
-
(str, int, int), # tuple of label, range start (inclusive), range end (exclusive)
|
525 |
-
# more labels
|
526 |
-
]
|
527 |
-
},
|
528 |
-
# more paragraphs
|
529 |
-
]
|
530 |
-
'page': int
|
531 |
-
},
|
532 |
-
# more pages
|
533 |
-
]
|
534 |
-
"""
|
535 |
-
self.chemrxnextractor.set_pdf_file(pdf)
|
536 |
-
self.chemrxnextractor.set_pages(num_pages)
|
537 |
-
text = self.chemrxnextractor.get_paragraphs_from_pdf(num_pages)
|
538 |
-
result = []
|
539 |
-
for data in text:
|
540 |
-
model_inp = []
|
541 |
-
for paragraph in data['paragraphs']:
|
542 |
-
model_inp.append(' '.join(paragraph).replace('\n', ''))
|
543 |
-
output = self.chemner.predict_strings(model_inp, batch_size=batch_size)
|
544 |
-
to_add = {
|
545 |
-
'molecules': [{
|
546 |
-
'text': t,
|
547 |
-
'labels': labels,
|
548 |
-
} for t, labels in zip(model_inp, output)],
|
549 |
-
'page': data['page']
|
550 |
-
}
|
551 |
-
result.append(to_add)
|
552 |
-
return result
|
553 |
-
|
554 |
|
555 |
-
def extract_reactions_from_text_in_pdf(self, pdf, num_pages=None):
|
556 |
-
"""
|
557 |
-
Get reaction information from text in pdf
|
558 |
-
Parameters:
|
559 |
-
pdf: path to pdf
|
560 |
-
num_pages: process only first `num_pages` pages, if `None` then process all
|
561 |
-
Returns:
|
562 |
-
list of pages and corresponding reaction info in the following format
|
563 |
-
[
|
564 |
-
{
|
565 |
-
'page': page number
|
566 |
-
'reactions': [
|
567 |
-
{
|
568 |
-
'tokens': list of words in relevant sentence,
|
569 |
-
'reactions' : [
|
570 |
-
{
|
571 |
-
# key, value pairs where key is the label and value is a tuple
|
572 |
-
# or list of tuples of the form (tokens, start index, end index)
|
573 |
-
# where indices are for the corresponding token list and start and end are inclusive
|
574 |
-
}
|
575 |
-
# more reactions
|
576 |
-
]
|
577 |
-
}
|
578 |
-
# more reactions in other sentences
|
579 |
-
]
|
580 |
-
},
|
581 |
-
# more pages
|
582 |
-
]
|
583 |
-
"""
|
584 |
-
self.chemrxnextractor.set_pdf_file(pdf)
|
585 |
-
self.chemrxnextractor.set_pages(num_pages)
|
586 |
-
return self.chemrxnextractor.extract_reactions_from_text()
|
587 |
|
588 |
def extract_reactions_from_text_in_pdf_combined(self, pdf, num_pages=None):
|
589 |
"""
|
@@ -735,15 +641,10 @@ class ChemIEToolkit:
|
|
735 |
images = [figure['figure']['image'] for figure in figures]
|
736 |
results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=True, ocr=True)
|
737 |
table_expanded_results = process_tables(figures, results, self.molscribe, batch_size=batch_size)
|
738 |
-
text_results = self.extract_reactions_from_text_in_pdf(pdf, num_pages=num_pages)
|
739 |
results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
|
740 |
figure_results = replace_rgroups_in_figure(figures, table_expanded_results, results_coref, self.molscribe, batch_size=batch_size)
|
741 |
table_expanded_results = expand_reactions_with_backout(figure_results, results_coref, self.molscribe)
|
742 |
-
|
743 |
return {
|
744 |
'figures': table_expanded_results,
|
745 |
-
'text': coref_expanded_results,
|
746 |
}
|
747 |
-
|
748 |
-
if __name__=="__main__":
|
749 |
-
model = OpenChemIE()
|
|
|
8 |
from molscribe import MolScribe
|
9 |
from rxnscribe import RxnScribe, MolDetect
|
10 |
from chemiener import ChemNER
|
|
|
11 |
from .tableextractor import TableExtractor
|
12 |
from .utils import *
|
13 |
|
|
|
22 |
self._rxnscribe = None
|
23 |
self._pdfparser = None
|
24 |
self._moldet = None
|
|
|
25 |
self._chemner = None
|
26 |
self._coref = None
|
27 |
|
|
|
114 |
self._coref = MolDetect(ckpt_path, device=self.device, coref=True)
|
115 |
|
116 |
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
|
121 |
@property
|
|
|
489 |
results.append(data)
|
490 |
return results
|
491 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
493 |
|
494 |
def extract_reactions_from_text_in_pdf_combined(self, pdf, num_pages=None):
|
495 |
"""
|
|
|
641 |
images = [figure['figure']['image'] for figure in figures]
|
642 |
results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=True, ocr=True)
|
643 |
table_expanded_results = process_tables(figures, results, self.molscribe, batch_size=batch_size)
|
|
|
644 |
results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
|
645 |
figure_results = replace_rgroups_in_figure(figures, table_expanded_results, results_coref, self.molscribe, batch_size=batch_size)
|
646 |
table_expanded_results = expand_reactions_with_backout(figure_results, results_coref, self.molscribe)
|
647 |
+
|
648 |
return {
|
649 |
'figures': table_expanded_results,
|
|
|
650 |
}
|
|
|
|
|
|