CYF200127 committed on
Commit
565cbbc
·
verified ·
1 Parent(s): 5d415f0

Update chemietoolkit/interface.py

Browse files
Files changed (1) hide show
  1. chemietoolkit/interface.py +2 -101
chemietoolkit/interface.py CHANGED
@@ -8,7 +8,6 @@ from huggingface_hub import hf_hub_download, snapshot_download
8
  from molscribe import MolScribe
9
  from rxnscribe import RxnScribe, MolDetect
10
  from chemiener import ChemNER
11
- from .chemrxnextractor import ChemRxnExtractor
12
  from .tableextractor import TableExtractor
13
  from .utils import *
14
 
@@ -23,7 +22,6 @@ class ChemIEToolkit:
23
  self._rxnscribe = None
24
  self._pdfparser = None
25
  self._moldet = None
26
- self._chemrxnextractor = None
27
  self._chemner = None
28
  self._coref = None
29
 
@@ -116,22 +114,8 @@ class ChemIEToolkit:
116
  self._coref = MolDetect(ckpt_path, device=self.device, coref=True)
117
 
118
 
119
- @property
120
- def chemrxnextractor(self):
121
- if self._chemrxnextractor is None:
122
- self.init_chemrxnextractor()
123
- return self._chemrxnextractor
124
 
125
- @lru_cache(maxsize=None)
126
- def init_chemrxnextractor(self, ckpt_path=None):
127
- """
128
- Set model to custom checkpoint
129
- Parameters:
130
- ckpt_path: path to checkpoint to use, if None then will use default
131
- """
132
- if ckpt_path is None:
133
- ckpt_path = snapshot_download(repo_id="amberwang/chemrxnextractor-training-modules")
134
- self._chemrxnextractor = ChemRxnExtractor("", None, ckpt_path, self.device.type)
135
 
136
 
137
  @property
@@ -505,85 +489,7 @@ class ChemIEToolkit:
505
  results.append(data)
506
  return results
507
 
508
- def extract_molecules_from_text_in_pdf(self, pdf, batch_size=16, num_pages=None):
509
- """
510
- Get molecules in text of given pdf
511
-
512
- Parameters:
513
- pdf: path to pdf, or byte file
514
- batch_size: batch size for inference in all models
515
- num_pages: process only first `num_pages` pages, if `None` then process all
516
- Returns:
517
- list of sentences and found molecules in the following format
518
- [
519
- {
520
- 'molecules': [
521
- { # first paragraph
522
- 'text': str,
523
- 'labels': [
524
- (str, int, int), # tuple of label, range start (inclusive), range end (exclusive)
525
- # more labels
526
- ]
527
- },
528
- # more paragraphs
529
- ]
530
- 'page': int
531
- },
532
- # more pages
533
- ]
534
- """
535
- self.chemrxnextractor.set_pdf_file(pdf)
536
- self.chemrxnextractor.set_pages(num_pages)
537
- text = self.chemrxnextractor.get_paragraphs_from_pdf(num_pages)
538
- result = []
539
- for data in text:
540
- model_inp = []
541
- for paragraph in data['paragraphs']:
542
- model_inp.append(' '.join(paragraph).replace('\n', ''))
543
- output = self.chemner.predict_strings(model_inp, batch_size=batch_size)
544
- to_add = {
545
- 'molecules': [{
546
- 'text': t,
547
- 'labels': labels,
548
- } for t, labels in zip(model_inp, output)],
549
- 'page': data['page']
550
- }
551
- result.append(to_add)
552
- return result
553
-
554
 
555
- def extract_reactions_from_text_in_pdf(self, pdf, num_pages=None):
556
- """
557
- Get reaction information from text in pdf
558
- Parameters:
559
- pdf: path to pdf
560
- num_pages: process only first `num_pages` pages, if `None` then process all
561
- Returns:
562
- list of pages and corresponding reaction info in the following format
563
- [
564
- {
565
- 'page': page number
566
- 'reactions': [
567
- {
568
- 'tokens': list of words in relevant sentence,
569
- 'reactions' : [
570
- {
571
- # key, value pairs where key is the label and value is a tuple
572
- # or list of tuples of the form (tokens, start index, end index)
573
- # where indices are for the corresponding token list and start and end are inclusive
574
- }
575
- # more reactions
576
- ]
577
- }
578
- # more reactions in other sentences
579
- ]
580
- },
581
- # more pages
582
- ]
583
- """
584
- self.chemrxnextractor.set_pdf_file(pdf)
585
- self.chemrxnextractor.set_pages(num_pages)
586
- return self.chemrxnextractor.extract_reactions_from_text()
587
 
588
  def extract_reactions_from_text_in_pdf_combined(self, pdf, num_pages=None):
589
  """
@@ -735,15 +641,10 @@ class ChemIEToolkit:
735
  images = [figure['figure']['image'] for figure in figures]
736
  results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=True, ocr=True)
737
  table_expanded_results = process_tables(figures, results, self.molscribe, batch_size=batch_size)
738
- text_results = self.extract_reactions_from_text_in_pdf(pdf, num_pages=num_pages)
739
  results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
740
  figure_results = replace_rgroups_in_figure(figures, table_expanded_results, results_coref, self.molscribe, batch_size=batch_size)
741
  table_expanded_results = expand_reactions_with_backout(figure_results, results_coref, self.molscribe)
742
- coref_expanded_results = associate_corefs(text_results, results_coref)
743
  return {
744
  'figures': table_expanded_results,
745
- 'text': coref_expanded_results,
746
  }
747
-
748
- if __name__=="__main__":
749
- model = OpenChemIE()
 
8
  from molscribe import MolScribe
9
  from rxnscribe import RxnScribe, MolDetect
10
  from chemiener import ChemNER
 
11
  from .tableextractor import TableExtractor
12
  from .utils import *
13
 
 
22
  self._rxnscribe = None
23
  self._pdfparser = None
24
  self._moldet = None
 
25
  self._chemner = None
26
  self._coref = None
27
 
 
114
  self._coref = MolDetect(ckpt_path, device=self.device, coref=True)
115
 
116
 
 
 
 
 
 
117
 
118
+
 
 
 
 
 
 
 
 
 
119
 
120
 
121
  @property
 
489
  results.append(data)
490
  return results
491
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
 
494
  def extract_reactions_from_text_in_pdf_combined(self, pdf, num_pages=None):
495
  """
 
641
  images = [figure['figure']['image'] for figure in figures]
642
  results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=True, ocr=True)
643
  table_expanded_results = process_tables(figures, results, self.molscribe, batch_size=batch_size)
 
644
  results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
645
  figure_results = replace_rgroups_in_figure(figures, table_expanded_results, results_coref, self.molscribe, batch_size=batch_size)
646
  table_expanded_results = expand_reactions_with_backout(figure_results, results_coref, self.molscribe)
647
+
648
  return {
649
  'figures': table_expanded_results,
 
650
  }