Omar ID EL MOUMEN committed on
Commit
c2b2088
·
1 Parent(s): aea4c94

Update title extraction

Browse files
Files changed (1) hide show
  1. app.py +9 -2
app.py CHANGED
@@ -90,8 +90,15 @@ async def extract_text_pdf(id_doc: str):
90
  postprocess_text = remove_in_betweens(postprocess_text)
91
  postprocess_text = remove_punctuations(postprocess_text)
92
  regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
93
- titles = re.findall(regex_titles, postprocess_text, flags=re.MULTILINE) if len(doc.get_toc()) <= 0 else doc.get_toc()
94
- return {"message": titles, "pub_id": id_doc, "error": False}
 
 
 
 
 
 
 
95
  else:
96
  print("ID: " + id_doc)
97
  print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
 
90
  postprocess_text = remove_in_betweens(postprocess_text)
91
  postprocess_text = remove_punctuations(postprocess_text)
92
  regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
93
+ titles = doc.get_toc()
94
+ main_titles = []
95
+ if len(titles) <= 0:
96
+ main_titles = re.findall(regex_titles, postprocess_text, flags=re.MULTILINE)
97
+ else:
98
+ for title in titles:
99
+ if title[0] == 1:
100
+ main_titles.append(title[1])
101
+ return {"message": main_titles, "pub_id": id_doc, "error": False}
102
  else:
103
  print("ID: " + id_doc)
104
  print("URL: " + f"http://arxiv.org/pdf/{id_doc}")