Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
c2b2088
1
Parent(s):
aea4c94
Update title extraction
Browse files
app.py
CHANGED
@@ -90,8 +90,15 @@ async def extract_text_pdf(id_doc: str):
|
|
90 |
postprocess_text = remove_in_betweens(postprocess_text)
|
91 |
postprocess_text = remove_punctuations(postprocess_text)
|
92 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
93 |
-
titles =
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
else:
|
96 |
print("ID: " + id_doc)
|
97 |
print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
|
|
|
90 |
postprocess_text = remove_in_betweens(postprocess_text)
|
91 |
postprocess_text = remove_punctuations(postprocess_text)
|
92 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
93 |
+
titles = doc.get_toc()
|
94 |
+
main_titles = []
|
95 |
+
if len(titles) <= 0:
|
96 |
+
main_titles = re.findall(regex_titles, postprocess_text, flags=re.MULTILINE)
|
97 |
+
else:
|
98 |
+
for title in titles:
|
99 |
+
if title[0] == 1:
|
100 |
+
main_titles.append(title[1])
|
101 |
+
return {"message": main_titles, "pub_id": id_doc, "error": False}
|
102 |
else:
|
103 |
print("ID: " + id_doc)
|
104 |
print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
|