Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
ca2c7e8
1
Parent(s):
9513d18
Fix extract
Browse files
app.py
CHANGED
@@ -45,7 +45,9 @@ class Query(BaseModel):
|
|
45 |
keyword: str
|
46 |
limit: int
|
47 |
|
48 |
-
|
|
|
|
|
49 |
@app.post("/search")
|
50 |
async def get_articles(query: Query):
|
51 |
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
|
@@ -72,8 +74,8 @@ async def get_articles(query: Query):
|
|
72 |
return {"error": True, "message": str(e)}
|
73 |
|
74 |
@app.post("/extract")
|
75 |
-
async def extract_text_pdf(
|
76 |
-
pdf_req = requests.get(f"http://arxiv.org/pdf/{doc_id}", verify=False)
|
77 |
if pdf_req.status_code == 200:
|
78 |
pdf_data = BytesIO(pdf_req.content)
|
79 |
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
@@ -106,10 +108,10 @@ async def extract_text_pdf(doc_id: str):
|
|
106 |
for title in titles:
|
107 |
if title[0] == 1:
|
108 |
main_titles.append(title[1])
|
109 |
-
return {"pub_id": doc_id, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
|
110 |
else:
|
111 |
-
print("ID: " + doc_id)
|
112 |
-
print("URL: " + f"http://arxiv.org/pdf/{doc_id}")
|
113 |
print("Status code: " + str(pdf_req.status_code))
|
114 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
115 |
|
|
|
45 |
keyword: str
|
46 |
limit: int
|
47 |
|
48 |
+
class DocumentID(BaseModel):
|
49 |
+
doc_id: str
|
50 |
+
|
51 |
@app.post("/search")
|
52 |
async def get_articles(query: Query):
|
53 |
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
|
|
|
74 |
return {"error": True, "message": str(e)}
|
75 |
|
76 |
@app.post("/extract")
|
77 |
+
async def extract_text_pdf(document: DocumentID):
|
78 |
+
pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
|
79 |
if pdf_req.status_code == 200:
|
80 |
pdf_data = BytesIO(pdf_req.content)
|
81 |
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
|
|
108 |
for title in titles:
|
109 |
if title[0] == 1:
|
110 |
main_titles.append(title[1])
|
111 |
+
return {"pub_id": document.doc_id, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": document.doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
|
112 |
else:
|
113 |
+
print("ID: " + document.doc_id)
|
114 |
+
print("URL: " + f"http://arxiv.org/pdf/{document.doc_id}")
|
115 |
print("Status code: " + str(pdf_req.status_code))
|
116 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
117 |
|