Spaces:

OrganizedProgrammers
/

arXiv

Sleeping

Omar ID EL MOUMEN committed on Mar 26

Commit

ca2c7e8

1 Parent(s): 9513d18

Fix extract

Files changed (1) hide show

app.py CHANGED Viewed

@@ -45,7 +45,9 @@ class Query(BaseModel):
     keyword: str
     limit: int
-# Put all GET into POST
 @app.post("/search")
 async def get_articles(query: Query):
     XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
@@ -72,8 +74,8 @@ async def get_articles(query: Query):
         return {"error": True, "message": str(e)}
 @app.post("/extract")
-async def extract_text_pdf(doc_id: str):
-    pdf_req = requests.get(f"http://arxiv.org/pdf/{doc_id}", verify=False)
     if pdf_req.status_code == 200:
         pdf_data = BytesIO(pdf_req.content)
         doc = fitz.open(stream=pdf_data, filetype="pdf")
@@ -106,10 +108,10 @@ async def extract_text_pdf(doc_id: str):
             for title in titles:
                 if title[0] == 1:
                     main_titles.append(title[1])
-        return {"pub_id": doc_id, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
     else:
-        print("ID: " + doc_id)
-        print("URL: " + f"http://arxiv.org/pdf/{doc_id}")
         print("Status code: " + str(pdf_req.status_code))
         return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

     keyword: str
     limit: int
+class DocumentID(BaseModel):
+    doc_id: str
 @app.post("/search")
 async def get_articles(query: Query):
     XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
         return {"error": True, "message": str(e)}
 @app.post("/extract")
+async def extract_text_pdf(document: DocumentID):
+    pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
     if pdf_req.status_code == 200:
         pdf_data = BytesIO(pdf_req.content)
         doc = fitz.open(stream=pdf_data, filetype="pdf")
             for title in titles:
                 if title[0] == 1:
                     main_titles.append(title[1])
+        return {"pub_id": document.doc_id, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": document.doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
     else:
+        print("ID: " + document.doc_id)
+        print("URL: " + f"http://arxiv.org/pdf/{document.doc_id}")
         print("Status code: " + str(pdf_req.status_code))
         return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}