Omar ID EL MOUMEN committed on
Commit
ca2c7e8
·
1 Parent(s): 9513d18

Fix extract

Browse files
Files changed (1) hide show
  1. app.py +8 -6
app.py CHANGED
@@ -45,7 +45,9 @@ class Query(BaseModel):
45
  keyword: str
46
  limit: int
47
 
48
- # Put all GET into POST
 
 
49
  @app.post("/search")
50
  async def get_articles(query: Query):
51
  XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
@@ -72,8 +74,8 @@ async def get_articles(query: Query):
72
  return {"error": True, "message": str(e)}
73
 
74
  @app.post("/extract")
75
- async def extract_text_pdf(doc_id: str):
76
- pdf_req = requests.get(f"http://arxiv.org/pdf/{doc_id}", verify=False)
77
  if pdf_req.status_code == 200:
78
  pdf_data = BytesIO(pdf_req.content)
79
  doc = fitz.open(stream=pdf_data, filetype="pdf")
@@ -106,10 +108,10 @@ async def extract_text_pdf(doc_id: str):
106
  for title in titles:
107
  if title[0] == 1:
108
  main_titles.append(title[1])
109
- return {"pub_id": doc_id, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
110
  else:
111
- print("ID: " + doc_id)
112
- print("URL: " + f"http://arxiv.org/pdf/{doc_id}")
113
  print("Status code: " + str(pdf_req.status_code))
114
  return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
115
 
 
45
  keyword: str
46
  limit: int
47
 
48
+ class DocumentID(BaseModel):
49
+ doc_id: str
50
+
51
  @app.post("/search")
52
  async def get_articles(query: Query):
53
  XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
 
74
  return {"error": True, "message": str(e)}
75
 
76
  @app.post("/extract")
77
+ async def extract_text_pdf(document: DocumentID):
78
+ pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
79
  if pdf_req.status_code == 200:
80
  pdf_data = BytesIO(pdf_req.content)
81
  doc = fitz.open(stream=pdf_data, filetype="pdf")
 
108
  for title in titles:
109
  if title[0] == 1:
110
  main_titles.append(title[1])
111
+ return {"pub_id": document.doc_id, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": document.doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
112
  else:
113
+ print("ID: " + document.doc_id)
114
+ print("URL: " + f"http://arxiv.org/pdf/{document.doc_id}")
115
  print("Status code: " + str(pdf_req.status_code))
116
  return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
117