Spaces:
Sleeping
Sleeping
File size: 1,671 Bytes
2a910d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import os
import tempfile

import gradio as gr
import requests
import spaces
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
# Set up marker's logging before any models are loaded.
configure_logging()
# Load the marker models once at import time; the result is passed to
# convert_single_pdf on every extraction request.
MARKER_MODEL_LST = load_all_models()
@spaces.GPU
def extract_from_pdf(arxiv_id):
    """Download an arXiv PDF and convert it to text with marker.

    Args:
        arxiv_id: arXiv identifier, interpolated into
            ``https://arxiv.org/pdf/{arxiv_id}.pdf``.

    Returns:
        The ``full_text`` value returned by ``convert_single_pdf``, which is
        called with ``max_pages=20``.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        RuntimeError: If the server answers with a status other than 200.
    """
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # Bound the wait so a stalled connection cannot block the worker forever.
    response = requests.get(pdf_url, timeout=60)
    if response.status_code != 200:
        # Stop here: without a downloaded PDF there is nothing to convert.
        raise RuntimeError(
            f"Failed to download PDF. Status code: {response.status_code}"
        )

    # A unique file per call keeps concurrent requests from overwriting each
    # other's download.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as file:
        file.write(response.content)
        tmp_pdf = file.name
    print("PDF downloaded and saved as ", tmp_pdf)

    try:
        full_text, _doc_images, _out_meta = convert_single_pdf(
            tmp_pdf, MARKER_MODEL_LST, max_pages=20
        )
    finally:
        # Remove the download even when conversion raises.
        os.remove(tmp_pdf)
        print("Temporary PDF file removed.")
    return full_text
def extract(arxiv_id):
    """Extract the text of an arXiv paper and package it for the JSON output.

    Args:
        arxiv_id: arXiv identifier as entered by the user. Leading and
            trailing whitespace is ignored.

    Returns:
        ``{"arxiv_id": ..., "text": ...}`` on success, or ``{"error": ...}``
        when the identifier is missing/blank or extraction raises.
    """
    # Normalise first so that blank input is rejected here and stray spaces
    # never end up in the download URL.
    cleaned_id = str(arxiv_id).strip() if arxiv_id else ""
    if not cleaned_id:
        return {"error": "ArXiv ID is required"}
    try:
        full_text = extract_from_pdf(cleaned_id)
    except Exception as e:
        # UI boundary: report the failure as data instead of raising.
        return {"error": str(e)}
    return {"arxiv_id": cleaned_id, "text": full_text}
with gr.Blocks() as app:
    # Text field for the arXiv identifier.
    text_input = gr.Textbox(label="Enter arxiv id")
    # JSON component that displays the dict returned by `extract`.
    output = gr.JSON(label="Extracted text")
    # Submitting the text field runs `extract` on its value and shows the
    # result in the JSON component.
    text_input.submit(fn=extract, inputs=text_input, outputs=output)
if __name__ == "__main__":
    # Turn on the request queue, then serve on 0.0.0.0:7860 with errors
    # surfaced to the client.
    queued_app = app.queue()
    queued_app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )
|