"""Gradio app that downloads an arXiv PDF and extracts its text with marker."""

import os
import tempfile

import gradio as gr
import requests
import spaces
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models

configure_logging()

# Loaded once at import time; every call to extract_from_pdf reuses this list.
MARKER_MODEL_LST = load_all_models()

# Upper bound on the number of pages handed to marker per document.
MAX_PAGES = 20

# Seconds to wait on the arXiv download before giving up, so a stalled
# connection cannot block the worker indefinitely.
DOWNLOAD_TIMEOUT_S = 60


@spaces.GPU
def extract_from_pdf(arxiv_id):
    """Download the PDF for *arxiv_id* from arxiv.org and return its text.

    Args:
        arxiv_id: Identifier interpolated into ``https://arxiv.org/pdf/{id}.pdf``.

    Returns:
        The first element of the tuple returned by ``convert_single_pdf``
        (the extracted full text).

    Raises:
        RuntimeError: If the download does not return HTTP 200.
        requests.RequestException: If the request fails or times out.
    """
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(pdf_url, timeout=DOWNLOAD_TIMEOUT_S)
    if response.status_code != 200:
        message = f"Failed to download PDF. Status code: {response.status_code}"
        print(message)
        # Stop here: there is no PDF to convert, and continuing would either
        # fail obscurely or process a stale file.
        raise RuntimeError(message)

    # A unique temp file per call keeps concurrent requests from clobbering
    # each other. delete=False because the converter reopens the file by path.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    tmp_pdf = tmp.name
    try:
        with tmp:
            tmp.write(response.content)
        print("PDF downloaded and saved as ", tmp_pdf)

        full_text, _, _ = convert_single_pdf(
            tmp_pdf, MARKER_MODEL_LST, max_pages=MAX_PAGES
        )
    finally:
        # Always clean up, even when the write or the conversion raises.
        os.remove(tmp_pdf)
        print("Temporary PDF file removed.")

    return full_text


def extract(arxiv_id):
    """Gradio handler: extract text for *arxiv_id* and return a JSON-able dict.

    Returns ``{"arxiv_id": ..., "text": ...}`` on success, or
    ``{"error": ...}`` when the ID is missing/blank or extraction fails.
    """
    # Strip surrounding whitespace so a stray space does not corrupt the URL,
    # and so whitespace-only input is treated as missing.
    arxiv_id = arxiv_id.strip() if arxiv_id else ""
    if not arxiv_id:
        return {"error": "ArXiv ID is required"}

    try:
        full_text = extract_from_pdf(arxiv_id)
    except Exception as e:
        # Top-level UI boundary: report the failure in the JSON output
        # instead of letting it propagate to Gradio.
        return {"error": str(e)}

    return {"arxiv_id": arxiv_id, "text": full_text}


with gr.Blocks() as app:
    # Input text box for the arXiv identifier.
    text_input = gr.Textbox(label="Enter arxiv id")

    # JSON component that displays the extraction result or the error.
    output = gr.JSON(label="Extracted text")

    # When the input text is submitted, run the extraction and show the result.
    text_input.submit(extract, inputs=text_input, outputs=output)


if __name__ == "__main__":
    app.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)