Spaces:

rbiswasfc
/

arxiv-extract-from-pdf

Running

File size: 1,671 Bytes

2a910d7

import os
import tempfile

import gradio as gr
import requests
import spaces
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models

# Configure marker's logging first, before the models are loaded.
configure_logging()
# Load the marker models once at import time; the result is passed to
# convert_single_pdf on every extraction request instead of being reloaded.
MARKER_MODEL_LST = load_all_models()


@spaces.GPU
def extract_from_pdf(arxiv_id):
    """Download an arXiv paper's PDF and convert it to text with marker.

    Args:
        arxiv_id: arXiv identifier; it is interpolated into
            ``https://arxiv.org/pdf/{arxiv_id}.pdf``.

    Returns:
        The ``full_text`` value produced by ``convert_single_pdf`` (called
        with ``max_pages=20``).

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        RuntimeError: If the server answers with a status code other than 200.
    """
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # A timeout keeps a stalled connection from blocking the worker forever.
    response = requests.get(pdf_url, timeout=60)
    if response.status_code != 200:
        # Fail fast: without a downloaded PDF there is nothing to convert.
        raise RuntimeError(
            f"Failed to download PDF. Status code: {response.status_code}"
        )

    # A per-call temporary directory avoids collisions between concurrent
    # requests and is removed even when the conversion raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_pdf = os.path.join(tmp_dir, "paper.pdf")
        with open(tmp_pdf, "wb") as file:
            file.write(response.content)
        print("PDF downloaded and saved as ", tmp_pdf)

        # Only the text is returned; images and metadata are discarded.
        full_text, _doc_images, _out_meta = convert_single_pdf(
            tmp_pdf, MARKER_MODEL_LST, max_pages=20
        )

    print("Temporary PDF file removed.")

    return full_text


def extract(arxiv_id):
    """Return the extracted text for an arXiv ID as a JSON-friendly dict.

    Args:
        arxiv_id: arXiv identifier entered by the user.

    Returns:
        ``{"arxiv_id": ..., "text": ...}`` on success, or ``{"error": ...}``
        when the ID is empty or the extraction raises.
    """
    if arxiv_id:
        # UI boundary: report any failure as an error payload, not a crash.
        try:
            return {"arxiv_id": arxiv_id, "text": extract_from_pdf(arxiv_id)}
        except Exception as err:
            return {"error": str(err)}

    return {"error": "ArXiv ID is required"}


# Build the Gradio UI: one text box for the arXiv ID and one JSON panel for the result.
with gr.Blocks() as app:
    # Input text box where the user types the arXiv ID
    text_input = gr.Textbox(label="Enter arxiv id")

    # JSON output component that shows the dict returned by `extract`
    output = gr.JSON(label="Extracted text")

    # When the input text is submitted, call `extract` with it and display the returned dict
    text_input.submit(extract, inputs=text_input, outputs=output)


# Launch the app only when this file is run as a script (not when imported).
if __name__ == "__main__":
    # Enable the request queue, then serve on host 0.0.0.0, port 7860.
    app.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)