fsmoreir committed on
Commit
acd7282
·
verified ·
1 Parent(s): ea18622

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -83
app.py CHANGED
@@ -1,83 +1,42 @@
1
- import gradio as gr
2
- import subprocess
3
- import uuid
4
- import os
5
- import requests
6
- import re
7
-
8
-
9
- def get_pdf(pdf_link):
10
- # Generate a unique filename
11
- unique_filename = f"input/downloaded_paper_{uuid.uuid4().hex}.pdf"
12
-
13
- # Send a GET request to the PDF link
14
- response = requests.get(pdf_link)
15
-
16
- if response.status_code == 200:
17
- # Save the PDF content to a local file
18
- with open(unique_filename, 'wb') as pdf_file:
19
- pdf_file.write(response.content)
20
- print("PDF downloaded successfully.")
21
- else:
22
- print("Failed to download the PDF.")
23
- return unique_filename
24
-
25
-
26
- def nougat_ocr(file_name):
27
- # Command to run
28
- cli_command = [
29
- 'nougat',
30
- '--out', 'output',
31
- 'pdf', f'{file_name}',
32
- '--checkpoint', 'nougat',
33
- '--markdown'
34
- ]
35
-
36
- # Run the command and capture its output
37
- subprocess.run(cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
38
-
39
-
40
- def predict(pdf_file, pdf_link):
41
- if pdf_file is None:
42
- if pdf_link == '':
43
- print("No file is uploaded and No link is provided")
44
- return "No data provided. Upload a pdf file or provide a pdf link and try again!"
45
- else:
46
- print(f'pdf_link is - {pdf_link}')
47
- file_name = get_pdf(pdf_link)
48
- print(f'file_name is - {file_name}')
49
- else:
50
- file_name = pdf_file.name
51
- print(file_name)
52
- pdf_name = pdf_file.name.split('/')[-1].split('.')[0]
53
- print(pdf_name)
54
-
55
- # Call nougat
56
- nougat_ocr(file_name)
57
-
58
- # Open the file for reading
59
- file_name = file_name.split('/')[-1][:-4]
60
- mmd_file_path = f'output/{file_name}.mmd'
61
- with open(mmd_file_path, 'r') as file:
62
- content = file.read()
63
- # switch math delimiters
64
- content = content.replace(r'\(', '$').replace(r'\)', '$').replace(r'\[', '$$').replace(r'\]', '$$')
65
-
66
- return content, mmd_file_path
67
-
68
-
69
- def process_example(pdf_file, pdf_link):
70
- ocr_content, _ = predict(pdf_file, pdf_link)
71
- return gr.update(value=ocr_content)
72
-
73
-
74
- css = """
75
- #mkd {
76
- height: 500px;
77
- overflow: auto;
78
- border: 1px solid #ccc;
79
- }
80
- """
81
-
82
- with gr.Blocks(css=css) as demo:
83
- gr.HTML("<h1><
 
1
+ gr.HTML("<h1><center>Nougat: Neural Optical Understanding for Academic Documents<center><h1>")
2
+ gr.HTML("<h3><center>Lukas Blecher et al. <a href='https://arxiv.org/pdf/2308.13418.pdf' target='_blank'>Paper</a>, <a href='https://facebookresearch.github.io/nougat/'>Project</a><center></h3>")
3
+
4
+ with gr.Row():
5
+ mkd = gr.Markdown('<h4><center>Upload a PDF</center></h4>', scale=1)
6
+ mkd = gr.Markdown('<h4><center><i>OR</i></center></h4>', scale=1)
7
+ mkd = gr.Markdown('<h4><center>Provide a PDF link</center></h4>', scale=1)
8
+
9
+ with gr.Row(equal_height=True):
10
+ pdf_file = gr.File(label='PDF📃', file_count='single', scale=1)
11
+ pdf_link = gr.Textbox(placeholder='Enter an Arxiv link here', label='PDF link🔗🌐', scale=1)
12
+
13
+ with gr.Row():
14
+ btn = gr.Button('Run NOUGAT🍫')
15
+ clr = gr.Button('Clear🚿')
16
+
17
+ output_headline = gr.Markdown("<h3>PDF converted to markup language through Nougat-OCR👇:</h3>")
18
+ parsed_output = gr.Markdown(elem_id='mkd', value='📃🔤OCR Output')
19
+ mmd_file_download = gr.File(label='Download .mmd file', interactive=False)
20
+
21
+ def handle_predict(pdf_file, pdf_link):
22
+ content, mmd_file_path = predict(pdf_file, pdf_link)
23
+ return gr.update(value=content), mmd_file_path
24
+
25
+ btn.click(handle_predict, [pdf_file, pdf_link], [parsed_output, mmd_file_download])
26
+ clr.click(lambda: (gr.update(value=None),
27
+ gr.update(value=None),
28
+ gr.update(value=None)),
29
+ [],
30
+ [pdf_file, pdf_link, parsed_output, mmd_file_download])
31
+
32
+ gr.Examples(
33
+ [["input/nougat.pdf", ""], [None, "https://arxiv.org/pdf/2308.08316.pdf"]],
34
+ inputs=[pdf_file, pdf_link],
35
+ outputs=parsed_output,
36
+ fn=process_example,
37
+ cache_examples=True,
38
+ label='Click on any Examples below to get Nougat OCR results quickly:'
39
+ )
40
+
41
+ demo.queue()
42
+ demo.launch(debug=True)