fsmoreir committed on
Commit
ea18622
·
verified ·
1 Parent(s): 4805dc8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -56
app.py CHANGED
@@ -3,6 +3,8 @@ import subprocess
3
  import uuid
4
  import os
5
  import requests
 
 
6
 
7
  def get_pdf(pdf_link):
8
  # Generate a unique filename
@@ -20,6 +22,7 @@ def get_pdf(pdf_link):
20
  print("Failed to download the PDF.")
21
  return unique_filename
22
 
 
23
  def nougat_ocr(file_name):
24
  # Command to run
25
  cli_command = [
@@ -29,78 +32,52 @@ def nougat_ocr(file_name):
29
  '--checkpoint', 'nougat',
30
  '--markdown'
31
  ]
 
 
32
  subprocess.run(cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
33
 
34
- def predict(pdf_file, pdf_link):
35
- if pdf_file is None and not pdf_link:
36
- return "No data provided. Upload a pdf file or provide a pdf link and try again!", "", ""
37
 
38
- if pdf_file is not None:
39
- file_name = pdf_file.name
 
 
 
 
 
 
 
40
  else:
41
- file_name = get_pdf(pdf_link)
 
 
 
42
 
43
  # Call nougat
44
  nougat_ocr(file_name)
45
 
46
  # Open the file for reading
47
  file_name = file_name.split('/')[-1][:-4]
48
- output_path = f'output/{file_name}.mmd'
49
- with open(output_path, 'r') as file:
50
  content = file.read()
51
- # Switch math delimiters
52
  content = content.replace(r'\(', '$').replace(r'\)', '$').replace(r'\[', '$$').replace(r'\]', '$$')
53
- return content, content, output_path
 
 
 
 
 
 
 
54
 
55
  css = """
56
  #mkd {
57
- height: 500px;
58
- overflow: auto;
59
- border: 1px solid #ccc;
60
  }
61
  """
62
 
63
- # JavaScript function to copy content to clipboard automatically
64
- js_auto_copy = """
65
- function autoCopyToClipboard() {
66
- var content = document.getElementById('markdown-content').value;
67
- navigator.clipboard.writeText(content).then(function() {
68
- console.log('Content copied to clipboard');
69
- }).catch(function(error) {
70
- console.error('Error copying content to clipboard: ', error);
71
- });
72
- }
73
- """
74
-
75
  with gr.Blocks(css=css) as demo:
76
- gr.HTML("<h1><center>Nougat: Neural Optical Understanding for Academic Documents<center><h1>")
77
- gr.HTML("<h3><center>Lukas Blecher et al. <a href='https://arxiv.org/pdf/2308.13418.pdf' target='_blank'>Paper</a>, <a href='https://facebookresearch.github.io/nougat/'>Project</a><center></h3>")
78
-
79
- with gr.Row():
80
- gr.Markdown('<h4><center>Upload a PDF</center></h4>', scale=1)
81
- gr.Markdown('<h4><center><i>OR</i></center></h4>', scale=1)
82
- gr.Markdown('<h4><center>Provide a PDF link</center></h4>', scale=1)
83
-
84
- with gr.Row(equal_height=True):
85
- pdf_file = gr.File(label='PDF📃', file_count='single', scale=1)
86
- pdf_link = gr.Textbox(placeholder='Enter an Arxiv link here', label='PDF link🔗🌐', scale=1)
87
-
88
- with gr.Row():
89
- btn = gr.Button('Run NOUGAT🍫')
90
- clr = gr.Button('Clear🚿')
91
- dwn = gr.Button('Download📥')
92
-
93
- output_headline = gr.Markdown("<h3>PDF converted to markup language through Nougat-OCR👇:</h3>")
94
- parsed_output = gr.Markdown(elem_id='mkd', value='📃🔤OCR Output')
95
- markdown_hidden = gr.Textbox(elem_id='markdown-content', visible=False)
96
- download_link = gr.File(elem_id='download-link', visible=False)
97
-
98
- btn.click(fn=predict, inputs=[pdf_file, pdf_link], outputs=[parsed_output, markdown_hidden, download_link], _js=js_auto_copy)
99
- clr.click(lambda: (gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)), [], [pdf_file, pdf_link, parsed_output, markdown_hidden, download_link])
100
- dwn.click(None, [download_link], None)
101
-
102
- # Enable queueing for request handling
103
- demo.queue()
104
-
105
- # Launch the interface in debug mode
106
- demo.launch(debug=True)
 
3
  import uuid
4
  import os
5
  import requests
6
+ import re
7
+
8
 
9
  def get_pdf(pdf_link):
10
  # Generate a unique filename
 
22
  print("Failed to download the PDF.")
23
  return unique_filename
24
 
25
+
26
  def nougat_ocr(file_name):
27
  # Command to run
28
  cli_command = [
 
32
  '--checkpoint', 'nougat',
33
  '--markdown'
34
  ]
35
+
36
+ # Run the command and capture its output
37
  subprocess.run(cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
38
 
 
 
 
39
 
40
+ def predict(pdf_file, pdf_link):
41
+ if pdf_file is None:
42
+ if pdf_link == '':
43
+ print("No file is uploaded and No link is provided")
44
+ return "No data provided. Upload a pdf file or provide a pdf link and try again!"
45
+ else:
46
+ print(f'pdf_link is - {pdf_link}')
47
+ file_name = get_pdf(pdf_link)
48
+ print(f'file_name is - {file_name}')
49
  else:
50
+ file_name = pdf_file.name
51
+ print(file_name)
52
+ pdf_name = pdf_file.name.split('/')[-1].split('.')[0]
53
+ print(pdf_name)
54
 
55
  # Call nougat
56
  nougat_ocr(file_name)
57
 
58
  # Open the file for reading
59
  file_name = file_name.split('/')[-1][:-4]
60
+ mmd_file_path = f'output/{file_name}.mmd'
61
+ with open(mmd_file_path, 'r') as file:
62
  content = file.read()
63
+ # switch math delimiters
64
  content = content.replace(r'\(', '$').replace(r'\)', '$').replace(r'\[', '$$').replace(r'\]', '$$')
65
+
66
+ return content, mmd_file_path
67
+
68
+
69
+ def process_example(pdf_file, pdf_link):
70
+ ocr_content, _ = predict(pdf_file, pdf_link)
71
+ return gr.update(value=ocr_content)
72
+
73
 
74
  css = """
75
  #mkd {
76
+ height: 500px;
77
+ overflow: auto;
78
+ border: 1px solid #ccc;
79
  }
80
  """
81
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  with gr.Blocks(css=css) as demo:
83
+ gr.HTML("<h1><