Jobey1 committed on
Commit
43a7a2a
·
verified ·
1 Parent(s): 69c287e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -17
app.py CHANGED
@@ -122,11 +122,43 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
122
  "content": content
123
  }
124
 
125
- # NEW: Function to clear file-related inputs/outputs only.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def clear_files():
127
- # Return empty values for file input and output display.
128
- # Notice that we do NOT return anything for the API key or repo address.
129
- return None, "", None
130
 
131
  # Gradio interface setup
132
  with gr.Blocks() as demo:
@@ -137,24 +169,20 @@ with gr.Blocks() as demo:
137
  pdf_file_input = gr.File(label="Upload PDF")
138
  convert_button = gr.Button("Convert to Parquet")
139
  clear_button = gr.Button("Clear Files")
140
- output_display = gr.Textbox(label="Output")
141
- # (Optional) A hidden textbox for parquet data, if used later.
142
- parquet_output = gr.Textbox(label="Parquet Data", visible=False)
143
-
144
  convert_button.click(
145
- extract_full_paper_with_labels,
146
- inputs=pdf_file_input,
147
- outputs=output_display
148
  )
149
- # The clear button now only clears file-related components;
150
- # API key and Repo Address remain untouched.
151
  clear_button.click(
152
  clear_files,
153
  inputs=None,
154
- outputs=[pdf_file_input, output_display, parquet_output]
155
  )
156
 
157
  demo.launch()
158
-
159
-
160
-
 
122
  "content": content
123
  }
124
 
125
+ def process_pdf_file(pdf_file, api_key, repo_address):
126
+ if pdf_file is None:
127
+ return None, "No PDF file uploaded."
128
+ # Extract content from PDF.
129
+ # pdf_file can be a file-like object or a dict depending on how Gradio returns it.
130
+ file_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file['name']
131
+ result = extract_full_paper_with_labels(file_path)
132
+
133
+ # Convert the result dictionary into a DataFrame and write it to a parquet file.
134
+ df = pd.DataFrame([result])
135
+ base = os.path.splitext(result['filename'])[0]
136
+ parquet_filename = f"{base}.parquet"
137
+ df.to_parquet(parquet_filename, index=False)
138
+
139
+ repo_status = ""
140
+ # If API key and repo address are provided, attempt to upload the parquet file.
141
+ if api_key and repo_address:
142
+ api = HfApi()
143
+ try:
144
+ api.upload_file(
145
+ path_or_fileobj=parquet_filename,
146
+ path_in_repo=parquet_filename,
147
+ repo_id=repo_address,
148
+ token=api_key
149
+ )
150
+ repo_status = f"File uploaded to repo {repo_address} successfully."
151
+ except Exception as e:
152
+ repo_status = f"Failed to upload to repo: {str(e)}"
153
+ else:
154
+ repo_status = "API key or repo address not provided, skipping repo upload."
155
+
156
+ # Return the parquet file for local download and the status message.
157
+ return parquet_filename, repo_status
158
+
159
+ # Function to clear only file-related inputs/outputs, preserving the API key and repo address.
160
  def clear_files():
161
+ return None, None, ""
 
 
162
 
163
  # Gradio interface setup
164
  with gr.Blocks() as demo:
 
169
  pdf_file_input = gr.File(label="Upload PDF")
170
  convert_button = gr.Button("Convert to Parquet")
171
  clear_button = gr.Button("Clear Files")
172
+ with gr.Row():
173
+ download_file_output = gr.File(label="Download Parquet File")
174
+ repo_status_output = gr.Textbox(label="Repo Upload Status")
175
+
176
  convert_button.click(
177
+ process_pdf_file,
178
+ inputs=[pdf_file_input, api_key_input, repo_address_input],
179
+ outputs=[download_file_output, repo_status_output]
180
  )
181
+ # The clear button now only clears file-related components; API key and Repo Address remain unchanged.
 
182
  clear_button.click(
183
  clear_files,
184
  inputs=None,
185
+ outputs=[pdf_file_input, download_file_output, repo_status_output]
186
  )
187
 
188
  demo.launch()