Jobey1 committed on
Commit
9bea774
·
verified ·
1 Parent(s): b891f10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -48
app.py CHANGED
@@ -8,7 +8,7 @@ from huggingface_hub.utils import HfHubHTTPError
8
  import time
9
 
10
  def extract_full_paper_with_labels(pdf_path, progress=None):
11
- print(f"πŸ“ Starting PDF Processing: {os.path.basename(pdf_path)}")
12
  doc = fitz.open(pdf_path)
13
  content = ""
14
 
@@ -28,7 +28,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
28
  # Regex patterns for detection
29
  doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
30
  year_pattern = r'\b(19|20)\d{2}\b'
31
- code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|}|;)"
32
  reference_keywords = ['reference', 'bibliography', 'sources']
33
  financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
34
 
@@ -122,65 +122,131 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
122
  "content": content
123
  }
124
 
125
- def process_pdf_file(pdf_file, api_key, repo_address):
126
- if pdf_file is None:
127
- return None, "No PDF file uploaded."
128
-
129
- # Determine file path (Gradio returns a file object or dict)
130
- file_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file['name']
131
- result = extract_full_paper_with_labels(file_path)
132
-
133
- # Convert the result dictionary to a DataFrame and write it as a parquet file.
134
- df = pd.DataFrame([result])
135
- base = os.path.splitext(result['filename'])[0]
136
- parquet_filename = f"{base}.parquet"
137
- df.to_parquet(parquet_filename, index=False)
138
-
139
- repo_status = ""
140
- if api_key and repo_address:
141
- api = HfApi()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  try:
143
- api.upload_file(
144
- path_or_fileobj=parquet_filename,
145
- path_in_repo=parquet_filename,
146
- repo_id=repo_address,
147
- token=api_key
148
- )
149
- repo_status = f"File uploaded to repo {repo_address} successfully."
150
  except Exception as e:
151
- repo_status = f"Failed to upload to repo: {str(e)}"
152
- else:
153
- repo_status = "API key or repo address not provided, skipping repo upload."
154
-
155
- return parquet_filename, repo_status
156
 
157
- # Clear only file-related inputs/outputs, preserving API key and repo address.
158
- def clear_files():
159
- return None, None, ""
 
 
 
 
160
 
161
- # Gradio interface setup
162
  with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
163
  with gr.Row():
164
- api_key_input = gr.Textbox(label="API Key", placeholder="Enter API Key")
165
- repo_address_input = gr.Textbox(label="Repo Address", placeholder="Enter Repo Address")
166
  with gr.Row():
167
- pdf_file_input = gr.File(label="Upload PDF")
168
- convert_button = gr.Button("Convert to Parquet")
169
- clear_button = gr.Button("Clear Files")
170
  with gr.Row():
171
- download_file_output = gr.File(label="Download Parquet File")
172
- repo_status_output = gr.Textbox(label="Repo Upload Status")
 
 
 
 
 
173
 
174
  convert_button.click(
175
- process_pdf_file,
176
- inputs=[pdf_file_input, api_key_input, repo_address_input],
177
- outputs=[download_file_output, repo_status_output]
178
  )
179
- clear_button.click(
180
- clear_files,
 
181
  inputs=None,
182
- outputs=[pdf_file_input, download_file_output, repo_status_output]
 
 
 
 
 
 
 
 
 
 
183
  )
184
 
185
  demo.launch()
186
 
 
 
8
  import time
9
 
10
  def extract_full_paper_with_labels(pdf_path, progress=None):
11
+ print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
12
  doc = fitz.open(pdf_path)
13
  content = ""
14
 
 
28
  # Regex patterns for detection
29
  doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
30
  year_pattern = r'\b(19|20)\d{2}\b'
31
+ code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|\}|;)"
32
  reference_keywords = ['reference', 'bibliography', 'sources']
33
  financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
34
 
 
122
  "content": content
123
  }
124
 
125
+ def upload_with_progress(file_path, repo_id, token, progress):
126
+ """
127
+ Upload file to Hugging Face Dataset using upload_file() API method.
128
+ """
129
+ print(f"📤 Starting upload of Parquet: {file_path}")
130
+ file_size = os.path.getsize(file_path)
131
+
132
+ api = HfApi()
133
+
134
+ try:
135
+ # Use upload_file() method from huggingface_hub
136
+ api.upload_file(
137
+ path_or_fileobj=file_path,
138
+ path_in_repo=os.path.basename(file_path),
139
+ repo_id=repo_id,
140
+ repo_type="dataset",
141
+ token=token
142
+ )
143
+
144
+ if progress is not None:
145
+ progress(1, desc="✅ Upload Complete")
146
+
147
+ print(f"✅ Successfully uploaded to {repo_id}")
148
+ return f"✅ Successfully uploaded to {repo_id}"
149
+
150
+ except HfHubHTTPError as e:
151
+ print(f"❌ Upload failed: {e}")
152
+ return f"❌ Upload failed: {str(e)}"
153
+ except Exception as e:
154
+ print(f"❌ Unexpected error: {e}")
155
+ return f"❌ Unexpected error: {str(e)}"
156
+
157
+ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
158
+ all_data = []
159
+
160
+ total_files = len(pdf_files)
161
+ print("🚀 Starting PDF to Parquet Conversion Process")
162
+
163
+ for idx, pdf_file in enumerate(pdf_files):
164
+ if progress is not None:
165
+ progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
166
+
167
+ # ✅ Step 1: Process PDF with Full Labels
168
+ extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
169
+ all_data.append(extracted_data)
170
+
171
+ print("🟑 Converting Processed Data to Parquet")
172
+ # ✅ Step 2: Convert to Parquet
173
+ df = pd.DataFrame(all_data)
174
+ parquet_file = 'fully_labeled_papers.parquet'
175
+
176
+ try:
177
+ df.to_parquet(parquet_file, engine='pyarrow', index=False)
178
+ print("✅ Parquet Conversion Completed")
179
+ except Exception as e:
180
+ print(f"❌ Parquet Conversion Failed: {str(e)}")
181
+ return None, f"❌ Parquet Conversion Failed: {str(e)}"
182
+
183
+ upload_message = "Skipped Upload"
184
+
185
+ # ✅ Step 3: Upload Parquet (if selected)
186
+ if action_choice in ["Upload to Hugging Face", "Both"]:
187
  try:
188
+ upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
 
 
 
 
 
 
189
  except Exception as e:
190
+ print(f"❌ Upload Failed: {str(e)}")
191
+ upload_message = f"❌ Upload failed: {str(e)}"
 
 
 
192
 
193
+ print("🏁 Process Completed")
194
+ return parquet_file, upload_message
195
+
196
+ # Define a function for our custom "Reset Files Only" button.
197
+ def reset_files_fn():
198
+ # Return None for both the file input and the output file, clearing them.
199
+ return None, None
200
 
 
201
  with gr.Blocks() as demo:
202
+ gr.Markdown(
203
+ """
204
+ # PDF to Parquet Converter with Full Labeling
205
+
206
+ **Clear All Inputs:** The button below (labeled "Clear All Inputs") will reset every field, including your API key and dataset repo ID.
207
+ **Reset Files Only:** Use this button if you want to clear the PDF file uploads and the generated Parquet file, while keeping your credentials intact.
208
+ """
209
+ )
210
+
211
+ with gr.Row():
212
+ pdf_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs (Drag & Drop or Search)")
213
  with gr.Row():
214
+ hf_token = gr.Textbox(label="Hugging Face API Token", type="password", placeholder="Enter your Hugging Face API token")
215
+ dataset_repo = gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)", placeholder="username/research-dataset")
216
  with gr.Row():
217
+ action_radio = gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"], label="Action", value="Download Locally")
218
+
 
219
  with gr.Row():
220
+ convert_button = gr.Button("Convert PDF to Parquet")
221
+ reset_files_button = gr.Button("Reset Files Only")
222
+ clear_all_button = gr.Button("Clear All Inputs")
223
+
224
+ with gr.Row():
225
+ output_file = gr.File(label="Download Parquet File")
226
+ status_text = gr.Textbox(label="Status")
227
 
228
  convert_button.click(
229
+ fn=pdf_to_parquet_and_upload,
230
+ inputs=[pdf_input, hf_token, dataset_repo, action_radio],
231
+ outputs=[output_file, status_text]
232
  )
233
+
234
+ reset_files_button.click(
235
+ fn=reset_files_fn,
236
  inputs=None,
237
+ outputs=[pdf_input, output_file]
238
+ )
239
+
240
+ # The Clear All button resets every input field.
241
+ def clear_all_fn():
242
+ return None, None, None, "Download Locally"
243
+
244
+ clear_all_button.click(
245
+ fn=clear_all_fn,
246
+ inputs=None,
247
+ outputs=[pdf_input, hf_token, dataset_repo, action_radio]
248
  )
249
 
250
  demo.launch()
251
 
252
+