Jobey1 committed on
Commit
c8cd30b
·
verified ·
1 Parent(s): 9d4e756

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -73
app.py CHANGED
@@ -6,8 +6,6 @@ import re
6
  from huggingface_hub import HfApi
7
  from huggingface_hub.utils import HfHubHTTPError
8
  import time
9
- import hashlib
10
- import requests
11
 
12
  def extract_full_paper_with_labels(pdf_path, progress=None):
13
  print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
@@ -100,7 +98,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
100
  elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
101
  content += f"<EQUATION>{text}</EQUATION>\n"
102
 
103
- # Code Blocks (enhanced detection)
104
  elif re.search(code_pattern, text) and len(text.split()) <= 50:
105
  content += f"<CODE>{text}</CODE>\n"
106
 
@@ -121,49 +119,28 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
121
  print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
122
  return {
123
  "filename": os.path.basename(pdf_path),
124
- "title": title if title else "Untitled_Paper",
125
  "content": content
126
  }
127
 
128
-
129
  def upload_with_progress(file_path, repo_id, token, progress):
130
  """
131
- Upload file to Hugging Face Dataset with progress tracking.
132
  """
133
-
134
  print(f"📤 Starting upload of Parquet: {file_path}")
135
  file_size = os.path.getsize(file_path)
136
- api = HfApi()
137
 
138
- # Get the proper upload URL from the Hugging Face API
139
- upload_url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
140
 
141
  try:
142
- with open(file_path, 'rb') as f:
143
- chunk_size = 1024 * 1024 # 1 MB chunks
144
- uploaded = 0
145
-
146
- headers = {
147
- "Authorization": f"Bearer {token}",
148
- "Content-Type": "application/octet-stream"
149
- }
150
 
151
- while True:
152
- chunk = f.read(chunk_size)
153
- if not chunk:
154
- break # Finished reading file
155
-
156
- response = requests.put(upload_url, headers=headers, data=chunk)
157
-
158
- if response.status_code != 200:
159
- raise Exception(f"Upload failed: {response.text}")
160
-
161
- # Update progress after each chunk
162
- uploaded += len(chunk)
163
- if progress is not None:
164
- progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
165
-
166
- # Final progress update
167
  if progress is not None:
168
  progress(1, desc="✅ Upload Complete")
169
 
@@ -177,27 +154,8 @@ def upload_with_progress(file_path, repo_id, token, progress):
177
  print(f"❌ Unexpected error: {e}")
178
  return f"❌ Unexpected error: {str(e)}"
179
 
180
-
181
-
182
  def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
183
- upload_message = ""
184
-
185
- # ✅ Helper function inside this block to avoid external edits
186
- def sanitize_filename(title, max_length=100):
187
- """
188
- Sanitize and truncate the filename to avoid OS limits.
189
- """
190
- # Remove invalid characters
191
- sanitized = re.sub(r'[\\/*?:"<>|]', "", title)
192
- sanitized = sanitized.replace(" ", "_")
193
-
194
- # Truncate to max_length if necessary
195
- if len(sanitized) > max_length:
196
- # Append an 8-character hash for uniqueness
197
- hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:8]
198
- sanitized = sanitized[:max_length] + "_" + hash_suffix
199
-
200
- return sanitized
201
 
202
  total_files = len(pdf_files)
203
  print("🚀 Starting PDF to Parquet Conversion Process")
@@ -208,28 +166,29 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
208
 
209
  # ✅ Step 1: Process PDF with Full Labels
210
  extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
 
211
 
212
- # ✅ Step 2: Use Title for Parquet Filename with Truncation & Hash
213
- sanitized_title = sanitize_filename(extracted_data["title"])
214
- parquet_file = f"{sanitized_title}.parquet"
 
215
 
216
- # Convert to DataFrame
217
- df = pd.DataFrame([extracted_data])
 
 
 
 
 
 
218
 
 
 
219
  try:
220
- df.to_parquet(parquet_file, engine='pyarrow', index=False)
221
- print(f"✅ Parquet saved as: {parquet_file}")
222
  except Exception as e:
223
- print(f"❌ Parquet Conversion Failed: {str(e)}")
224
- return None, f"❌ Parquet Conversion Failed: {str(e)}"
225
-
226
- # ✅ Step 3: Upload Parquet (if selected)
227
- if action_choice in ["Upload to Hugging Face", "Both"]:
228
- try:
229
- upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
230
- except Exception as e:
231
- print(f"❌ Upload Failed: {str(e)}")
232
- upload_message = f"❌ Upload failed: {str(e)}"
233
 
234
  print("🏁 Process Completed")
235
  return parquet_file, upload_message
@@ -247,10 +206,11 @@ iface = gr.Interface(
247
  gr.File(label="Download Parquet File"),
248
  gr.Textbox(label="Status")
249
  ],
250
- title="PDF to Parquet Converter with Title-Based Naming",
251
- description="Upload your PDFs, convert them to Parquet files named after the paper title, and upload to your Hugging Face Dataset."
252
  )
253
 
254
  iface.launch()
255
 
256
 
 
 
6
  from huggingface_hub import HfApi
7
  from huggingface_hub.utils import HfHubHTTPError
8
  import time
 
 
9
 
10
  def extract_full_paper_with_labels(pdf_path, progress=None):
11
  print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
 
98
  elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
99
  content += f"<EQUATION>{text}</EQUATION>\n"
100
 
101
+ # ✅ Improved Code Block Detection
102
  elif re.search(code_pattern, text) and len(text.split()) <= 50:
103
  content += f"<CODE>{text}</CODE>\n"
104
 
 
119
  print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
120
  return {
121
  "filename": os.path.basename(pdf_path),
 
122
  "content": content
123
  }
124
 
 
125
  def upload_with_progress(file_path, repo_id, token, progress):
126
  """
127
+ Upload file to Hugging Face Dataset using upload_file() API method.
128
  """
 
129
  print(f"📤 Starting upload of Parquet: {file_path}")
130
  file_size = os.path.getsize(file_path)
 
131
 
132
+ api = HfApi()
 
133
 
134
  try:
135
+ # Use upload_file() method from huggingface_hub
136
+ api.upload_file(
137
+ path_or_fileobj=file_path,
138
+ path_in_repo=os.path.basename(file_path),
139
+ repo_id=repo_id,
140
+ repo_type="dataset",
141
+ token=token
142
+ )
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  if progress is not None:
145
  progress(1, desc="✅ Upload Complete")
146
 
 
154
  print(f"❌ Unexpected error: {e}")
155
  return f"❌ Unexpected error: {str(e)}"
156
 
 
 
157
  def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
158
+ all_data = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
  total_files = len(pdf_files)
161
  print("🚀 Starting PDF to Parquet Conversion Process")
 
166
 
167
  # ✅ Step 1: Process PDF with Full Labels
168
  extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
169
+ all_data.append(extracted_data)
170
 
171
+ print("🟡 Converting Processed Data to Parquet")
172
+ # ✅ Step 2: Convert to Parquet
173
+ df = pd.DataFrame(all_data)
174
+ parquet_file = 'fully_labeled_papers.parquet'
175
 
176
+ try:
177
+ df.to_parquet(parquet_file, engine='pyarrow', index=False)
178
+ print("✅ Parquet Conversion Completed")
179
+ except Exception as e:
180
+ print(f"❌ Parquet Conversion Failed: {str(e)}")
181
+ return None, f"❌ Parquet Conversion Failed: {str(e)}"
182
+
183
+ upload_message = "Skipped Upload"
184
 
185
+ # ✅ Step 3: Upload Parquet (if selected)
186
+ if action_choice in ["Upload to Hugging Face", "Both"]:
187
  try:
188
+ upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
 
189
  except Exception as e:
190
+ print(f"❌ Upload Failed: {str(e)}")
191
+ upload_message = f"❌ Upload failed: {str(e)}"
 
 
 
 
 
 
 
 
192
 
193
  print("🏁 Process Completed")
194
  return parquet_file, upload_message
 
206
  gr.File(label="Download Parquet File"),
207
  gr.Textbox(label="Status")
208
  ],
209
+ title="PDF to Parquet Converter with Full Labeling",
210
+ description="Upload your PDFs, convert them to Parquet with full section labeling, and upload to your Hugging Face Dataset."
211
  )
212
 
213
  iface.launch()
214
 
215
 
216
+