Jobey1 committed on
Commit
9323459
·
verified ·
1 Parent(s): 9433534

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -55
app.py CHANGED
@@ -7,6 +7,11 @@ from huggingface_hub import HfApi
7
  from huggingface_hub.utils import HfHubHTTPError
8
  import time
9
 
 
 
 
 
 
10
  def extract_full_paper_with_labels(pdf_path, progress=None):
11
  print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
12
  doc = fitz.open(pdf_path)
@@ -98,7 +103,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
98
  elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
99
  content += f"<EQUATION>{text}</EQUATION>\n"
100
 
101
- # ✅ Improved Code Block Detection
102
  elif re.search(code_pattern, text) and len(text.split()) <= 50:
103
  content += f"<CODE>{text}</CODE>\n"
104
 
@@ -119,44 +124,12 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
119
  print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
120
  return {
121
  "filename": os.path.basename(pdf_path),
 
122
  "content": content
123
  }
124
 
125
-
126
- def upload_with_progress(file_path, repo_id, token, progress):
127
- """
128
- Upload file to Hugging Face Dataset using upload_file() API method.
129
- """
130
- print(f"πŸ“€ Starting upload of Parquet: {file_path}")
131
- file_size = os.path.getsize(file_path)
132
-
133
- api = HfApi()
134
-
135
- try:
136
- # Use upload_file() method from huggingface_hub
137
- api.upload_file(
138
- path_or_fileobj=file_path,
139
- path_in_repo=os.path.basename(file_path),
140
- repo_id=repo_id,
141
- repo_type="dataset",
142
- token=token
143
- )
144
-
145
- if progress is not None:
146
- progress(1, desc="✅ Upload Complete")
147
-
148
- print(f"✅ Successfully uploaded to {repo_id}")
149
- return f"✅ Successfully uploaded to {repo_id}"
150
-
151
- except HfHubHTTPError as e:
152
- print(f"❌ Upload failed: {e}")
153
- return f"❌ Upload failed: {str(e)}"
154
- except Exception as e:
155
- print(f"❌ Unexpected error: {e}")
156
- return f"❌ Unexpected error: {str(e)}"
157
-
158
  def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
159
- all_data = []
160
 
161
  total_files = len(pdf_files)
162
  print("🚀 Starting PDF to Parquet Conversion Process")
@@ -167,29 +140,28 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
167
 
168
  # ✅ Step 1: Process PDF with Full Labels
169
  extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
170
- all_data.append(extracted_data)
171
 
172
- print("🟑 Converting Processed Data to Parquet")
173
- # ✅ Step 2: Convert to Parquet
174
- df = pd.DataFrame(all_data)
175
- parquet_file = 'fully_labeled_papers.parquet'
176
 
177
- try:
178
- df.to_parquet(parquet_file, engine='pyarrow', index=False)
179
- print("✅ Parquet Conversion Completed")
180
- except Exception as e:
181
- print(f"❌ Parquet Conversion Failed: {str(e)}")
182
- return None, f"❌ Parquet Conversion Failed: {str(e)}"
183
 
184
- upload_message = "Skipped Upload"
185
-
186
- # ✅ Step 3: Upload Parquet (if selected)
187
- if action_choice in ["Upload to Hugging Face", "Both"]:
188
  try:
189
- upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
 
190
  except Exception as e:
191
- print(f"❌ Upload Failed: {str(e)}")
192
- upload_message = f"❌ Upload failed: {str(e)}"
 
 
 
 
 
 
 
 
193
 
194
  print("🏁 Process Completed")
195
  return parquet_file, upload_message
@@ -207,9 +179,10 @@ iface = gr.Interface(
207
  gr.File(label="Download Parquet File"),
208
  gr.Textbox(label="Status")
209
  ],
210
- title="PDF to Parquet Converter with Full Labeling",
211
- description="Upload your PDFs, convert them to Parquet with full section labeling, and upload to your Hugging Face Dataset."
212
  )
213
 
214
  iface.launch()
215
 
 
 
7
  from huggingface_hub.utils import HfHubHTTPError
8
  import time
9
 
10
+ def sanitize_filename(title):
11
+ # Remove invalid characters and replace spaces with underscores
12
+ sanitized = re.sub(r'[\\/*?:"<>|]', "", title)
13
+ return sanitized.replace(" ", "_")
14
+
15
  def extract_full_paper_with_labels(pdf_path, progress=None):
16
  print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
17
  doc = fitz.open(pdf_path)
 
103
  elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
104
  content += f"<EQUATION>{text}</EQUATION>\n"
105
 
106
+ # Code Blocks (enhanced detection)
107
  elif re.search(code_pattern, text) and len(text.split()) <= 50:
108
  content += f"<CODE>{text}</CODE>\n"
109
 
 
124
  print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
125
  return {
126
  "filename": os.path.basename(pdf_path),
127
+ "title": title if title else "Untitled_Paper",
128
  "content": content
129
  }
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
132
+ upload_message = ""
133
 
134
  total_files = len(pdf_files)
135
  print("🚀 Starting PDF to Parquet Conversion Process")
 
140
 
141
  # ✅ Step 1: Process PDF with Full Labels
142
  extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
 
143
 
144
+ # ✅ Step 2: Use Title for Parquet Filename
145
+ sanitized_title = sanitize_filename(extracted_data["title"])
146
+ parquet_file = f"{sanitized_title}.parquet"
 
147
 
148
+ # Convert to DataFrame
149
+ df = pd.DataFrame([extracted_data])
 
 
 
 
150
 
 
 
 
 
151
  try:
152
+ df.to_parquet(parquet_file, engine='pyarrow', index=False)
153
+ print(f"✅ Parquet saved as: {parquet_file}")
154
  except Exception as e:
155
+ print(f"❌ Parquet Conversion Failed: {str(e)}")
156
+ return None, f"❌ Parquet Conversion Failed: {str(e)}"
157
+
158
+ # ✅ Step 3: Upload Parquet (if selected)
159
+ if action_choice in ["Upload to Hugging Face", "Both"]:
160
+ try:
161
+ upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
162
+ except Exception as e:
163
+ print(f"❌ Upload Failed: {str(e)}")
164
+ upload_message = f"❌ Upload failed: {str(e)}"
165
 
166
  print("🏁 Process Completed")
167
  return parquet_file, upload_message
 
179
  gr.File(label="Download Parquet File"),
180
  gr.Textbox(label="Status")
181
  ],
182
+ title="PDF to Parquet Converter with Title-Based Naming",
183
+ description="Upload your PDFs, convert them to Parquet files named after the paper title, and upload to your Hugging Face Dataset."
184
  )
185
 
186
  iface.launch()
187
 
188
+