Jobey1 committed on
Commit
14cda2c
·
verified ·
1 Parent(s): 9bea774

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -3
app.py CHANGED
@@ -7,6 +7,18 @@ from huggingface_hub import HfApi
7
  from huggingface_hub.utils import HfHubHTTPError
8
  import time
9
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def extract_full_paper_with_labels(pdf_path, progress=None):
11
  print(f"πŸ“„ Starting PDF Processing: {os.path.basename(pdf_path)}")
12
  doc = fitz.open(pdf_path)
@@ -119,6 +131,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
119
  print(f"βœ… Finished Processing PDF: {os.path.basename(pdf_path)}")
120
  return {
121
  "filename": os.path.basename(pdf_path),
 
122
  "content": content
123
  }
124
 
@@ -171,7 +184,18 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
171
  print("🟑 Converting Processed Data to Parquet")
172
  # βœ… Step 2: Convert to Parquet
173
  df = pd.DataFrame(all_data)
174
- parquet_file = 'fully_labeled_papers.parquet'
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  try:
177
  df.to_parquet(parquet_file, engine='pyarrow', index=False)
@@ -248,5 +272,3 @@ with gr.Blocks() as demo:
248
  )
249
 
250
  demo.launch()
251
-
252
-
 
7
  from huggingface_hub.utils import HfHubHTTPError
8
  import time
9
 
10
def sanitize_title(title, max_length=100, fallback="untitled"):
    """
    Turn a paper title into a string that is safe to use as a file name.

    Steps, in order:
      1. Drop every character that is not a word character (letters,
         digits, underscore), whitespace, or a hyphen.
      2. Collapse each run of whitespace and/or hyphens into a single
         underscore (hyphens are therefore NOT preserved in the result).
      3. Truncate to at most ``max_length`` characters.
      4. Trim underscores left dangling at either end (e.g. when the
         truncation point lands right after a separator).

    Args:
        title: The raw title text. ``None`` is treated like an empty string.
        max_length: Maximum number of characters kept in the result.
        fallback: Value returned when nothing usable remains after
            sanitizing (e.g. the title was empty or only punctuation), so
            callers never build a nameless file such as ``.parquet``.

    Returns:
        The sanitized title, or ``fallback`` if the sanitized title is empty.
    """
    # Remove unwanted characters; `title or ''` guards against None.
    cleaned = re.sub(r'[^\w\s-]', '', title or '').strip()
    # Replace runs of spaces and hyphens with a single underscore.
    cleaned = re.sub(r'[-\s]+', '_', cleaned)
    # Truncate first, then trim, so a cut right after a separator does not
    # leave a trailing underscore in the file name.
    cleaned = cleaned[:max_length].strip('_')
    return cleaned or fallback
21
+
22
  def extract_full_paper_with_labels(pdf_path, progress=None):
23
  print(f"πŸ“„ Starting PDF Processing: {os.path.basename(pdf_path)}")
24
  doc = fitz.open(pdf_path)
 
131
  print(f"βœ… Finished Processing PDF: {os.path.basename(pdf_path)}")
132
  return {
133
  "filename": os.path.basename(pdf_path),
134
+ "title": title, # Include the title in the return data
135
  "content": content
136
  }
137
 
 
184
  print("🟑 Converting Processed Data to Parquet")
185
  # βœ… Step 2: Convert to Parquet
186
  df = pd.DataFrame(all_data)
187
+
188
+ # Generate the parquet file name
189
+ if len(all_data) == 1:
190
+ paper_title = all_data[0].get("title", "").strip()
191
+ if paper_title:
192
+ safe_title = sanitize_title(paper_title)
193
+ parquet_file = f"{safe_title}.parquet"
194
+ else:
195
+ parquet_file = 'fully_labeled_papers.parquet'
196
+ else:
197
+ # For multiple PDFs, include a timestamp to avoid overwrites
198
+ parquet_file = f"fully_labeled_papers_{time.strftime('%Y%m%d_%H%M%S')}.parquet"
199
 
200
  try:
201
  df.to_parquet(parquet_file, engine='pyarrow', index=False)
 
272
  )
273
 
274
  demo.launch()