Jobey1 committed on
Commit
06449e7
·
verified ·
1 Parent(s): ad0b1f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -17
app.py CHANGED
@@ -8,12 +8,20 @@ import requests
8
  import time
9
 
10
  def extract_paragraphs_with_headers(pdf_path, progress=None):
 
11
  doc = fitz.open(pdf_path)
12
  data = []
13
 
14
  total_pages = len(doc)
 
 
 
15
  for page_num, page in enumerate(doc):
16
- if progress:
 
 
 
 
17
  progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")
18
 
19
  blocks = page.get_text("dict")["blocks"]
@@ -35,12 +43,14 @@ def extract_paragraphs_with_headers(pdf_path, progress=None):
35
  "is_header": is_header
36
  })
37
 
 
38
  return data
39
 
40
  def upload_with_progress(file_path, repo_id, token, progress):
41
  """
42
  Upload file to Hugging Face Dataset with progress tracking.
43
  """
 
44
  file_size = os.path.getsize(file_path)
45
  url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
46
 
@@ -51,6 +61,8 @@ def upload_with_progress(file_path, repo_id, token, progress):
51
  with open(file_path, 'rb') as f:
52
  chunk_size = 1024 * 1024 # 1MB
53
  uploaded = 0
 
 
54
 
55
  while True:
56
  chunk = f.read(chunk_size)
@@ -64,19 +76,31 @@ def upload_with_progress(file_path, repo_id, token, progress):
64
  )
65
 
66
  uploaded += len(chunk)
67
- progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
68
- time.sleep(0.1) # Simulate delay for progress update
 
 
 
 
 
69
 
70
  if response.status_code != 200:
71
- raise Exception(f"Upload failed: {response.text}")
72
 
 
73
  return f"βœ… Successfully uploaded to {repo_id}"
74
 
75
  def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
76
  all_data = []
77
 
78
- # Process each uploaded PDF
79
- for pdf_file in pdf_files:
 
 
 
 
 
 
80
  extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
81
  for item in extracted_data:
82
  all_data.append({
@@ -86,26 +110,32 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
86
  'is_header': item['is_header']
87
  })
88
 
89
- # Convert to DataFrame
 
90
  df = pd.DataFrame(all_data)
91
-
92
- # Save as Parquet
93
  parquet_file = 'papers_with_headers.parquet'
94
- df.to_parquet(parquet_file, engine='pyarrow', index=False)
95
 
96
- upload_message = ""
 
 
 
 
 
97
 
98
- # Only upload if the user selects it
 
 
99
  if action_choice in ["Upload to Hugging Face", "Both"]:
100
  try:
101
  upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
102
  except Exception as e:
 
103
  upload_message = f"❌ Upload failed: {str(e)}"
104
 
105
- # Return Parquet file and status message
106
  return parquet_file, upload_message
107
 
108
- # Gradio Interface
109
  iface = gr.Interface(
110
  fn=pdf_to_parquet_and_upload,
111
  inputs=[
@@ -118,9 +148,8 @@ iface = gr.Interface(
118
  gr.File(label="Download Parquet File"),
119
  gr.Textbox(label="Status")
120
  ],
121
- title="PDF to Parquet Converter with Upload Progress",
122
- description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo with real-time progress tracking."
123
  )
124
 
125
  iface.launch()
126
-
 
8
  import time
9
 
10
  def extract_paragraphs_with_headers(pdf_path, progress=None):
11
+ print(f"πŸ“„ Starting PDF Processing: {os.path.basename(pdf_path)}")
12
  doc = fitz.open(pdf_path)
13
  data = []
14
 
15
  total_pages = len(doc)
16
+ max_iterations = total_pages * 2 # To prevent infinite loops
17
+ iteration_count = 0
18
+
19
  for page_num, page in enumerate(doc):
20
+ iteration_count += 1
21
+ if iteration_count > max_iterations:
22
+ raise Exception("⚠️ PDF processing exceeded iteration limit. Possible malformed PDF.")
23
+
24
+ if progress is not None:
25
  progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")
26
 
27
  blocks = page.get_text("dict")["blocks"]
 
43
  "is_header": is_header
44
  })
45
 
46
+ print(f"βœ… Finished Processing PDF: {os.path.basename(pdf_path)}")
47
  return data
48
 
49
  def upload_with_progress(file_path, repo_id, token, progress):
50
  """
51
  Upload file to Hugging Face Dataset with progress tracking.
52
  """
53
+ print(f"πŸ“€ Starting upload of Parquet: {file_path}")
54
  file_size = os.path.getsize(file_path)
55
  url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
56
 
 
61
  with open(file_path, 'rb') as f:
62
  chunk_size = 1024 * 1024 # 1MB
63
  uploaded = 0
64
+ max_chunks = file_size // chunk_size + 10 # Safety limit to avoid infinite loops
65
+ chunk_count = 0
66
 
67
  while True:
68
  chunk = f.read(chunk_size)
 
76
  )
77
 
78
  uploaded += len(chunk)
79
+ if progress is not None:
80
+ progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
81
+ time.sleep(0.1) # Smooth progress update
82
+
83
+ chunk_count += 1
84
+ if chunk_count > max_chunks:
85
+ raise Exception("⚠️ Upload exceeded expected chunk limit. Aborting.")
86
 
87
  if response.status_code != 200:
88
+ raise Exception(f"❌ Upload failed: {response.text}")
89
 
90
+ print(f"βœ… Successfully uploaded to {repo_id}")
91
  return f"βœ… Successfully uploaded to {repo_id}"
92
 
93
  def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
94
  all_data = []
95
 
96
+ total_files = len(pdf_files)
97
+ print("πŸš€ Starting PDF to Parquet Conversion Process")
98
+
99
+ for idx, pdf_file in enumerate(pdf_files):
100
+ if progress is not None:
101
+ progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
102
+
103
+ # βœ… Step 1: Process PDF
104
  extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
105
  for item in extracted_data:
106
  all_data.append({
 
110
  'is_header': item['is_header']
111
  })
112
 
113
+ print("🟑 Converting Processed Data to Parquet")
114
+ # βœ… Step 2: Convert to Parquet
115
  df = pd.DataFrame(all_data)
 
 
116
  parquet_file = 'papers_with_headers.parquet'
 
117
 
118
+ try:
119
+ df.to_parquet(parquet_file, engine='pyarrow', index=False)
120
+ print("βœ… Parquet Conversion Completed")
121
+ except Exception as e:
122
+ print(f"❌ Parquet Conversion Failed: {str(e)}")
123
+ return None, f"❌ Parquet Conversion Failed: {str(e)}"
124
 
125
+ upload_message = "Skipped Upload"
126
+
127
+ # βœ… Step 3: Upload Parquet (if selected)
128
  if action_choice in ["Upload to Hugging Face", "Both"]:
129
  try:
130
  upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
131
  except Exception as e:
132
+ print(f"❌ Upload Failed: {str(e)}")
133
  upload_message = f"❌ Upload failed: {str(e)}"
134
 
135
+ print("🏁 Process Completed")
136
  return parquet_file, upload_message
137
 
138
+ # βœ… Gradio Interface
139
  iface = gr.Interface(
140
  fn=pdf_to_parquet_and_upload,
141
  inputs=[
 
148
  gr.File(label="Download Parquet File"),
149
  gr.Textbox(label="Status")
150
  ],
151
+ title="PDF to Parquet Converter with Detailed Progress",
152
+ description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset with clear progress indicators."
153
  )
154
 
155
  iface.launch()