Update app.py
Browse files
app.py
CHANGED
|
@@ -8,12 +8,20 @@ import requests
|
|
| 8 |
import time
|
| 9 |
|
| 10 |
def extract_paragraphs_with_headers(pdf_path, progress=None):
|
|
|
|
| 11 |
doc = fitz.open(pdf_path)
|
| 12 |
data = []
|
| 13 |
|
| 14 |
total_pages = len(doc)
|
|
|
|
|
|
|
|
|
|
| 15 |
for page_num, page in enumerate(doc):
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")
|
| 18 |
|
| 19 |
blocks = page.get_text("dict")["blocks"]
|
|
@@ -35,12 +43,14 @@ def extract_paragraphs_with_headers(pdf_path, progress=None):
|
|
| 35 |
"is_header": is_header
|
| 36 |
})
|
| 37 |
|
|
|
|
| 38 |
return data
|
| 39 |
|
| 40 |
def upload_with_progress(file_path, repo_id, token, progress):
|
| 41 |
"""
|
| 42 |
Upload file to Hugging Face Dataset with progress tracking.
|
| 43 |
"""
|
|
|
|
| 44 |
file_size = os.path.getsize(file_path)
|
| 45 |
url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
|
| 46 |
|
|
@@ -51,6 +61,8 @@ def upload_with_progress(file_path, repo_id, token, progress):
|
|
| 51 |
with open(file_path, 'rb') as f:
|
| 52 |
chunk_size = 1024 * 1024 # 1MB
|
| 53 |
uploaded = 0
|
|
|
|
|
|
|
| 54 |
|
| 55 |
while True:
|
| 56 |
chunk = f.read(chunk_size)
|
|
@@ -64,19 +76,31 @@ def upload_with_progress(file_path, repo_id, token, progress):
|
|
| 64 |
)
|
| 65 |
|
| 66 |
uploaded += len(chunk)
|
| 67 |
-
progress
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
if response.status_code != 200:
|
| 71 |
-
raise Exception(f"Upload failed: {response.text}")
|
| 72 |
|
|
|
|
| 73 |
return f"✅ Successfully uploaded to {repo_id}"
|
| 74 |
|
| 75 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
| 76 |
all_data = []
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
|
| 81 |
for item in extracted_data:
|
| 82 |
all_data.append({
|
|
@@ -86,26 +110,32 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
|
|
| 86 |
'is_header': item['is_header']
|
| 87 |
})
|
| 88 |
|
| 89 |
-
|
|
|
|
| 90 |
df = pd.DataFrame(all_data)
|
| 91 |
-
|
| 92 |
-
# Save as Parquet
|
| 93 |
parquet_file = 'papers_with_headers.parquet'
|
| 94 |
-
df.to_parquet(parquet_file, engine='pyarrow', index=False)
|
| 95 |
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
|
|
|
|
|
|
|
| 99 |
if action_choice in ["Upload to Hugging Face", "Both"]:
|
| 100 |
try:
|
| 101 |
upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
|
| 102 |
except Exception as e:
|
|
|
|
| 103 |
upload_message = f"❌ Upload failed: {str(e)}"
|
| 104 |
|
| 105 |
-
|
| 106 |
return parquet_file, upload_message
|
| 107 |
|
| 108 |
-
# Gradio Interface
|
| 109 |
iface = gr.Interface(
|
| 110 |
fn=pdf_to_parquet_and_upload,
|
| 111 |
inputs=[
|
|
@@ -118,9 +148,8 @@ iface = gr.Interface(
|
|
| 118 |
gr.File(label="Download Parquet File"),
|
| 119 |
gr.Textbox(label="Status")
|
| 120 |
],
|
| 121 |
-
title="PDF to Parquet Converter with
|
| 122 |
-
description="Upload your PDFs
|
| 123 |
)
|
| 124 |
|
| 125 |
iface.launch()
|
| 126 |
-
|
|
|
|
| 8 |
import time
|
| 9 |
|
| 10 |
def extract_paragraphs_with_headers(pdf_path, progress=None):
|
| 11 |
+
print(f"π Starting PDF Processing: {os.path.basename(pdf_path)}")
|
| 12 |
doc = fitz.open(pdf_path)
|
| 13 |
data = []
|
| 14 |
|
| 15 |
total_pages = len(doc)
|
| 16 |
+
max_iterations = total_pages * 2 # To prevent infinite loops
|
| 17 |
+
iteration_count = 0
|
| 18 |
+
|
| 19 |
for page_num, page in enumerate(doc):
|
| 20 |
+
iteration_count += 1
|
| 21 |
+
if iteration_count > max_iterations:
|
| 22 |
+
raise Exception("⚠️ PDF processing exceeded iteration limit. Possible malformed PDF.")
|
| 23 |
+
|
| 24 |
+
if progress is not None:
|
| 25 |
progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")
|
| 26 |
|
| 27 |
blocks = page.get_text("dict")["blocks"]
|
|
|
|
| 43 |
"is_header": is_header
|
| 44 |
})
|
| 45 |
|
| 46 |
+
print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
|
| 47 |
return data
|
| 48 |
|
| 49 |
def upload_with_progress(file_path, repo_id, token, progress):
|
| 50 |
"""
|
| 51 |
Upload file to Hugging Face Dataset with progress tracking.
|
| 52 |
"""
|
| 53 |
+
print(f"📤 Starting upload of Parquet: {file_path}")
|
| 54 |
file_size = os.path.getsize(file_path)
|
| 55 |
url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
|
| 56 |
|
|
|
|
| 61 |
with open(file_path, 'rb') as f:
|
| 62 |
chunk_size = 1024 * 1024 # 1MB
|
| 63 |
uploaded = 0
|
| 64 |
+
max_chunks = file_size // chunk_size + 10 # Safety limit to avoid infinite loops
|
| 65 |
+
chunk_count = 0
|
| 66 |
|
| 67 |
while True:
|
| 68 |
chunk = f.read(chunk_size)
|
|
|
|
| 76 |
)
|
| 77 |
|
| 78 |
uploaded += len(chunk)
|
| 79 |
+
if progress is not None:
|
| 80 |
+
progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
|
| 81 |
+
time.sleep(0.1) # Smooth progress update
|
| 82 |
+
|
| 83 |
+
chunk_count += 1
|
| 84 |
+
if chunk_count > max_chunks:
|
| 85 |
+
raise Exception("⚠️ Upload exceeded expected chunk limit. Aborting.")
|
| 86 |
|
| 87 |
if response.status_code != 200:
|
| 88 |
+
raise Exception(f"❌ Upload failed: {response.text}")
|
| 89 |
|
| 90 |
+
print(f"✅ Successfully uploaded to {repo_id}")
|
| 91 |
return f"✅ Successfully uploaded to {repo_id}"
|
| 92 |
|
| 93 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
| 94 |
all_data = []
|
| 95 |
|
| 96 |
+
total_files = len(pdf_files)
|
| 97 |
+
print("π Starting PDF to Parquet Conversion Process")
|
| 98 |
+
|
| 99 |
+
for idx, pdf_file in enumerate(pdf_files):
|
| 100 |
+
if progress is not None:
|
| 101 |
+
progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
|
| 102 |
+
|
| 103 |
+
# ✅ Step 1: Process PDF
|
| 104 |
extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
|
| 105 |
for item in extracted_data:
|
| 106 |
all_data.append({
|
|
|
|
| 110 |
'is_header': item['is_header']
|
| 111 |
})
|
| 112 |
|
| 113 |
+
print("π‘ Converting Processed Data to Parquet")
|
| 114 |
+
# ✅ Step 2: Convert to Parquet
|
| 115 |
df = pd.DataFrame(all_data)
|
|
|
|
|
|
|
| 116 |
parquet_file = 'papers_with_headers.parquet'
|
|
|
|
| 117 |
|
| 118 |
+
try:
|
| 119 |
+
df.to_parquet(parquet_file, engine='pyarrow', index=False)
|
| 120 |
+
print("✅ Parquet Conversion Completed")
|
| 121 |
+
except Exception as e:
|
| 122 |
+
print(f"❌ Parquet Conversion Failed: {str(e)}")
|
| 123 |
+
return None, f"❌ Parquet Conversion Failed: {str(e)}"
|
| 124 |
|
| 125 |
+
upload_message = "Skipped Upload"
|
| 126 |
+
|
| 127 |
+
# ✅ Step 3: Upload Parquet (if selected)
|
| 128 |
if action_choice in ["Upload to Hugging Face", "Both"]:
|
| 129 |
try:
|
| 130 |
upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
|
| 131 |
except Exception as e:
|
| 132 |
+
print(f"❌ Upload Failed: {str(e)}")
|
| 133 |
upload_message = f"❌ Upload failed: {str(e)}"
|
| 134 |
|
| 135 |
+
print("π Process Completed")
|
| 136 |
return parquet_file, upload_message
|
| 137 |
|
| 138 |
+
# ✅ Gradio Interface
|
| 139 |
iface = gr.Interface(
|
| 140 |
fn=pdf_to_parquet_and_upload,
|
| 141 |
inputs=[
|
|
|
|
| 148 |
gr.File(label="Download Parquet File"),
|
| 149 |
gr.Textbox(label="Status")
|
| 150 |
],
|
| 151 |
+
title="PDF to Parquet Converter with Detailed Progress",
|
| 152 |
+
description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset with clear progress indicators."
|
| 153 |
)
|
| 154 |
|
| 155 |
iface.launch()
|
|
|