Jobey1 committed on
Commit
7998543
·
verified ·
1 Parent(s): fb7ac68

Update app.py

Browse files

Added a progress bar for uploads. Trying to fix an issue where outdated code failed to export the file to the dataset.

Files changed (1) hide show
  1. app.py +52 -32
app.py CHANGED
@@ -2,15 +2,19 @@ import gradio as gr
2
  import pandas as pd
3
  import fitz # PyMuPDF
4
  import os
5
- from huggingface_hub import HfApi
6
- from huggingface_hub.utils import HfHubHTTPError
 
7
 
8
-
9
- def extract_paragraphs_with_headers(pdf_path):
10
  doc = fitz.open(pdf_path)
11
  data = []
12
 
 
13
  for page_num, page in enumerate(doc):
 
 
 
14
  blocks = page.get_text("dict")["blocks"]
15
  for block in blocks:
16
  if "lines" in block:
@@ -18,7 +22,7 @@ def extract_paragraphs_with_headers(pdf_path):
18
  for line in block["lines"]:
19
  for span in line["spans"]:
20
  text += span["text"] + " "
21
-
22
  text = text.strip()
23
 
24
  # Detect headers based on font size
@@ -32,12 +36,47 @@ def extract_paragraphs_with_headers(pdf_path):
32
 
33
  return data
34
 
35
- def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  all_data = []
37
 
 
38
  for pdf_file in pdf_files:
39
- extracted_data = extract_paragraphs_with_headers(pdf_file.name)
40
-
41
  for item in extracted_data:
42
  all_data.append({
43
  'filename': os.path.basename(pdf_file.name),
@@ -58,31 +97,11 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
58
  # Only upload if the user selects it
59
  if action_choice in ["Upload to Hugging Face", "Both"]:
60
  try:
61
- api = HfApi()
62
- api.set_access_token(hf_token)
63
-
64
- # Validate the user's repo
65
- try:
66
- api.repo_info(repo_id=dataset_repo_id, repo_type="dataset")
67
- repo_exists = True
68
- except HfHubHTTPError:
69
- repo_exists = False
70
-
71
- if repo_exists:
72
- api.upload_file(
73
- path_or_fileobj=parquet_file,
74
- path_in_repo='papers_with_headers.parquet',
75
- repo_id=dataset_repo_id,
76
- repo_type='dataset'
77
- )
78
- upload_message = f"✅ Successfully uploaded to {dataset_repo_id}"
79
- else:
80
- upload_message = "❌ Dataset repo not found. Please check the repo ID."
81
-
82
  except Exception as e:
83
  upload_message = f"❌ Upload failed: {str(e)}"
84
 
85
- # Return the file for local download + upload status
86
  return parquet_file, upload_message
87
 
88
  # Gradio Interface
@@ -98,8 +117,9 @@ iface = gr.Interface(
98
  gr.File(label="Download Parquet File"),
99
  gr.Textbox(label="Status")
100
  ],
101
- title="PDF to Parquet Converter with User-Controlled Upload",
102
- description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo."
103
  )
104
 
105
  iface.launch()
 
 
2
  import pandas as pd
3
  import fitz # PyMuPDF
4
  import os
5
+ from huggingface_hub import HfApi, HfHubHTTPError
6
+ import requests
7
+ import time
8
 
9
+ def extract_paragraphs_with_headers(pdf_path, progress=None):
 
10
  doc = fitz.open(pdf_path)
11
  data = []
12
 
13
+ total_pages = len(doc)
14
  for page_num, page in enumerate(doc):
15
+ if progress:
16
+ progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")
17
+
18
  blocks = page.get_text("dict")["blocks"]
19
  for block in blocks:
20
  if "lines" in block:
 
22
  for line in block["lines"]:
23
  for span in line["spans"]:
24
  text += span["text"] + " "
25
+
26
  text = text.strip()
27
 
28
  # Detect headers based on font size
 
36
 
37
  return data
38
 
39
+ def upload_with_progress(file_path, repo_id, token, progress):
40
+ """
41
+ Upload file to Hugging Face Dataset with progress tracking.
42
+ """
43
+ file_size = os.path.getsize(file_path)
44
+ url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
45
+
46
+ headers = {
47
+ "Authorization": f"Bearer {token}"
48
+ }
49
+
50
+ with open(file_path, 'rb') as f:
51
+ chunk_size = 1024 * 1024 # 1MB
52
+ uploaded = 0
53
+
54
+ while True:
55
+ chunk = f.read(chunk_size)
56
+ if not chunk:
57
+ break
58
+
59
+ response = requests.put(
60
+ url,
61
+ headers=headers,
62
+ data=chunk
63
+ )
64
+
65
+ uploaded += len(chunk)
66
+ progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
67
+ time.sleep(0.1) # Simulate delay for progress update
68
+
69
+ if response.status_code != 200:
70
+ raise Exception(f"Upload failed: {response.text}")
71
+
72
+ return f"✅ Successfully uploaded to {repo_id}"
73
+
74
+ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
75
  all_data = []
76
 
77
+ # Process each uploaded PDF
78
  for pdf_file in pdf_files:
79
+ extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
 
80
  for item in extracted_data:
81
  all_data.append({
82
  'filename': os.path.basename(pdf_file.name),
 
97
  # Only upload if the user selects it
98
  if action_choice in ["Upload to Hugging Face", "Both"]:
99
  try:
100
+ upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  except Exception as e:
102
  upload_message = f"❌ Upload failed: {str(e)}"
103
 
104
+ # Return Parquet file and status message
105
  return parquet_file, upload_message
106
 
107
  # Gradio Interface
 
117
  gr.File(label="Download Parquet File"),
118
  gr.Textbox(label="Status")
119
  ],
120
+ title="PDF to Parquet Converter with Upload Progress",
121
+ description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo with real-time progress tracking."
122
  )
123
 
124
  iface.launch()
125
+