Jobey1 committed on
Commit
dfa54c4
·
verified ·
1 Parent(s): 06449e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -36
app.py CHANGED
@@ -4,7 +4,6 @@ import fitz # PyMuPDF
4
  import os
5
  from huggingface_hub import HfApi
6
  from huggingface_hub.utils import HfHubHTTPError
7
- import requests
8
  import time
9
 
10
  def extract_paragraphs_with_headers(pdf_path, progress=None):
@@ -48,47 +47,35 @@ def extract_paragraphs_with_headers(pdf_path, progress=None):
48
 
49
  def upload_with_progress(file_path, repo_id, token, progress):
50
  """
51
- Upload file to Hugging Face Dataset with progress tracking.
52
  """
53
  print(f"📤 Starting upload of Parquet: {file_path}")
54
  file_size = os.path.getsize(file_path)
55
- url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
56
 
57
- headers = {
58
- "Authorization": f"Bearer {token}"
59
- }
60
 
61
- with open(file_path, 'rb') as f:
62
- chunk_size = 1024 * 1024 # 1MB
63
- uploaded = 0
64
- max_chunks = file_size // chunk_size + 10 # Safety limit to avoid infinite loops
65
- chunk_count = 0
66
-
67
- while True:
68
- chunk = f.read(chunk_size)
69
- if not chunk:
70
- break
71
-
72
- response = requests.put(
73
- url,
74
- headers=headers,
75
- data=chunk
76
- )
77
-
78
- uploaded += len(chunk)
79
- if progress is not None:
80
- progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
81
- time.sleep(0.1) # Smooth progress update
82
 
83
- chunk_count += 1
84
- if chunk_count > max_chunks:
85
- raise Exception("⚠️ Upload exceeded expected chunk limit. Aborting.")
86
 
87
- if response.status_code != 200:
88
- raise Exception(f"❌ Upload failed: {response.text}")
89
 
90
- print(f"✅ Successfully uploaded to {repo_id}")
91
- return f"✅ Successfully uploaded to {repo_id}"
 
 
 
 
92
 
93
  def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
94
  all_data = []
@@ -148,8 +135,8 @@ iface = gr.Interface(
148
  gr.File(label="Download Parquet File"),
149
  gr.Textbox(label="Status")
150
  ],
151
- title="PDF to Parquet Converter with Detailed Progress",
152
- description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset with clear progress indicators."
153
  )
154
 
155
  iface.launch()
 
4
  import os
5
  from huggingface_hub import HfApi
6
  from huggingface_hub.utils import HfHubHTTPError
 
7
  import time
8
 
9
  def extract_paragraphs_with_headers(pdf_path, progress=None):
 
47
 
48
  def upload_with_progress(file_path, repo_id, token, progress):
49
  """
50
+ Upload file to Hugging Face Dataset using upload_file() API method.
51
  """
52
  print(f"📤 Starting upload of Parquet: {file_path}")
53
  file_size = os.path.getsize(file_path)
 
54
 
55
+ api = HfApi()
 
 
56
 
57
+ try:
58
+ # Use upload_file() method from huggingface_hub
59
+ api.upload_file(
60
+ path_or_fileobj=file_path,
61
+ path_in_repo=os.path.basename(file_path),
62
+ repo_id=repo_id,
63
+ repo_type="dataset",
64
+ token=token
65
+ )
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ if progress is not None:
68
+ progress(1, desc="✅ Upload Complete")
 
69
 
70
+ print(f"✅ Successfully uploaded to {repo_id}")
71
+ return f"✅ Successfully uploaded to {repo_id}"
72
 
73
+ except HfHubHTTPError as e:
74
+ print(f"❌ Upload failed: {e}")
75
+ return f"❌ Upload failed: {str(e)}"
76
+ except Exception as e:
77
+ print(f"❌ Unexpected error: {e}")
78
+ return f"❌ Unexpected error: {str(e)}"
79
 
80
  def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
81
  all_data = []
 
135
  gr.File(label="Download Parquet File"),
136
  gr.Textbox(label="Status")
137
  ],
138
+ title="PDF to Parquet Converter with Correct Upload API",
139
+ description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset using the official API."
140
  )
141
 
142
  iface.launch()