evijit HF Staff committed on
Commit
9784f64
·
verified ·
1 Parent(s): 9c451ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -78
app.py CHANGED
@@ -216,30 +216,42 @@ def create_treemap(treemap_data, count_by, title=None):
216
 
217
  def download_with_progress(url, progress=None):
218
  """Download a file with progress tracking"""
219
- response = requests.get(url, stream=True)
220
- total_size = int(response.headers.get('content-length', 0))
221
- block_size = 1024 # 1 Kibibyte
222
- data = BytesIO()
223
-
224
- if total_size == 0:
225
- # If content length is unknown, we can't show accurate progress
226
- if progress:
227
- progress(0, "Starting download...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
- for chunk in response.iter_content(block_size):
230
- data.write(chunk)
231
- if progress:
232
- progress(0, f"Downloading... (unknown size)")
233
- else:
234
- downloaded = 0
235
- for chunk in response.iter_content(block_size):
236
- downloaded += len(chunk)
237
- data.write(chunk)
238
- if progress:
239
- percent = int(100 * downloaded / total_size)
240
- progress(percent / 100, f"Downloading... {percent}% ({downloaded//(1024*1024)}MB/{total_size//(1024*1024)}MB)")
241
-
242
- return data.getvalue()
243
 
244
  def download_and_process_models(progress=None):
245
  """Download and process the models data from HuggingFace dataset with progress tracking"""
@@ -250,8 +262,7 @@ def download_and_process_models(progress=None):
250
 
251
  # Check if we have cached data
252
  if os.path.exists('data/processed_models.parquet'):
253
- if progress:
254
- progress(1.0, "Loading from cache...")
255
  print("Loading models from cache...")
256
  df = pd.read_parquet('data/processed_models.parquet')
257
  return df
@@ -259,65 +270,65 @@ def download_and_process_models(progress=None):
259
  # URL to the models.parquet file
260
  url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
261
 
262
- if progress:
263
- progress(0.0, "Starting download...")
264
  print(f"Downloading models data from {url}...")
265
 
266
- # Download with progress tracking
267
- file_content = download_with_progress(url, progress)
268
-
269
- if progress:
270
- progress(0.9, "Parsing parquet file...")
271
-
272
- # Read the parquet file
273
- table = pq.read_table(BytesIO(file_content))
274
- df = table.to_pandas()
275
-
276
- print(f"Downloaded {len(df)} models")
277
-
278
- if progress:
279
- progress(0.95, "Processing data...")
280
-
281
- # Process the safetensors column if it's a string (JSON)
282
- if 'safetensors' in df.columns:
283
- def parse_safetensors(val):
284
- if isinstance(val, str):
285
- try:
286
- return json.loads(val)
287
- except:
288
- return None
289
- return val
290
-
291
- df['safetensors'] = df['safetensors'].apply(parse_safetensors)
292
-
293
- # Process the tags column if needed
294
- if 'tags' in df.columns and not isinstance(df['tags'].iloc[0], list):
295
- def parse_tags(val):
296
- if isinstance(val, str):
297
- try:
298
- return json.loads(val)
299
- except:
300
- return []
301
- return val if isinstance(val, list) else []
302
-
303
- df['tags'] = df['tags'].apply(parse_tags)
304
-
305
- # Cache the processed data
306
- if progress:
307
- progress(0.98, "Saving to cache...")
308
- df.to_parquet('data/processed_models.parquet')
309
-
310
- if progress:
311
- progress(1.0, "Data ready!")
312
-
313
- return df
 
 
314
 
315
  except Exception as e:
316
  print(f"Error downloading or processing data: {e}")
317
- if progress:
318
- progress(1.0, "Using sample data (download failed)")
319
  # Return sample data for testing if real data unavailable
320
- return create_sample_data()
321
 
322
  def create_sample_data(progress=None):
323
  """Create sample data for testing when real data is unavailable"""
 
216
 
217
  def download_with_progress(url, progress=None):
218
  """Download a file with progress tracking"""
219
+ try:
220
+ response = requests.get(url, stream=True)
221
+ total_size = int(response.headers.get('content-length', 0))
222
+ block_size = 1024 # 1 Kibibyte
223
+ data = BytesIO()
224
+
225
+ if total_size == 0:
226
+ # If content length is unknown, we can't show accurate progress
227
+ if progress is not None:
228
+ progress(0, "Starting download...")
229
+
230
+ for chunk in response.iter_content(block_size):
231
+ data.write(chunk)
232
+ if progress is not None:
233
+ progress(0, f"Downloading... (unknown size)")
234
+ else:
235
+ downloaded = 0
236
+ for chunk in response.iter_content(block_size):
237
+ downloaded += len(chunk)
238
+ data.write(chunk)
239
+ if progress is not None:
240
+ percent = int(100 * downloaded / total_size)
241
+ progress(percent / 100, f"Downloading... {percent}% ({downloaded//(1024*1024)}MB/{total_size//(1024*1024)}MB)")
242
 
243
+ return data.getvalue()
244
+ except Exception as e:
245
+ print(f"Error in download_with_progress: {e}")
246
+ raise
247
+
248
def update_progress(progress_obj, value, description):
    """Forward a progress update to *progress_obj*, never letting it raise.

    A failing progress callback (e.g. a torn-down UI widget) must not abort
    the surrounding data pipeline, so any exception is logged and swallowed.
    """
    if progress_obj is None:
        return
    try:
        progress_obj(value, description)
    except Exception as e:
        print(f"Error updating progress: {e}")
 
 
255
 
256
  def download_and_process_models(progress=None):
257
  """Download and process the models data from HuggingFace dataset with progress tracking"""
 
262
 
263
  # Check if we have cached data
264
  if os.path.exists('data/processed_models.parquet'):
265
+ update_progress(progress, 1.0, "Loading from cache...")
 
266
  print("Loading models from cache...")
267
  df = pd.read_parquet('data/processed_models.parquet')
268
  return df
 
270
  # URL to the models.parquet file
271
  url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
272
 
273
+ update_progress(progress, 0.0, "Starting download...")
 
274
  print(f"Downloading models data from {url}...")
275
 
276
+ try:
277
+ # Download with progress tracking
278
+ file_content = download_with_progress(url, progress)
279
+
280
+ update_progress(progress, 0.9, "Parsing parquet file...")
281
+
282
+ # Read the parquet file
283
+ table = pq.read_table(BytesIO(file_content))
284
+ df = table.to_pandas()
285
+
286
+ print(f"Downloaded {len(df)} models")
287
+
288
+ update_progress(progress, 0.95, "Processing data...")
289
+
290
+ # Process the safetensors column if it's a string (JSON)
291
+ if 'safetensors' in df.columns:
292
+ def parse_safetensors(val):
293
+ if isinstance(val, str):
294
+ try:
295
+ return json.loads(val)
296
+ except:
297
+ return None
298
+ return val
299
+
300
+ df['safetensors'] = df['safetensors'].apply(parse_safetensors)
301
+
302
+ # Process the tags column if needed
303
+ if 'tags' in df.columns and len(df) > 0 and not isinstance(df['tags'].iloc[0], list):
304
+ def parse_tags(val):
305
+ if isinstance(val, str):
306
+ try:
307
+ return json.loads(val)
308
+ except:
309
+ return []
310
+ return val if isinstance(val, list) else []
311
+
312
+ df['tags'] = df['tags'].apply(parse_tags)
313
+
314
+ # Cache the processed data
315
+ update_progress(progress, 0.98, "Saving to cache...")
316
+ df.to_parquet('data/processed_models.parquet')
317
+
318
+ update_progress(progress, 1.0, "Data ready!")
319
+
320
+ return df
321
+
322
+ except Exception as download_error:
323
+ print(f"Download failed: {download_error}")
324
+ update_progress(progress, 0.5, "Download failed, generating sample data...")
325
+ return create_sample_data(progress)
326
 
327
  except Exception as e:
328
  print(f"Error downloading or processing data: {e}")
329
+ update_progress(progress, 1.0, "Using sample data (error occurred)")
 
330
  # Return sample data for testing if real data unavailable
331
+ return create_sample_data(progress)
332
 
333
  def create_sample_data(progress=None):
334
  """Create sample data for testing when real data is unavailable"""