Update app.py
Browse files
app.py
CHANGED
@@ -216,30 +216,42 @@ def create_treemap(treemap_data, count_by, title=None):
|
|
216 |
|
217 |
def download_with_progress(url, progress=None):
|
218 |
"""Download a file with progress tracking"""
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
progress
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
return data.getvalue()
|
243 |
|
244 |
def download_and_process_models(progress=None):
|
245 |
"""Download and process the models data from HuggingFace dataset with progress tracking"""
|
@@ -250,8 +262,7 @@ def download_and_process_models(progress=None):
|
|
250 |
|
251 |
# Check if we have cached data
|
252 |
if os.path.exists('data/processed_models.parquet'):
|
253 |
-
|
254 |
-
progress(1.0, "Loading from cache...")
|
255 |
print("Loading models from cache...")
|
256 |
df = pd.read_parquet('data/processed_models.parquet')
|
257 |
return df
|
@@ -259,65 +270,65 @@ def download_and_process_models(progress=None):
|
|
259 |
# URL to the models.parquet file
|
260 |
url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
|
261 |
|
262 |
-
|
263 |
-
progress(0.0, "Starting download...")
|
264 |
print(f"Downloading models data from {url}...")
|
265 |
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
progress
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
|
|
|
|
314 |
|
315 |
except Exception as e:
|
316 |
print(f"Error downloading or processing data: {e}")
|
317 |
-
|
318 |
-
progress(1.0, "Using sample data (download failed)")
|
319 |
# Return sample data for testing if real data unavailable
|
320 |
-
return create_sample_data()
|
321 |
|
322 |
def create_sample_data(progress=None):
|
323 |
"""Create sample data for testing when real data is unavailable"""
|
|
|
def download_with_progress(url, progress=None):
    """Download a file into memory with optional progress tracking.

    Args:
        url: URL of the file to download.
        progress: Optional callable accepting ``(fraction, description)``,
            invoked periodically while the download runs. ``fraction`` is in
            ``[0, 1]`` when the content length is known, else ``0``.

    Returns:
        The downloaded content as ``bytes``.

    Raises:
        Exception: any error from the HTTP request (including non-2xx
            status codes) is logged and re-raised for the caller to handle.
    """
    try:
        response = requests.get(url, stream=True)
        # Fail fast on HTTP errors (404/500/...). Without this, an error
        # page would be downloaded and returned as if it were the file.
        response.raise_for_status()

        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024  # 1 Kibibyte
        data = BytesIO()

        if total_size == 0:
            # Content length unknown: we cannot report an accurate fraction,
            # so report 0 throughout with a descriptive message.
            if progress is not None:
                progress(0, "Starting download...")
            for chunk in response.iter_content(block_size):
                data.write(chunk)
                if progress is not None:
                    progress(0, "Downloading... (unknown size)")
        else:
            downloaded = 0
            for chunk in response.iter_content(block_size):
                downloaded += len(chunk)
                data.write(chunk)
                if progress is not None:
                    percent = int(100 * downloaded / total_size)
                    progress(percent / 100, f"Downloading... {percent}% ({downloaded//(1024*1024)}MB/{total_size//(1024*1024)}MB)")

        return data.getvalue()
    except Exception as e:
        # Log at this boundary, then let the caller decide how to recover.
        print(f"Error in download_with_progress: {e}")
        raise
def update_progress(progress_obj, value, description):
    """Safely update progress with error handling"""
    # No-op when no progress callback was supplied.
    if progress_obj is None:
        return
    try:
        progress_obj(value, description)
    except Exception as e:
        # A broken progress UI must never take down the data pipeline;
        # log the failure and carry on.
        print(f"Error updating progress: {e}")
|
|
|
|
def download_and_process_models(progress=None):
    """Download and process the models data from HuggingFace dataset with progress tracking.

    Loads from the local parquet cache under ``data/`` when present;
    otherwise downloads the dataset, decodes the JSON-encoded
    ``safetensors``/``tags`` columns, writes the cache, and returns the
    result. Any failure falls back to ``create_sample_data``.

    Args:
        progress: Optional callable ``(fraction, description)`` forwarded
            through ``update_progress``.

    Returns:
        A pandas DataFrame of model metadata.
    """
    try:
        # Ensure the cache directory exists up front — to_parquet below
        # would otherwise fail (and silently fall back to sample data)
        # on a fresh checkout with no data/ directory.
        os.makedirs('data', exist_ok=True)

        # Check if we have cached data
        if os.path.exists('data/processed_models.parquet'):
            update_progress(progress, 1.0, "Loading from cache...")
            print("Loading models from cache...")
            df = pd.read_parquet('data/processed_models.parquet')
            return df

        # URL to the models.parquet file
        url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"

        update_progress(progress, 0.0, "Starting download...")
        print(f"Downloading models data from {url}...")

        try:
            # Download with progress tracking
            file_content = download_with_progress(url, progress)

            update_progress(progress, 0.9, "Parsing parquet file...")

            # Read the parquet file
            table = pq.read_table(BytesIO(file_content))
            df = table.to_pandas()

            print(f"Downloaded {len(df)} models")

            update_progress(progress, 0.95, "Processing data...")

            # Process the safetensors column if it's a string (JSON)
            if 'safetensors' in df.columns:
                def parse_safetensors(val):
                    # Decode JSON strings; leave dicts/None untouched.
                    if isinstance(val, str):
                        try:
                            return json.loads(val)
                        except (ValueError, TypeError):
                            # Malformed JSON: treat as missing rather than
                            # aborting the whole load. (Was a bare except,
                            # which also swallowed KeyboardInterrupt.)
                            return None
                    return val

                df['safetensors'] = df['safetensors'].apply(parse_safetensors)

            # Process the tags column if needed
            if 'tags' in df.columns and len(df) > 0 and not isinstance(df['tags'].iloc[0], list):
                def parse_tags(val):
                    # Normalize tags to a list; malformed values become [].
                    if isinstance(val, str):
                        try:
                            return json.loads(val)
                        except (ValueError, TypeError):
                            return []
                    return val if isinstance(val, list) else []

                df['tags'] = df['tags'].apply(parse_tags)

            # Cache the processed data
            update_progress(progress, 0.98, "Saving to cache...")
            df.to_parquet('data/processed_models.parquet')

            update_progress(progress, 1.0, "Data ready!")

            return df

        except Exception as download_error:
            print(f"Download failed: {download_error}")
            update_progress(progress, 0.5, "Download failed, generating sample data...")
            return create_sample_data(progress)

    except Exception as e:
        print(f"Error downloading or processing data: {e}")
        update_progress(progress, 1.0, "Using sample data (error occurred)")
        # Return sample data for testing if real data unavailable
        return create_sample_data(progress)
|
333 |
def create_sample_data(progress=None):
|
334 |
"""Create sample data for testing when real data is unavailable"""
|