m7n committed on
Commit
1431c60
·
1 Parent(s): 084d257

Add retry mechanism and local publication year approximation to data processing

Browse files
Files changed (1) hide show
  1. app.py +29 -9
app.py CHANGED
@@ -27,7 +27,7 @@ import colormaps
27
  import matplotlib.colors as mcolors
28
  from matplotlib.colors import Normalize
29
 
30
-
31
 
32
  import opinionated # for fonts
33
  plt.style.use("opinionated_rc")
@@ -254,15 +254,33 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
254
 
255
  should_break = False
256
  for page in query.paginate(per_page=200, n_max=None):
257
- for record in page:
258
- records.append(record)
259
- records_per_query += 1
260
- progress(0.1 + (0.2 * len(records) / (total_query_length)),
261
- desc=f"Getting data from query {i+1}/{len(urls)}...")
262
-
263
- if reduce_sample_checkbox and sample_reduction_method == "First n samples" and records_per_query >= target_size:
264
- should_break = True
 
 
 
 
 
 
 
 
 
265
  break
 
 
 
 
 
 
 
 
 
266
  if should_break:
267
  break
268
  if should_break:
@@ -411,6 +429,8 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
411
  export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
412
  export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
413
  export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
 
 
414
  export_df.to_csv(csv_file_path, index=False)
415
 
416
  if download_png_checkbox:
 
27
  import matplotlib.colors as mcolors
28
  from matplotlib.colors import Normalize
29
 
30
+ import random
31
 
32
  import opinionated # for fonts
33
  plt.style.use("opinionated_rc")
 
254
 
255
  should_break = False
256
  for page in query.paginate(per_page=200, n_max=None):
257
+ # Add retry mechanism for processing each page
258
+ max_retries = 5
259
+ base_wait_time = 1 # Starting wait time in seconds
260
+ exponent = 1.5 # Exponential factor
261
+
262
+ for retry_attempt in range(max_retries):
263
+ try:
264
+ for record in page:
265
+ records.append(record)
266
+ records_per_query += 1
267
+ progress(0.1 + (0.2 * len(records) / (total_query_length)),
268
+ desc=f"Getting data from query {i+1}/{len(urls)}...")
269
+
270
+ if reduce_sample_checkbox and sample_reduction_method == "First n samples" and records_per_query >= target_size:
271
+ should_break = True
272
+ break
273
+ # If we get here without an exception, break the retry loop
274
  break
275
+ except Exception as e:
276
+ print(f"Error processing page: {e}")
277
+ if retry_attempt < max_retries - 1:
278
+ wait_time = base_wait_time * (exponent ** retry_attempt) + random.random()
279
+ print(f"Retrying in {wait_time:.2f} seconds (attempt {retry_attempt + 1}/{max_retries})...")
280
+ time.sleep(wait_time)
281
+ else:
282
+ print(f"Maximum retries reached. Continuing with next page.")
283
+
284
  if should_break:
285
  break
286
  if should_break:
 
429
  export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
430
  export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
431
  export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
432
+ if locally_approximate_publication_date_checkbox:
433
+ export_df['approximate_publication_year'] = local_years
434
  export_df.to_csv(csv_file_path, index=False)
435
 
436
  if download_png_checkbox: