Spaces:
Sleeping
Sleeping
Add retry mechanism and local publication year approximation to data processing
Browse files
app.py
CHANGED
@@ -27,7 +27,7 @@ import colormaps
|
|
27 |
import matplotlib.colors as mcolors
|
28 |
from matplotlib.colors import Normalize
|
29 |
|
30 |
-
|
31 |
|
32 |
import opinionated # for fonts
|
33 |
plt.style.use("opinionated_rc")
|
@@ -254,15 +254,33 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
254 |
|
255 |
should_break = False
|
256 |
for page in query.paginate(per_page=200, n_max=None):
|
257 |
-
for
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
266 |
if should_break:
|
267 |
break
|
268 |
if should_break:
|
@@ -411,6 +429,8 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
411 |
export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
|
412 |
export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
|
413 |
export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
|
|
|
|
|
414 |
export_df.to_csv(csv_file_path, index=False)
|
415 |
|
416 |
if download_png_checkbox:
|
|
|
27 |
import matplotlib.colors as mcolors
|
28 |
from matplotlib.colors import Normalize
|
29 |
|
30 |
+
import random
|
31 |
|
32 |
import opinionated # for fonts
|
33 |
plt.style.use("opinionated_rc")
|
|
|
254 |
|
255 |
should_break = False
|
256 |
for page in query.paginate(per_page=200, n_max=None):
|
257 |
+
# Add retry mechanism for processing each page
|
258 |
+
max_retries = 5
|
259 |
+
base_wait_time = 1 # Starting wait time in seconds
|
260 |
+
exponent = 1.5 # Exponential factor
|
261 |
+
|
262 |
+
for retry_attempt in range(max_retries):
|
263 |
+
try:
|
264 |
+
for record in page:
|
265 |
+
records.append(record)
|
266 |
+
records_per_query += 1
|
267 |
+
progress(0.1 + (0.2 * len(records) / (total_query_length)),
|
268 |
+
desc=f"Getting data from query {i+1}/{len(urls)}...")
|
269 |
+
|
270 |
+
if reduce_sample_checkbox and sample_reduction_method == "First n samples" and records_per_query >= target_size:
|
271 |
+
should_break = True
|
272 |
+
break
|
273 |
+
# If we get here without an exception, break the retry loop
|
274 |
break
|
275 |
+
except Exception as e:
|
276 |
+
print(f"Error processing page: {e}")
|
277 |
+
if retry_attempt < max_retries - 1:
|
278 |
+
wait_time = base_wait_time * (exponent ** retry_attempt) + random.random()
|
279 |
+
print(f"Retrying in {wait_time:.2f} seconds (attempt {retry_attempt + 1}/{max_retries})...")
|
280 |
+
time.sleep(wait_time)
|
281 |
+
else:
|
282 |
+
print(f"Maximum retries reached. Continuing with next page.")
|
283 |
+
|
284 |
if should_break:
|
285 |
break
|
286 |
if should_break:
|
|
|
429 |
export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
|
430 |
export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
|
431 |
export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
|
432 |
+
if locally_approximate_publication_date_checkbox:
|
433 |
+
export_df['approximate_publication_year'] = local_years
|
434 |
export_df.to_csv(csv_file_path, index=False)
|
435 |
|
436 |
if download_png_checkbox:
|