vargha committed on
Commit
bdb3c28
·
1 Parent(s): 26217ec

resume point and pagination

Browse files
Files changed (1) hide show
  1. components/review_dashboard_page.py +180 -106
components/review_dashboard_page.py CHANGED
@@ -3,7 +3,7 @@
3
  import gradio as gr
4
  import datetime
5
  import sentry_sdk
6
- from sqlalchemy import orm, func, or_
7
 
8
  from components.header import Header
9
  from utils.logger import Logger
@@ -12,6 +12,7 @@ from config import conf
12
  from utils.database import get_db
13
  from data.models import Annotation, TTSData, Annotator, Validation, AnnotationInterval
14
  from data.repository.annotator_workload_repo import AnnotatorWorkloadRepo
 
15
 
16
  log = Logger()
17
  LOADER = CloudServerAudioLoader(conf.FTP_URL)
@@ -205,27 +206,44 @@ class ReviewDashboardPage:
205
  if not target_annotator_obj:
206
  return f"⚠️ **Error:** Annotator '{target_annotator}' not found"
207
 
208
- # Restrict to assigned intervals, if any
209
- intervals = db.query(AnnotationInterval).filter_by(annotator_id=target_annotator_obj.id).all()
210
- interval_filters = []
211
- for iv in intervals:
212
- if iv.start_index is not None and iv.end_index is not None:
213
- interval_filters.append(Annotation.tts_data_id.between(iv.start_index, iv.end_index))
214
 
215
- base_filters = [Annotation.annotator_id == target_annotator_obj.id]
216
- if interval_filters:
217
- base_filters.append(or_(*interval_filters))
218
 
219
- # Count total annotations for target annotator within intervals (if defined)
220
- total_count = db.query(Annotation).filter(*base_filters).count()
 
 
 
 
 
 
 
 
 
 
 
221
 
222
- # Count reviewed annotations (have validation from this reviewer)
223
- reviewed_count = db.query(Annotation).join(
224
- Validation, Annotation.id == Validation.annotation_id
225
- ).filter(
226
- *base_filters,
227
- Validation.validator_id == user_id
228
- ).count()
 
 
 
 
 
 
 
 
 
229
 
230
  if total_count > 0:
231
  percentage = (reviewed_count / total_count) * 100
@@ -321,87 +339,111 @@ class ReviewDashboardPage:
321
 
322
  log.info(f"Found target annotator with ID: {target_annotator_obj.id}")
323
 
324
- # Build interval filters (restrict to annotator's assigned splits)
325
- intervals = db.query(AnnotationInterval).filter_by(annotator_id=target_annotator_obj.id).all()
326
- interval_filters = []
327
- for iv in intervals:
328
- if iv.start_index is not None and iv.end_index is not None:
329
- interval_filters.append(Annotation.tts_data_id.between(iv.start_index, iv.end_index))
330
 
331
- base_filters = [Annotation.annotator_id == target_annotator_obj.id]
332
- if interval_filters:
333
- base_filters.append(or_(*interval_filters))
334
-
335
- # Reduced batch size for instant loading
336
- INITIAL_BATCH_SIZE = 5
337
 
338
- # Count total annotations within assigned range
339
- total_count = db.query(Annotation).filter(*base_filters).count()
 
340
 
341
- # Determine resume position based on last reviewed data id within range
342
  all_reviewed = False
343
- last_reviewed_tts_id = db.query(func.max(Annotation.tts_data_id)).join(
344
- Validation, Validation.annotation_id == Annotation.id
345
- ).filter(
346
- *base_filters,
347
- Validation.validator_id == user_id
348
- ).scalar()
349
-
350
- start_tts_id = None
351
- if last_reviewed_tts_id is not None:
352
- next_row = db.query(Annotation.tts_data_id).filter(
353
- *base_filters,
354
- Annotation.tts_data_id > last_reviewed_tts_id
355
- ).order_by(Annotation.tts_data_id.asc()).first()
356
- if next_row is not None:
357
- start_tts_id = next_row[0]
358
- else:
359
- all_reviewed = True
360
- else:
361
- # No reviews yet: start at the first item in range
362
- first_row = db.query(func.min(Annotation.tts_data_id)).filter(*base_filters).scalar()
363
- start_tts_id = first_row
364
-
365
- # Compute start offset
366
- start_offset = 0
367
- if start_tts_id is not None:
368
- start_offset = db.query(Annotation).filter(
369
- *base_filters,
370
- Annotation.tts_data_id < start_tts_id
 
 
 
 
 
 
 
 
 
 
371
  ).count()
372
- else:
373
- # no items in range
374
- start_offset = 0
375
-
376
- # Query initial batch ordered by tts_data_id (data id)
377
- initial_query = db.query(
378
- Annotation,
379
- TTSData.filename,
380
- TTSData.sentence
381
- ).join(
382
- TTSData, Annotation.tts_data_id == TTSData.id
383
- ).filter(
384
- *base_filters
385
- ).order_by(Annotation.tts_data_id.asc()).offset(start_offset).limit(INITIAL_BATCH_SIZE)
386
 
387
- initial_results = initial_query.all()
388
-
389
- # If everything is reviewed or resume window empty, load the last batch so user can still browse
390
- if (not initial_results and total_count > 0) or all_reviewed:
391
- fallback_offset = max(total_count - INITIAL_BATCH_SIZE, 0)
392
- initial_results = db.query(
393
  Annotation,
394
  TTSData.filename,
395
  TTSData.sentence
396
  ).join(
397
  TTSData, Annotation.tts_data_id == TTSData.id
 
 
 
 
 
 
 
398
  ).filter(
399
- *base_filters
400
- ).order_by(Annotation.tts_data_id.asc()).offset(fallback_offset).limit(INITIAL_BATCH_SIZE).all()
401
- start_offset = fallback_offset
402
- all_reviewed = True # Ensure we set this so we start at the end of the batch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
 
404
- log.info(f"Fast initial load (offset {start_offset}): {len(initial_results)} annotations out of {total_count} total for target annotator ID {target_annotator_obj.id}")
405
 
406
  # Process items with minimal data - validation status will be loaded on-demand
407
  items = []
@@ -423,9 +465,11 @@ class ReviewDashboardPage:
423
  })
424
 
425
  # Determine initial index inside the loaded batch
 
 
426
  initial_idx = 0
427
  if items and all_reviewed:
428
- initial_idx = len(items) - 1
429
 
430
  # Set initial display
431
  if items:
@@ -462,6 +506,13 @@ class ReviewDashboardPage:
462
  # Ensure correct order and number of return values for empty items (14 outputs)
463
  return [], 0, f"πŸ” **Phase 2 Review Mode** - No annotations found for review.", "", "", "", "", "", "", "", gr.update(value=None, autoplay=False), gr.update(visible=False, value=""), False, gr.update(value="❌ Reject")
464
 
 
 
 
 
 
 
 
465
  def show_current_review_item_fn(items, idx, session):
466
  if not items or idx >= len(items) or idx < 0:
467
  # tts_id, filename, sentence, ann_sentence, annotated_at, validation_status, annotator_name_placeholder, audio_update, rejection_reason_update, rejection_mode_reset, btn_reject_update
@@ -597,6 +648,13 @@ class ReviewDashboardPage:
597
 
598
  return items, items[idx]["validation_status"], rejection_input_update
599
 
 
 
 
 
 
 
 
600
  def handle_rejection_fn(items, idx, session, rejection_reason, rejection_mode_active):
601
  """Handle rejection button click - two-step process"""
602
  if not items or idx >= len(items):
@@ -675,33 +733,49 @@ class ReviewDashboardPage:
675
  if not target_annotator_obj:
676
  return items, 0
677
 
678
- # Restrict to assigned intervals
679
- intervals = db.query(AnnotationInterval).filter_by(annotator_id=target_annotator_obj.id).all()
680
- interval_filters = []
681
- for iv in intervals:
682
- if iv.start_index is not None and iv.end_index is not None:
683
- interval_filters.append(Annotation.tts_data_id.between(iv.start_index, iv.end_index))
684
- base_filters = [Annotation.annotator_id == target_annotator_obj.id]
685
- if interval_filters:
686
- base_filters.append(or_(*interval_filters))
687
 
688
- # Get total count for updated review info
689
- total_count = db.query(Annotation).filter(*base_filters).count()
 
 
 
 
 
 
 
 
 
 
 
690
 
691
- # Determine the next window based on the last loaded tts_data_id
692
- last_loaded_tts_id = items[-1]["tts_id"] if items else 0
693
 
694
- # Use tts_data_id-based pagination to continue from current position
695
  query = db.query(
696
  Annotation,
697
  TTSData.filename,
698
  TTSData.sentence
699
  ).join(
700
  TTSData, Annotation.tts_data_id == TTSData.id
 
 
 
 
 
 
 
701
  ).filter(
702
- *base_filters,
703
- Annotation.tts_data_id > last_loaded_tts_id
704
- ).order_by(Annotation.tts_data_id.asc()).limit(current_batch_size)
705
 
706
  results = query.all()
707
 
@@ -726,7 +800,7 @@ class ReviewDashboardPage:
726
 
727
  # Combine with existing items
728
  all_items = items + new_items
729
- log.info(f"Loaded {len(new_items)} more items after data id {last_loaded_tts_id}, total now: {len(all_items)}")
730
  return all_items, total_count
731
 
732
  # Output definitions
 
3
  import gradio as gr
4
  import datetime
5
  import sentry_sdk
6
+ from sqlalchemy import orm
7
 
8
  from components.header import Header
9
  from utils.logger import Logger
 
12
  from utils.database import get_db
13
  from data.models import Annotation, TTSData, Annotator, Validation, AnnotationInterval
14
  from data.repository.annotator_workload_repo import AnnotatorWorkloadRepo
15
+ from sqlalchemy import and_
16
 
17
  log = Logger()
18
  LOADER = CloudServerAudioLoader(conf.FTP_URL)
 
206
  if not target_annotator_obj:
207
  return f"⚠️ **Error:** Annotator '{target_annotator}' not found"
208
 
209
+ # Get the target annotator's assigned intervals
210
+ assigned_intervals = db.query(AnnotationInterval).filter(
211
+ AnnotationInterval.annotator_id == target_annotator_obj.id
212
+ ).all()
 
 
213
 
214
+ if not assigned_intervals:
215
+ return f"⚠️ **Error:** No assigned intervals for annotator '{target_annotator}'"
 
216
 
217
+ # Count total annotations within assigned intervals for target annotator
218
+ total_count = 0
219
+ for interval in assigned_intervals:
220
+ if interval.start_index is None or interval.end_index is None:
221
+ continue
222
+ interval_count = db.query(Annotation).join(
223
+ TTSData, Annotation.tts_data_id == TTSData.id
224
+ ).filter(
225
+ Annotation.annotator_id == target_annotator_obj.id,
226
+ TTSData.id >= interval.start_index,
227
+ TTSData.id <= interval.end_index
228
+ ).count()
229
+ total_count += interval_count
230
 
231
+ # Count reviewed annotations within assigned intervals (have validation from this reviewer)
232
+ reviewed_count = 0
233
+ for interval in assigned_intervals:
234
+ if interval.start_index is None or interval.end_index is None:
235
+ continue
236
+ interval_reviewed = db.query(Annotation).join(
237
+ TTSData, Annotation.tts_data_id == TTSData.id
238
+ ).join(
239
+ Validation, Annotation.id == Validation.annotation_id
240
+ ).filter(
241
+ Annotation.annotator_id == target_annotator_obj.id,
242
+ TTSData.id >= interval.start_index,
243
+ TTSData.id <= interval.end_index,
244
+ Validation.validator_id == user_id
245
+ ).count()
246
+ reviewed_count += interval_reviewed
247
 
248
  if total_count > 0:
249
  percentage = (reviewed_count / total_count) * 100
 
339
 
340
  log.info(f"Found target annotator with ID: {target_annotator_obj.id}")
341
 
342
+ # FAST INITIAL QUERY: Load only essential data without complex validation processing
343
+ # Reduced batch size for instant loading in HuggingFace spaces
344
+ INITIAL_BATCH_SIZE = 5 # Load only 5 items initially for instant response
 
 
 
345
 
346
+ # Get the target annotator's assigned intervals
347
+ assigned_intervals = db.query(AnnotationInterval).filter(
348
+ AnnotationInterval.annotator_id == target_annotator_obj.id
349
+ ).all()
 
 
350
 
351
+ if not assigned_intervals:
352
+ log.warning(f"No assigned intervals found for annotator {target_annotator}")
353
+ return [], 0, f"Review Target Error: No assigned intervals for annotator '{target_annotator}'.", "", "", "", "", "", "", "", gr.update(value=None, autoplay=False), gr.update(visible=False, value=""), False, gr.update(value="❌ Reject")
354
 
355
+ # Find the first UNREVIEWED annotation within assigned intervals for this reviewer
356
  all_reviewed = False
357
+ first_unreviewed_tts_id = None
358
+
359
+ # Query for the first TTS data ID within assigned intervals that has no validation by this reviewer
360
+ for interval in assigned_intervals:
361
+ if interval.start_index is None or interval.end_index is None:
362
+ continue
363
+
364
+ unreviewed_query = db.query(TTSData.id).join(
365
+ Annotation, Annotation.tts_data_id == TTSData.id
366
+ ).outerjoin(
367
+ Validation,
368
+ (Validation.annotation_id == Annotation.id) & (Validation.validator_id == user_id)
369
+ ).filter(
370
+ Annotation.annotator_id == target_annotator_obj.id,
371
+ TTSData.id >= interval.start_index,
372
+ TTSData.id <= interval.end_index,
373
+ Validation.id.is_(None) # No validation by this reviewer (fixed SQLAlchemy syntax)
374
+ ).order_by(TTSData.id.asc()).first()
375
+
376
+ if unreviewed_query:
377
+ first_unreviewed_tts_id = unreviewed_query[0]
378
+ break
379
+
380
+ if first_unreviewed_tts_id is None:
381
+ # Everything reviewed: flag and we will load the last batch from the last interval
382
+ all_reviewed = True
383
+
384
+ # Count total annotations within assigned intervals for progress info
385
+ total_count = 0
386
+ for interval in assigned_intervals:
387
+ if interval.start_index is None or interval.end_index is None:
388
+ continue
389
+ interval_count = db.query(Annotation).join(
390
+ TTSData, Annotation.tts_data_id == TTSData.id
391
+ ).filter(
392
+ Annotation.annotator_id == target_annotator_obj.id,
393
+ TTSData.id >= interval.start_index,
394
+ TTSData.id <= interval.end_index
395
  ).count()
396
+ total_count += interval_count
 
 
 
 
 
 
 
 
 
 
 
 
 
397
 
398
+ # Query to get annotations starting from the first unreviewed item
399
+ if not all_reviewed and first_unreviewed_tts_id:
400
+ # Load from first unreviewed TTS ID
401
+ initial_query = db.query(
 
 
402
  Annotation,
403
  TTSData.filename,
404
  TTSData.sentence
405
  ).join(
406
  TTSData, Annotation.tts_data_id == TTSData.id
407
+ ).join(
408
+ AnnotationInterval,
409
+ and_(
410
+ AnnotationInterval.annotator_id == target_annotator_obj.id,
411
+ TTSData.id >= AnnotationInterval.start_index,
412
+ TTSData.id <= AnnotationInterval.end_index
413
+ )
414
  ).filter(
415
+ Annotation.annotator_id == target_annotator_obj.id,
416
+ TTSData.id >= first_unreviewed_tts_id
417
+ ).order_by(TTSData.id).limit(INITIAL_BATCH_SIZE)
418
+
419
+ initial_results = initial_query.all()
420
+ else:
421
+ # Everything reviewed or no unreviewed items: load the last batch from assigned intervals
422
+ all_reviewed = True
423
+ if assigned_intervals and total_count > 0:
424
+ # Find the last interval and load the last batch from there
425
+ last_interval = max(assigned_intervals, key=lambda x: x.end_index or 0)
426
+ if last_interval.start_index is not None and last_interval.end_index is not None:
427
+ initial_query = db.query(
428
+ Annotation,
429
+ TTSData.filename,
430
+ TTSData.sentence
431
+ ).join(
432
+ TTSData, Annotation.tts_data_id == TTSData.id
433
+ ).filter(
434
+ Annotation.annotator_id == target_annotator_obj.id,
435
+ TTSData.id >= last_interval.start_index,
436
+ TTSData.id <= last_interval.end_index
437
+ ).order_by(TTSData.id.desc()).limit(INITIAL_BATCH_SIZE)
438
+
439
+ initial_results = initial_query.all()
440
+ initial_results.reverse() # Restore ascending order
441
+ else:
442
+ initial_results = []
443
+ else:
444
+ initial_results = []
445
 
446
+ log.info(f"Fast initial load: {len(initial_results)} annotations out of {total_count} total for target annotator ID {target_annotator_obj.id}")
447
 
448
  # Process items with minimal data - validation status will be loaded on-demand
449
  items = []
 
465
  })
466
 
467
  # Determine initial index inside the loaded batch
468
+ # - Normal case (has unreviewed): start at 0 (first unreviewed)
469
+ # - All reviewed: start at last item in the batch for browsing
470
  initial_idx = 0
471
  if items and all_reviewed:
472
+ initial_idx = len(items) - 1
473
 
474
  # Set initial display
475
  if items:
 
506
  # Ensure correct order and number of return values for empty items (14 outputs)
507
  return [], 0, f"πŸ” **Phase 2 Review Mode** - No annotations found for review.", "", "", "", "", "", "", "", gr.update(value=None, autoplay=False), gr.update(visible=False, value=""), False, gr.update(value="❌ Reject")
508
 
509
+ # except Exception as e:
510
+ # log.error(f"Error loading review items: {e}")
511
+ # sentry_sdk.capture_exception(e)
512
+ # gr.Error(f"Failed to load review data: {e}")
513
+ # # Ensure correct order and number of return values for error case (14 outputs)
514
+ # return [], 0, "", "", "", "", "", "", "", "", gr.update(value=None, autoplay=False), gr.update(visible=False, value=""), False, gr.update(value="❌ Reject")
515
+
516
  def show_current_review_item_fn(items, idx, session):
517
  if not items or idx >= len(items) or idx < 0:
518
  # tts_id, filename, sentence, ann_sentence, annotated_at, validation_status, annotator_name_placeholder, audio_update, rejection_reason_update, rejection_mode_reset, btn_reject_update
 
648
 
649
  return items, items[idx]["validation_status"], rejection_input_update
650
 
651
+ # except Exception as e:
652
+ # db.rollback()
653
+ # log.error(f"Error saving validation: {e}")
654
+ # sentry_sdk.capture_exception(e)
655
+ # gr.Error(f"Failed to save validation: {e}")
656
+ # return items, current_item["validation_status"], gr.update(visible=False) # Return original status and hide input on error
657
+
658
  def handle_rejection_fn(items, idx, session, rejection_reason, rejection_mode_active):
659
  """Handle rejection button click - two-step process"""
660
  if not items or idx >= len(items):
 
733
  if not target_annotator_obj:
734
  return items, 0
735
 
736
+ # Get the target annotator's assigned intervals
737
+ assigned_intervals = db.query(AnnotationInterval).filter(
738
+ AnnotationInterval.annotator_id == target_annotator_obj.id
739
+ ).all()
740
+
741
+ if not assigned_intervals:
742
+ return items, 0
 
 
743
 
744
+ # Count total annotations within assigned intervals for progress info
745
+ total_count = 0
746
+ for interval in assigned_intervals:
747
+ if interval.start_index is None or interval.end_index is None:
748
+ continue
749
+ interval_count = db.query(Annotation).join(
750
+ TTSData, Annotation.tts_data_id == TTSData.id
751
+ ).filter(
752
+ Annotation.annotator_id == target_annotator_obj.id,
753
+ TTSData.id >= interval.start_index,
754
+ TTSData.id <= interval.end_index
755
+ ).count()
756
+ total_count += interval_count
757
 
758
+ # Determine the next window based on the last loaded annotation id
759
+ last_loaded_id = items[-1]["annotation_id"] if items else 0
760
 
761
+ # FAST LOADING: Use id-based pagination within assigned intervals to continue from current position
762
  query = db.query(
763
  Annotation,
764
  TTSData.filename,
765
  TTSData.sentence
766
  ).join(
767
  TTSData, Annotation.tts_data_id == TTSData.id
768
+ ).join(
769
+ AnnotationInterval,
770
+ and_(
771
+ AnnotationInterval.annotator_id == target_annotator_obj.id,
772
+ TTSData.id >= AnnotationInterval.start_index,
773
+ TTSData.id <= AnnotationInterval.end_index
774
+ )
775
  ).filter(
776
+ Annotation.annotator_id == target_annotator_obj.id,
777
+ Annotation.id > last_loaded_id
778
+ ).order_by(Annotation.id).limit(current_batch_size)
779
 
780
  results = query.all()
781
 
 
800
 
801
  # Combine with existing items
802
  all_items = items + new_items
803
+ log.info(f"Loaded {len(new_items)} more items after id {last_loaded_id}, total now: {len(all_items)}")
804
  return all_items, total_count
805
 
806
  # Output definitions