Spaces:

navidved
/

tts_labeling

Running

App Files Files Community

vargha committed on Aug 10

Commit

bdb3c28

1 Parent(s): 26217ec

resume point and pagination

Browse files

Files changed (1) hide show

components/review_dashboard_page.py +180 -106

components/review_dashboard_page.py CHANGED Viewed

@@ -3,7 +3,7 @@
 import gradio as gr
 import datetime
 import sentry_sdk
-from sqlalchemy import orm, func, or_
 from components.header import Header
 from utils.logger import Logger
@@ -12,6 +12,7 @@ from config import conf
 from utils.database import get_db
 from data.models import Annotation, TTSData, Annotator, Validation, AnnotationInterval
 from data.repository.annotator_workload_repo import AnnotatorWorkloadRepo
 log = Logger()
 LOADER = CloudServerAudioLoader(conf.FTP_URL)
@@ -205,27 +206,44 @@ class ReviewDashboardPage:
                     if not target_annotator_obj:
                         return f"⚠️ **Error:** Annotator '{target_annotator}' not found"
-                    # Restrict to assigned intervals, if any
-                    intervals = db.query(AnnotationInterval).filter_by(annotator_id=target_annotator_obj.id).all()
-                    interval_filters = []
-                    for iv in intervals:
-                        if iv.start_index is not None and iv.end_index is not None:
-                            interval_filters.append(Annotation.tts_data_id.between(iv.start_index, iv.end_index))
-                    base_filters = [Annotation.annotator_id == target_annotator_obj.id]
-                    if interval_filters:
-                        base_filters.append(or_(*interval_filters))
-                    # Count total annotations for target annotator within intervals (if defined)
-                    total_count = db.query(Annotation).filter(*base_filters).count()
-                    # Count reviewed annotations (have validation from this reviewer)
-                    reviewed_count = db.query(Annotation).join(
-                        Validation, Annotation.id == Validation.annotation_id
-                    ).filter(
-                        *base_filters,
-                        Validation.validator_id == user_id
-                    ).count()
                     if total_count > 0:
                         percentage = (reviewed_count / total_count) * 100
@@ -321,87 +339,111 @@ class ReviewDashboardPage:
                 log.info(f"Found target annotator with ID: {target_annotator_obj.id}")
-                # Build interval filters (restrict to annotator's assigned splits)
-                intervals = db.query(AnnotationInterval).filter_by(annotator_id=target_annotator_obj.id).all()
-                interval_filters = []
-                for iv in intervals:
-                    if iv.start_index is not None and iv.end_index is not None:
-                        interval_filters.append(Annotation.tts_data_id.between(iv.start_index, iv.end_index))
-                base_filters = [Annotation.annotator_id == target_annotator_obj.id]
-                if interval_filters:
-                    base_filters.append(or_(*interval_filters))
-                # Reduced batch size for instant loading
-                INITIAL_BATCH_SIZE = 5
-                # Count total annotations within assigned range
-                total_count = db.query(Annotation).filter(*base_filters).count()
-                # Determine resume position based on last reviewed data id within range
                 all_reviewed = False
-                last_reviewed_tts_id = db.query(func.max(Annotation.tts_data_id)).join(
-                    Validation, Validation.annotation_id == Annotation.id
-                ).filter(
-                    *base_filters,
-                    Validation.validator_id == user_id
-                ).scalar()
-                start_tts_id = None
-                if last_reviewed_tts_id is not None:
-                    next_row = db.query(Annotation.tts_data_id).filter(
-                        *base_filters,
-                        Annotation.tts_data_id > last_reviewed_tts_id
-                    ).order_by(Annotation.tts_data_id.asc()).first()
-                    if next_row is not None:
-                        start_tts_id = next_row[0]
-                    else:
-                        all_reviewed = True
-                else:
-                    # No reviews yet: start at the first item in range
-                    first_row = db.query(func.min(Annotation.tts_data_id)).filter(*base_filters).scalar()
-                    start_tts_id = first_row
-                # Compute start offset
-                start_offset = 0
-                if start_tts_id is not None:
-                    start_offset = db.query(Annotation).filter(
-                        *base_filters,
-                        Annotation.tts_data_id < start_tts_id
                     ).count()
-                else:
-                    # no items in range
-                    start_offset = 0
-                # Query initial batch ordered by tts_data_id (data id)
-                initial_query = db.query(
-                    Annotation,
-                    TTSData.filename,
-                    TTSData.sentence
-                ).join(
-                    TTSData, Annotation.tts_data_id == TTSData.id
-                ).filter(
-                    *base_filters
-                ).order_by(Annotation.tts_data_id.asc()).offset(start_offset).limit(INITIAL_BATCH_SIZE)
-                initial_results = initial_query.all()
-                # If everything is reviewed or resume window empty, load the last batch so user can still browse
-                if (not initial_results and total_count > 0) or all_reviewed:
-                    fallback_offset = max(total_count - INITIAL_BATCH_SIZE, 0)
-                    initial_results = db.query(
                         Annotation,
                         TTSData.filename,
                         TTSData.sentence
                     ).join(
                         TTSData, Annotation.tts_data_id == TTSData.id
                     ).filter(
-                        *base_filters
-                    ).order_by(Annotation.tts_data_id.asc()).offset(fallback_offset).limit(INITIAL_BATCH_SIZE).all()
-                    start_offset = fallback_offset
-                    all_reviewed = True  # Ensure we set this so we start at the end of the batch
-                log.info(f"Fast initial load (offset {start_offset}): {len(initial_results)} annotations out of {total_count} total for target annotator ID {target_annotator_obj.id}")
                 # Process items with minimal data - validation status will be loaded on-demand
                 items = []
@@ -423,9 +465,11 @@ class ReviewDashboardPage:
                     })
                 # Determine initial index inside the loaded batch
                 initial_idx = 0
                 if items and all_reviewed:
-                    initial_idx = len(items) - 1
                 # Set initial display
                 if items:
@@ -462,6 +506,13 @@ class ReviewDashboardPage:
                     # Ensure correct order and number of return values for empty items (14 outputs)
                     return [], 0, f"🔍 **Phase 2 Review Mode** - No annotations found for review.", "", "", "", "", "", "", "", gr.update(value=None, autoplay=False), gr.update(visible=False, value=""), False, gr.update(value="❌ Reject")
         def show_current_review_item_fn(items, idx, session):
             if not items or idx >= len(items) or idx < 0:
                 # tts_id, filename, sentence, ann_sentence, annotated_at, validation_status, annotator_name_placeholder, audio_update, rejection_reason_update, rejection_mode_reset, btn_reject_update
@@ -597,6 +648,13 @@ class ReviewDashboardPage:
                 return items, items[idx]["validation_status"], rejection_input_update
         def handle_rejection_fn(items, idx, session, rejection_reason, rejection_mode_active):
             """Handle rejection button click - two-step process"""
             if not items or idx >= len(items):
@@ -675,33 +733,49 @@ class ReviewDashboardPage:
                 if not target_annotator_obj:
                     return items, 0
-                # Restrict to assigned intervals
-                intervals = db.query(AnnotationInterval).filter_by(annotator_id=target_annotator_obj.id).all()
-                interval_filters = []
-                for iv in intervals:
-                    if iv.start_index is not None and iv.end_index is not None:
-                        interval_filters.append(Annotation.tts_data_id.between(iv.start_index, iv.end_index))
-                base_filters = [Annotation.annotator_id == target_annotator_obj.id]
-                if interval_filters:
-                    base_filters.append(or_(*interval_filters))
-                # Get total count for updated review info
-                total_count = db.query(Annotation).filter(*base_filters).count()
-                # Determine the next window based on the last loaded tts_data_id
-                last_loaded_tts_id = items[-1]["tts_id"] if items else 0
-                # Use tts_data_id-based pagination to continue from current position
                 query = db.query(
                     Annotation,
                     TTSData.filename,
                     TTSData.sentence
                 ).join(
                     TTSData, Annotation.tts_data_id == TTSData.id
                 ).filter(
-                    *base_filters,
-                    Annotation.tts_data_id > last_loaded_tts_id
-                ).order_by(Annotation.tts_data_id.asc()).limit(current_batch_size)
                 results = query.all()
@@ -726,7 +800,7 @@ class ReviewDashboardPage:
                 # Combine with existing items
                 all_items = items + new_items
-                log.info(f"Loaded {len(new_items)} more items after data id {last_loaded_tts_id}, total now: {len(all_items)}")
                 return all_items, total_count
         # Output definitions

 import gradio as gr
 import datetime
 import sentry_sdk
+from sqlalchemy import orm
 from components.header import Header
 from utils.logger import Logger
 from utils.database import get_db
 from data.models import Annotation, TTSData, Annotator, Validation, AnnotationInterval
 from data.repository.annotator_workload_repo import AnnotatorWorkloadRepo
+from sqlalchemy import and_
 log = Logger()
 LOADER = CloudServerAudioLoader(conf.FTP_URL)
                     if not target_annotator_obj:
                         return f"⚠️ **Error:** Annotator '{target_annotator}' not found"
+                    # Get the target annotator's assigned intervals
+                    assigned_intervals = db.query(AnnotationInterval).filter(
+                        AnnotationInterval.annotator_id == target_annotator_obj.id
+                    ).all()
+                    if not assigned_intervals:
+                        return f"⚠️ **Error:** No assigned intervals for annotator '{target_annotator}'"
+                    # Count total annotations within assigned intervals for target annotator
+                    total_count = 0
+                    for interval in assigned_intervals:
+                        if interval.start_index is None or interval.end_index is None:
+                            continue
+                        interval_count = db.query(Annotation).join(
+                            TTSData, Annotation.tts_data_id == TTSData.id
+                        ).filter(
+                            Annotation.annotator_id == target_annotator_obj.id,
+                            TTSData.id >= interval.start_index,
+                            TTSData.id <= interval.end_index
+                        ).count()
+                        total_count += interval_count
+                    # Count reviewed annotations within assigned intervals (have validation from this reviewer)
+                    reviewed_count = 0
+                    for interval in assigned_intervals:
+                        if interval.start_index is None or interval.end_index is None:
+                            continue
+                        interval_reviewed = db.query(Annotation).join(
+                            TTSData, Annotation.tts_data_id == TTSData.id
+                        ).join(
+                            Validation, Annotation.id == Validation.annotation_id
+                        ).filter(
+                            Annotation.annotator_id == target_annotator_obj.id,
+                            TTSData.id >= interval.start_index,
+                            TTSData.id <= interval.end_index,
+                            Validation.validator_id == user_id
+                        ).count()
+                        reviewed_count += interval_reviewed
                     if total_count > 0:
                         percentage = (reviewed_count / total_count) * 100
                 log.info(f"Found target annotator with ID: {target_annotator_obj.id}")
+                # FAST INITIAL QUERY: Load only essential data without complex validation processing
+                # Reduced batch size for instant loading in HuggingFace spaces
+                INITIAL_BATCH_SIZE = 5  # Load only 5 items initially for instant response
+                # Get the target annotator's assigned intervals
+                assigned_intervals = db.query(AnnotationInterval).filter(
+                    AnnotationInterval.annotator_id == target_annotator_obj.id
+                ).all()
+                if not assigned_intervals:
+                    log.warning(f"No assigned intervals found for annotator {target_annotator}")
+                    return [], 0, f"Review Target Error: No assigned intervals for annotator '{target_annotator}'.", "", "", "", "", "", "", "", gr.update(value=None, autoplay=False), gr.update(visible=False, value=""), False, gr.update(value="❌ Reject")
+                # Find the first UNREVIEWED annotation within assigned intervals for this reviewer
                 all_reviewed = False
+                first_unreviewed_tts_id = None
+                # Query for the first TTS data ID within assigned intervals that has no validation by this reviewer
+                for interval in assigned_intervals:
+                    if interval.start_index is None or interval.end_index is None:
+                        continue
+                    unreviewed_query = db.query(TTSData.id).join(
+                        Annotation, Annotation.tts_data_id == TTSData.id
+                    ).outerjoin(
+                        Validation,
+                        (Validation.annotation_id == Annotation.id) & (Validation.validator_id == user_id)
+                    ).filter(
+                        Annotation.annotator_id == target_annotator_obj.id,
+                        TTSData.id >= interval.start_index,
+                        TTSData.id <= interval.end_index,
+                        Validation.id.is_(None)  # No validation by this reviewer (fixed SQLAlchemy syntax)
+                    ).order_by(TTSData.id.asc()).first()
+                    if unreviewed_query:
+                        first_unreviewed_tts_id = unreviewed_query[0]
+                        break
+                if first_unreviewed_tts_id is None:
+                    # Everything reviewed: flag and we will load the last batch from the last interval
+                    all_reviewed = True
+                # Count total annotations within assigned intervals for progress info
+                total_count = 0
+                for interval in assigned_intervals:
+                    if interval.start_index is None or interval.end_index is None:
+                        continue
+                    interval_count = db.query(Annotation).join(
+                        TTSData, Annotation.tts_data_id == TTSData.id
+                    ).filter(
+                        Annotation.annotator_id == target_annotator_obj.id,
+                        TTSData.id >= interval.start_index,
+                        TTSData.id <= interval.end_index
                     ).count()
+                    total_count += interval_count
+                # Query to get annotations starting from the first unreviewed item
+                if not all_reviewed and first_unreviewed_tts_id:
+                    # Load from first unreviewed TTS ID
+                    initial_query = db.query(
                         Annotation,
                         TTSData.filename,
                         TTSData.sentence
                     ).join(
                         TTSData, Annotation.tts_data_id == TTSData.id
+                    ).join(
+                        AnnotationInterval,
+                        and_(
+                            AnnotationInterval.annotator_id == target_annotator_obj.id,
+                            TTSData.id >= AnnotationInterval.start_index,
+                            TTSData.id <= AnnotationInterval.end_index
+                        )
                     ).filter(
+                        Annotation.annotator_id == target_annotator_obj.id,
+                        TTSData.id >= first_unreviewed_tts_id
+                    ).order_by(TTSData.id).limit(INITIAL_BATCH_SIZE)
+                    initial_results = initial_query.all()
+                else:
+                    # Everything reviewed or no unreviewed items: load the last batch from assigned intervals
+                    all_reviewed = True
+                    if assigned_intervals and total_count > 0:
+                        # Find the last interval and load the last batch from there
+                        last_interval = max(assigned_intervals, key=lambda x: x.end_index or 0)
+                        if last_interval.start_index is not None and last_interval.end_index is not None:
+                            initial_query = db.query(
+                                Annotation,
+                                TTSData.filename,
+                                TTSData.sentence
+                            ).join(
+                                TTSData, Annotation.tts_data_id == TTSData.id
+                            ).filter(
+                                Annotation.annotator_id == target_annotator_obj.id,
+                                TTSData.id >= last_interval.start_index,
+                                TTSData.id <= last_interval.end_index
+                            ).order_by(TTSData.id.desc()).limit(INITIAL_BATCH_SIZE)
+                            initial_results = initial_query.all()
+                            initial_results.reverse()  # Restore ascending order
+                        else:
+                            initial_results = []
+                    else:
+                        initial_results = []
+                log.info(f"Fast initial load: {len(initial_results)} annotations out of {total_count} total for target annotator ID {target_annotator_obj.id}")
                 # Process items with minimal data - validation status will be loaded on-demand
                 items = []
                     })
                 # Determine initial index inside the loaded batch
+                # - Normal case (has unreviewed): start at 0 (first unreviewed)
+                # - All reviewed: start at last item in the batch for browsing
                 initial_idx = 0
                 if items and all_reviewed:
+                    initial_idx = len(items) - 1
                 # Set initial display
                 if items:
                     # Ensure correct order and number of return values for empty items (14 outputs)
                     return [], 0, f"🔍 **Phase 2 Review Mode** - No annotations found for review.", "", "", "", "", "", "", "", gr.update(value=None, autoplay=False), gr.update(visible=False, value=""), False, gr.update(value="❌ Reject")
+                # except Exception as e:
+                #     log.error(f"Error loading review items: {e}")
+                #     sentry_sdk.capture_exception(e)
+                #     gr.Error(f"Failed to load review data: {e}")
+                #     # Ensure correct order and number of return values for error case (14 outputs)
+                #     return [], 0, "", "", "", "", "", "", "", "", gr.update(value=None, autoplay=False), gr.update(visible=False, value=""), False, gr.update(value="❌ Reject")
         def show_current_review_item_fn(items, idx, session):
             if not items or idx >= len(items) or idx < 0:
                 # tts_id, filename, sentence, ann_sentence, annotated_at, validation_status, annotator_name_placeholder, audio_update, rejection_reason_update, rejection_mode_reset, btn_reject_update
                 return items, items[idx]["validation_status"], rejection_input_update
+                # except Exception as e:
+                #     db.rollback()
+                #     log.error(f"Error saving validation: {e}")
+                #     sentry_sdk.capture_exception(e)
+                #     gr.Error(f"Failed to save validation: {e}")
+                #     return items, current_item["validation_status"], gr.update(visible=False) # Return original status and hide input on error
         def handle_rejection_fn(items, idx, session, rejection_reason, rejection_mode_active):
             """Handle rejection button click - two-step process"""
             if not items or idx >= len(items):
                 if not target_annotator_obj:
                     return items, 0
+                # Get the target annotator's assigned intervals
+                assigned_intervals = db.query(AnnotationInterval).filter(
+                    AnnotationInterval.annotator_id == target_annotator_obj.id
+                ).all()
+                if not assigned_intervals:
+                    return items, 0
+                # Count total annotations within assigned intervals for progress info
+                total_count = 0
+                for interval in assigned_intervals:
+                    if interval.start_index is None or interval.end_index is None:
+                        continue
+                    interval_count = db.query(Annotation).join(
+                        TTSData, Annotation.tts_data_id == TTSData.id
+                    ).filter(
+                        Annotation.annotator_id == target_annotator_obj.id,
+                        TTSData.id >= interval.start_index,
+                        TTSData.id <= interval.end_index
+                    ).count()
+                    total_count += interval_count
+                # Determine the next window based on the last loaded annotation id
+                last_loaded_id = items[-1]["annotation_id"] if items else 0
+                # FAST LOADING: Use id-based pagination within assigned intervals to continue from current position
                 query = db.query(
                     Annotation,
                     TTSData.filename,
                     TTSData.sentence
                 ).join(
                     TTSData, Annotation.tts_data_id == TTSData.id
+                ).join(
+                    AnnotationInterval,
+                    and_(
+                        AnnotationInterval.annotator_id == target_annotator_obj.id,
+                        TTSData.id >= AnnotationInterval.start_index,
+                        TTSData.id <= AnnotationInterval.end_index
+                    )
                 ).filter(
+                    Annotation.annotator_id == target_annotator_obj.id,
+                    Annotation.id > last_loaded_id
+                ).order_by(Annotation.id).limit(current_batch_size)
                 results = query.all()
                 # Combine with existing items
                 all_items = items + new_items
+                log.info(f"Loaded {len(new_items)} more items after id {last_loaded_id}, total now: {len(all_items)}")
                 return all_items, total_count
         # Output definitions