vargha committed on
Commit
2d16b47
·
1 Parent(s): e0f4aa9

resume point and pagination

Browse files
Files changed (1) hide show
  1. components/review_dashboard_page.py +68 -82
components/review_dashboard_page.py CHANGED
@@ -312,38 +312,55 @@ class ReviewDashboardPage:
312
 
313
  log.info(f"Found target annotator with ID: {target_annotator_obj.id}")
314
 
315
- # RESUME-OPTIMIZED LOADING: Load enough items to find unreviewed annotations
316
- # Start with a reasonable batch size, but expand if no unreviewed items found
317
- INITIAL_BATCH_SIZE = 20 # Increased from 5 to better find unreviewed items
318
- MAX_SEARCH_BATCH = 100 # Maximum to search for unreviewed items before giving up
319
 
320
- # Query to find first unreviewed annotation efficiently
321
- # First, try to find annotations that haven't been validated by this reviewer
322
- unreviewed_query = db.query(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  Annotation,
324
  TTSData.filename,
325
  TTSData.sentence
326
  ).join(
327
  TTSData, Annotation.tts_data_id == TTSData.id
328
- ).outerjoin(
329
- Validation, (Validation.annotation_id == Annotation.id) & (Validation.validator_id == user_id)
330
  ).filter(
331
- Annotation.annotator_id == target_annotator_obj.id,
332
- Validation.id.is_(None) # No validation record exists for this reviewer
333
- ).order_by(Annotation.id).limit(INITIAL_BATCH_SIZE)
334
 
335
- unreviewed_results = unreviewed_query.all()
336
-
337
- # If we found unreviewed items, use those; otherwise fall back to loading from the beginning
338
- if unreviewed_results:
339
- log.info(f"Found {len(unreviewed_results)} unreviewed annotations for resume")
340
- query_results = unreviewed_results
341
- # All items in this result are unreviewed, so we'll start from the first one
342
- resume_from_unreviewed = True
343
- else:
344
- log.info("No unreviewed annotations found, loading from beginning")
345
- # Fall back to original query - load from beginning
346
- original_query = db.query(
347
  Annotation,
348
  TTSData.filename,
349
  TTSData.sentence
@@ -351,43 +368,19 @@ class ReviewDashboardPage:
351
  TTSData, Annotation.tts_data_id == TTSData.id
352
  ).filter(
353
  Annotation.annotator_id == target_annotator_obj.id
354
- ).order_by(Annotation.id).limit(INITIAL_BATCH_SIZE)
355
-
356
- query_results = original_query.all()
357
- resume_from_unreviewed = False
358
-
359
- # Get total count for progress info (this is fast)
360
- total_count = db.query(Annotation).filter(
361
- Annotation.annotator_id == target_annotator_obj.id
362
- ).count()
363
 
364
- log.info(f"Initial load: {len(query_results)} annotations out of {total_count} total for target annotator ID {target_annotator_obj.id}")
365
 
366
- # Process items and check validation status during initial loading for resume functionality
367
  items = []
368
- first_unreviewed_idx = -1
369
- user_id = session.get("user_id")
370
-
371
- for i, (annotation, filename, sentence) in enumerate(query_results):
372
  # Check if annotation is deleted (minimal processing)
373
  is_deleted = not annotation.annotated_sentence or annotation.annotated_sentence.strip() == ""
374
  annotated_sentence_display = "[DELETED ANNOTATION]" if is_deleted else annotation.annotated_sentence
375
 
376
- # Check validation status - if we loaded unreviewed items, they're all unreviewed
377
- if resume_from_unreviewed:
378
- validation_status = "Not Reviewed (Deleted)" if is_deleted else "Not Reviewed"
379
- else:
380
- # For regular loading, check validation status
381
- validation_status, _ = get_validation_status_for_item(db, annotation.id, user_id, annotation)
382
-
383
- # Track first unreviewed item for resume functionality
384
- if first_unreviewed_idx == -1 and validation_status.startswith("Not Reviewed"):
385
- # Prioritize non-deleted annotations for resume point
386
- if not is_deleted:
387
- first_unreviewed_idx = i
388
- elif first_unreviewed_idx == -1: # If no non-deleted found yet, accept deleted as fallback
389
- first_unreviewed_idx = i
390
-
391
  items.append({
392
  "annotation_id": annotation.id,
393
  "tts_id": annotation.tts_data_id,
@@ -396,30 +389,22 @@ class ReviewDashboardPage:
396
  "annotated_sentence": annotated_sentence_display,
397
  "is_deleted": is_deleted,
398
  "annotated_at": annotation.annotated_at.isoformat() if annotation.annotated_at else "",
399
- "validation_status": validation_status, # Loaded during initial load for resume functionality
400
- "validation_loaded": True # Mark as loaded since we just loaded it
401
  })
402
 
403
- # Resume Logic: Set initial index based on loading strategy
 
 
404
  initial_idx = 0
405
- if items:
406
- if resume_from_unreviewed:
407
- # We loaded unreviewed items, so start from the first one (prioritize non-deleted)
408
- for i, item in enumerate(items):
409
- if not item.get("is_deleted", False):
410
- initial_idx = i
411
- break
412
- # If all are deleted, start from first
413
- if initial_idx == 0 and items[0].get("is_deleted", False):
414
- initial_idx = 0
415
- log.info(f"Reviewer '{username}' resuming from unreviewed annotations, starting at index: {initial_idx} (annotation ID: {items[initial_idx]['annotation_id']})")
416
- elif first_unreviewed_idx != -1:
417
- initial_idx = first_unreviewed_idx
418
- log.info(f"Reviewer '{username}' resuming at first unreviewed item, index: {initial_idx} (annotation ID: {items[initial_idx]['annotation_id']})")
419
- else:
420
- # All items in this batch are reviewed, start from the last item
421
- initial_idx = len(items) - 1 if items else 0
422
- log.info(f"Reviewer '{username}' has no unreviewed items in current batch, starting at last item, index: {initial_idx}")
423
 
424
  # Set initial display
425
  if items:
@@ -687,11 +672,11 @@ class ReviewDashboardPage:
687
  total_count = db.query(Annotation).filter(
688
  Annotation.annotator_id == target_annotator_obj.id
689
  ).count()
690
-
691
- # Load next batch starting from where we left off
692
- offset = len(items)
693
 
694
- # FAST LOADING: Use same strategy as initial load - simple query without complex JOINs
 
 
 
695
  query = db.query(
696
  Annotation,
697
  TTSData.filename,
@@ -699,8 +684,9 @@ class ReviewDashboardPage:
699
  ).join(
700
  TTSData, Annotation.tts_data_id == TTSData.id
701
  ).filter(
702
- Annotation.annotator_id == target_annotator_obj.id
703
- ).order_by(Annotation.id).offset(offset).limit(current_batch_size)
 
704
 
705
  results = query.all()
706
 
@@ -725,7 +711,7 @@ class ReviewDashboardPage:
725
 
726
  # Combine with existing items
727
  all_items = items + new_items
728
- log.info(f"Loaded {len(new_items)} more items, total now: {len(all_items)}")
729
  return all_items, total_count
730
 
731
  # Output definitions
 
312
 
313
  log.info(f"Found target annotator with ID: {target_annotator_obj.id}")
314
 
315
+ # FAST INITIAL QUERY: Load only essential data without complex validation processing
316
+ # Reduced batch size for instant loading in HuggingFace spaces
317
+ INITIAL_BATCH_SIZE = 5 # Load only 5 items initially for instant response
 
318
 
319
+ # Determine resume position: find the first UNREVIEWED annotation for this reviewer
320
+ # If none found (everything reviewed), we'll fall back to the last batch
321
+ all_reviewed = False
322
+ first_unreviewed_row = db.query(Annotation.id).outerjoin(
323
+ Validation,
324
+ (Validation.annotation_id == Annotation.id) & (Validation.validator_id == user_id)
325
+ ).filter(
326
+ Annotation.annotator_id == target_annotator_obj.id,
327
+ Validation.id == None # No validation by this reviewer
328
+ ).order_by(Annotation.id.asc()).first()
329
+
330
+ # Count total annotations for progress info (this is fast)
331
+ total_count = db.query(Annotation).filter(
332
+ Annotation.annotator_id == target_annotator_obj.id
333
+ ).count()
334
+
335
+ # Compute start offset so that the first item in the loaded batch is the first unreviewed
336
+ start_offset = 0
337
+ if first_unreviewed_row is not None:
338
+ first_unreviewed_id = first_unreviewed_row[0]
339
+ start_offset = db.query(Annotation).filter(
340
+ Annotation.annotator_id == target_annotator_obj.id,
341
+ Annotation.id < first_unreviewed_id
342
+ ).count()
343
+ else:
344
+ # Everything reviewed: flag and we will load the last batch
345
+ all_reviewed = True
346
+
347
+ # Simple query to get basic annotation data quickly, starting from resume offset
348
+ initial_query = db.query(
349
  Annotation,
350
  TTSData.filename,
351
  TTSData.sentence
352
  ).join(
353
  TTSData, Annotation.tts_data_id == TTSData.id
 
 
354
  ).filter(
355
+ Annotation.annotator_id == target_annotator_obj.id
356
+ ).order_by(Annotation.id).offset(start_offset).limit(INITIAL_BATCH_SIZE)
 
357
 
358
+ initial_results = initial_query.all()
359
+
360
+ # If everything is reviewed or resume window empty, load the last batch so user can still browse
361
+ if (not initial_results and total_count > 0) or all_reviewed:
362
+ fallback_offset = max(total_count - INITIAL_BATCH_SIZE, 0)
363
+ initial_results = db.query(
 
 
 
 
 
 
364
  Annotation,
365
  TTSData.filename,
366
  TTSData.sentence
 
368
  TTSData, Annotation.tts_data_id == TTSData.id
369
  ).filter(
370
  Annotation.annotator_id == target_annotator_obj.id
371
+ ).order_by(Annotation.id).offset(fallback_offset).limit(INITIAL_BATCH_SIZE).all()
372
+ start_offset = fallback_offset
373
+ all_reviewed = True # Ensure we set this so we start at the end of the batch
 
 
 
 
 
 
374
 
375
+ log.info(f"Fast initial load (offset {start_offset}): {len(initial_results)} annotations out of {total_count} total for target annotator ID {target_annotator_obj.id}")
376
 
377
+ # Process items with minimal data - validation status will be loaded on-demand
378
  items = []
379
+ for annotation, filename, sentence in initial_results:
 
 
 
380
  # Check if annotation is deleted (minimal processing)
381
  is_deleted = not annotation.annotated_sentence or annotation.annotated_sentence.strip() == ""
382
  annotated_sentence_display = "[DELETED ANNOTATION]" if is_deleted else annotation.annotated_sentence
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  items.append({
385
  "annotation_id": annotation.id,
386
  "tts_id": annotation.tts_data_id,
 
389
  "annotated_sentence": annotated_sentence_display,
390
  "is_deleted": is_deleted,
391
  "annotated_at": annotation.annotated_at.isoformat() if annotation.annotated_at else "",
392
+ "validation_status": "Loading...", # Will be loaded on-demand
393
+ "validation_loaded": False # Track if validation status has been loaded
394
  })
395
 
396
+ # Determine initial index inside the loaded batch
397
+ # - Normal case (has unreviewed): start at 0 (first unreviewed)
398
+ # - All reviewed: start at last item in the batch for browsing
399
  initial_idx = 0
400
+ if items and all_reviewed:
401
+ initial_idx = len(items) - 1
402
+ elif items and start_offset + len(items) >= total_count and first_unreviewed_row is not None and start_offset >= total_count:
403
+ initial_idx = len(items) - 1
404
+ elif items and start_offset == max(total_count - INITIAL_BATCH_SIZE, 0) and (first_unreviewed_row is not None and start_offset >= total_count):
405
+ initial_idx = len(items) - 1
406
+ elif items and not initial_results and total_count == 0:
407
+ initial_idx = 0
 
 
 
 
 
 
 
 
 
 
408
 
409
  # Set initial display
410
  if items:
 
672
  total_count = db.query(Annotation).filter(
673
  Annotation.annotator_id == target_annotator_obj.id
674
  ).count()
 
 
 
675
 
676
+ # Determine the next window based on the last loaded annotation id
677
+ last_loaded_id = items[-1]["annotation_id"] if items else 0
678
+
679
+ # FAST LOADING: Use id-based pagination to continue from current position
680
  query = db.query(
681
  Annotation,
682
  TTSData.filename,
 
684
  ).join(
685
  TTSData, Annotation.tts_data_id == TTSData.id
686
  ).filter(
687
+ Annotation.annotator_id == target_annotator_obj.id,
688
+ Annotation.id > last_loaded_id
689
+ ).order_by(Annotation.id).limit(current_batch_size)
690
 
691
  results = query.all()
692
 
 
711
 
712
  # Combine with existing items
713
  all_items = items + new_items
714
+ log.info(f"Loaded {len(new_items)} more items after id {last_loaded_id}, total now: {len(all_items)}")
715
  return all_items, total_count
716
 
717
  # Output definitions