richard-su committed
Commit d689086 · verified · 1 Parent(s): c88a159

Upload folder using huggingface_hub

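For context, commits with this message are typically produced by huggingface_hub's folder-upload API rather than a manual git push. A minimal sketch, assuming a Space repository (the repo_id and folder_path below are placeholders, not taken from this page):

from huggingface_hub import HfApi

api = HfApi()  # uses the token stored by `huggingface-cli login`
api.upload_folder(
    folder_path=".",                  # local project directory (placeholder)
    repo_id="richard-su/some-space",  # placeholder Space id
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)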
src/app.py CHANGED
@@ -24,7 +24,7 @@ except ImportError:
 # ==================== Application Creation Function ====================
 
 def create_app():
-    """Create and return Gradio application with MCP tools integrated"""
+    """Create and return complete Gradio + MCP application"""
 
     print("🚀 Starting Gradio + FastMCP server")
 
@@ -68,20 +68,7 @@ def create_app():
     ):
         return await mcp_tools.read_text_file_segments(file_path, chunk_size, start_position)
 
-    # Create Gradio interface
-    print("🎨 Creating Gradio interface...")
-    gradio_app = create_gradio_interface()
-
-    # For HF Spaces, return Gradio app directly
-    if os.environ.get("HF_SPACES_MODE") == "1":
-        print("🤗 HF Spaces mode: returning Gradio app directly")
-        print("✅ Server startup completed")
-        print("🎨 Gradio UI: /")
-        print(f"📍 Server name: {mcp.name}")
-        return gradio_app
-
-    # For other environments, create FastAPI wrapper with MCP
-    print("🔧 Creating FastAPI wrapper...")
+    # Create FastAPI wrapper
     fastapi_wrapper = FastAPI(
         title="Modal AudioTranscriber MCP",
         description="Gradio UI + FastMCP Tool + Modal Integration AudioTranscriber MCP",
@@ -95,10 +82,13 @@ def create_app():
     # Mount FastMCP application to /api path
     fastapi_wrapper.mount("/api", mcp_app)
 
+    # Create Gradio interface
+    ui_app = create_gradio_interface()
+
     # Use Gradio's standard mounting approach
     final_app = mount_gradio_app(
         app=fastapi_wrapper,
-        blocks=gradio_app,
+        blocks=ui_app,
         path="",
         app_kwargs={
             "docs_url": "/docs",
@@ -138,31 +128,25 @@ if _modal_available:
 
 # ==================== Main Entry Point ====================
 
-def run_local():
-    """Run local server with uvicorn"""
-    print("🏠 Starting in local mode")
-
-    # Set default environment
-    os.environ.setdefault("DEPLOYMENT_MODE", "local")
-
-    app = create_app()
-
-    # Use port 7860 for HF Spaces compatibility
-    port = int(os.environ.get("PORT", 7860))
+def main():
+    """Main entry point for all deployment modes"""
 
-    print(f"🌐 Starting server on port {port}")
-
-    # For HF Spaces with Gradio app, launch directly
-    if os.environ.get("HF_SPACES_MODE") == "1" and hasattr(app, 'launch'):
-        print("🤗 Launching Gradio app for HF Spaces")
-        app.launch(
-            server_name="0.0.0.0",
-            server_port=port,
-            share=False,
-            show_error=True
-        )
+    if is_modal_mode():
+        print("☁️ Modal mode: Use 'modal deploy src.app::gradio_mcp_app'")
+        return None
     else:
-        # For other environments, use uvicorn
+        print("🏠 Starting in local mode")
+        print("💡 GPU functions will be routed to Modal endpoints")
+
+        app = create_app()
+        return app
+
+def run_local():
+    """Run local server with uvicorn (for direct execution)"""
+    app = main()
+    if app:
+        # Use port 7860 for HF Spaces compatibility, 8000 for local
+        port = int(os.environ.get("PORT", 7860))  # HF Spaces uses port 7860
        uvicorn.run(
            app,
            host="0.0.0.0",
@@ -170,5 +154,17 @@ def run_local():
             reload=False
         )
 
+# ==================== Hugging Face Spaces Support ====================
+
+# For Hugging Face Spaces, directly create the app
+def get_app():
+    """Get app instance for HF Spaces"""
+    if "DEPLOYMENT_MODE" not in os.environ:
+        os.environ["DEPLOYMENT_MODE"] = "local"
+    return main()
+
+# Create app for HF Spaces when imported
+app = get_app()  # Always create app for HF Spaces
+
 if __name__ == "__main__":
     run_local()
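Taken together, the src/app.py changes replace the HF_SPACES_MODE branch with a single app object that every deployment mode consumes. A minimal sketch of the two consumption paths, assuming the module path src/app.py shown in the diff:

# Hugging Face Spaces imports the module; `app = get_app()` runs at import
# time, so the platform finds a ready module-level `app`. Direct execution
# (`python -m src.app`) serves the same object through run_local()/uvicorn.
import importlib

app_module = importlib.import_module("src.app")  # triggers get_app() on import
server_app = app_module.app  # None only in Modal mode, per main() above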
src/services/__pycache__/distributed_transcription_service.cpython-310.pyc CHANGED
Binary files a/src/services/__pycache__/distributed_transcription_service.cpython-310.pyc and b/src/services/__pycache__/distributed_transcription_service.cpython-310.pyc differ
 
src/services/distributed_transcription_service.py CHANGED
@@ -598,32 +598,54 @@ class DistributedTranscriptionService:
         known_speaker_segments = [seg for seg in all_segments if seg["speaker"] != "UNKNOWN"]
         unknown_speaker_segments = [seg for seg in all_segments if seg["speaker"] == "UNKNOWN"]
 
-        print(f"📊 Segment distribution:")
-        print(f"   Known speakers: {len(known_speaker_segments)} segments")
-        print(f"   Unknown speakers: {len(unknown_speaker_segments)} segments (will be filtered)")
+        # Only filter UNKNOWN speakers if:
+        # 1. Speaker diarization is enabled, AND
+        # 2. There are some known speakers (meaning diarization was successful)
+        should_filter_unknown = enable_speaker_diarization and len(known_speaker_segments) > 0
+
+        if should_filter_unknown:
+            print(f"📊 Segment distribution (diarization enabled, filtering UNKNOWN):")
+            print(f"   Known speakers: {len(known_speaker_segments)} segments")
+            print(f"   Unknown speakers: {len(unknown_speaker_segments)} segments (will be filtered)")
+
+            # Use only known speaker segments
+            segments_for_output = known_speaker_segments
+        else:
+            # When diarization is disabled OR no speakers were successfully identified,
+            # use all segments regardless of speaker label
+            if enable_speaker_diarization:
+                print(f"📊 Segment distribution (diarization enabled, but no speakers identified):")
+                print(f"   All segments: {len(all_segments)} segments (no speaker filtering - diarization failed)")
+            else:
+                print(f"📊 Segment distribution (diarization disabled):")
+                print(f"   All segments: {len(all_segments)} segments (no speaker filtering)")
+
+            # Use all segments
+            segments_for_output = all_segments
+            unknown_speaker_segments = []  # Don't count as filtered if we're not filtering
 
-        # Generate output files (excluding UNKNOWN speakers)
+        # Generate output files
         output_files = self._generate_output_files(
-            known_speaker_segments,  # Only include segments with known speakers
+            segments_for_output,
             output_format,
-            enable_speaker_diarization
+            should_filter_unknown
         )
 
         # Collect speaker information based on filtered segments
         speaker_info = self._collect_speaker_information_from_segments(
-            known_speaker_segments, enable_speaker_diarization
+            segments_for_output, enable_speaker_diarization
         )
 
         # Determine language (use most common language from chunks)
         languages = [chunk.get("language_detected", "unknown") for chunk in successful_chunks]
         most_common_language = max(set(languages), key=languages.count) if languages else "unknown"
 
-        # Combine text from known speaker segments only
-        full_text = " ".join([seg.get("text", "").strip() for seg in known_speaker_segments if seg.get("text", "").strip()])
+        # Combine text from segments used for output
+        full_text = " ".join([seg.get("text", "").strip() for seg in segments_for_output if seg.get("text", "").strip()])
 
         print(f"🔗 merge_chunk_results completion summary:")
         print(f"   Total segments collected: {len(all_segments)}")
-        print(f"   Known speaker segments: {len(known_speaker_segments)}")
+        print(f"   Output segments: {len(segments_for_output)}")
         print(f"   Unknown speaker segments filtered: {len(unknown_speaker_segments)}")
         print(f"   Final text length: {len(full_text)} characters")
         print(f"   Language detected: {most_common_language}")
@@ -634,9 +656,9 @@ class DistributedTranscriptionService:
             "txt_file_path": output_files.get("txt_file_path"),
             "srt_file_path": output_files.get("srt_file_path"),
             "audio_duration": total_duration,
-            "segment_count": len(known_speaker_segments),  # Count only known speaker segments
-            "total_segments_collected": len(all_segments),  # Total including UNKNOWN
-            "unknown_segments_filtered": len(unknown_speaker_segments),  # UNKNOWN segments count
+            "segment_count": len(segments_for_output),  # Count segments used for output
+            "total_segments_collected": len(all_segments),  # Total including any filtered segments
+            "unknown_segments_filtered": len(unknown_speaker_segments),  # UNKNOWN segments count (0 if diarization disabled)
             "language_detected": most_common_language,
             "model_used": successful_chunks[0].get("model_used", "turbo") if successful_chunks else "turbo",
             "distributed_processing": True,
@@ -644,8 +666,8 @@ class DistributedTranscriptionService:
             "chunks_failed": len(failed_chunks),
             "speaker_diarization_enabled": enable_speaker_diarization,
             "speaker_embedding_unified": len(speaker_mapping) > 0 if speaker_mapping else False,
-            "text": full_text,  # Add full text for client-side file saving (filtered)
-            "segments": known_speaker_segments,  # Add segments for client-side file saving (filtered)
+            "text": full_text,  # Add full text for client-side file saving
+            "segments": segments_for_output,  # Add segments for client-side file saving
             **speaker_info
         }
 
@@ -665,9 +687,9 @@ class DistributedTranscriptionService:
         self,
         segments: List[Dict],
         output_format: str,
-        enable_speaker_diarization: bool
+        should_filter_unknown: bool
     ) -> Dict[str, str]:
-        """Generate output files from merged segments (excluding UNKNOWN speakers)"""
+        """Generate output files from merged segments (filter UNKNOWN speakers only if should_filter_unknown is True)"""
         try:
             # Create output directory
             output_dir = Path(self.cache_dir) / "transcribe"
@@ -685,9 +707,15 @@ class DistributedTranscriptionService:
                 text = segment.get("text", "").strip()
                 speaker = segment.get("speaker", "UNKNOWN")
 
-                # Skip segments with no text or UNKNOWN speaker
-                if text and speaker != "UNKNOWN":
-                    valid_segments.append(segment)
+                # Skip segments with no text
+                if not text:
+                    continue
+
+                # Only skip UNKNOWN speakers if filtering is enabled
+                if should_filter_unknown and speaker == "UNKNOWN":
+                    continue
+
+                valid_segments.append(segment)
 
             print(f"📝 Generating output files with {len(valid_segments)} valid segments (filtered from {len(segments)} total)")
 
@@ -696,7 +724,7 @@ class DistributedTranscriptionService:
             with open(txt_path, "w", encoding="utf-8") as f:
                 for segment in valid_segments:
                     text = segment.get("text", "").strip()
-                    if enable_speaker_diarization and "speaker" in segment:
+                    if should_filter_unknown and "speaker" in segment and segment["speaker"] != "UNKNOWN":
                         f.write(f"[{segment['speaker']}] {text}\n")
                     else:
                         f.write(f"{text}\n")
@@ -712,7 +740,7 @@ class DistributedTranscriptionService:
                     end_time = self._format_srt_time(segment.get("end", 0))
                     text = segment.get("text", "").strip()
 
-                    if enable_speaker_diarization and "speaker" in segment:
+                    if should_filter_unknown and "speaker" in segment and segment["speaker"] != "UNKNOWN":
                         text = f"[{segment['speaker']}] {text}"
 
                     f.write(f"{srt_index}\n")