richard-su committed
Commit d689086 · verified · 1 Parent(s): c88a159

Upload folder using huggingface_hub

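For context, commits with this message are typically produced by huggingface_hub's folder-upload API rather than a manual git push. A minimal sketch, assuming a Space repository (the repo_id and folder_path below are placeholders, not taken from this page):

from huggingface_hub import HfApi

api = HfApi()  # uses the token stored by `huggingface-cli login`
api.upload_folder(
    folder_path=".",                  # local project directory (placeholder)
    repo_id="richard-su/some-space",  # placeholder Space id
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)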
src/app.py CHANGED
@@ -24,7 +24,7 @@ except ImportError:
 # ==================== Application Creation Function ====================
 
 def create_app():
-    """Create and return Gradio application with MCP tools integrated"""
+    """Create and return complete Gradio + MCP application"""
 
     print("🚀 Starting Gradio + FastMCP server")
 
@@ -68,20 +68,7 @@ def create_app():
     ):
         return await mcp_tools.read_text_file_segments(file_path, chunk_size, start_position)
 
-    # Create Gradio interface
-    print("🎨 Creating Gradio interface...")
-    gradio_app = create_gradio_interface()
-
-    # For HF Spaces, return Gradio app directly
-    if os.environ.get("HF_SPACES_MODE") == "1":
-        print("🤗 HF Spaces mode: returning Gradio app directly")
-        print("✅ Server startup completed")
-        print("🎨 Gradio UI: /")
-        print(f"📍 Server name: {mcp.name}")
-        return gradio_app
-
-    # For other environments, create FastAPI wrapper with MCP
-    print("🔧 Creating FastAPI wrapper...")
+    # Create FastAPI wrapper
     fastapi_wrapper = FastAPI(
         title="Modal AudioTranscriber MCP",
         description="Gradio UI + FastMCP Tool + Modal Integration AudioTranscriber MCP",
@@ -95,10 +82,13 @@ def create_app():
     # Mount FastMCP application to /api path
     fastapi_wrapper.mount("/api", mcp_app)
 
+    # Create Gradio interface
+    ui_app = create_gradio_interface()
+
     # Use Gradio's standard mounting approach
     final_app = mount_gradio_app(
         app=fastapi_wrapper,
-        blocks=gradio_app,
+        blocks=ui_app,
         path="",
         app_kwargs={
             "docs_url": "/docs",
@@ -138,31 +128,25 @@ if _modal_available:
 
 # ==================== Main Entry Point ====================
 
-def run_local():
-    """Run local server with uvicorn"""
-    print("🏠 Starting in local mode")
-
-    # Set default environment
-    os.environ.setdefault("DEPLOYMENT_MODE", "local")
-
-    app = create_app()
-
-    # Use port 7860 for HF Spaces compatibility
-    port = int(os.environ.get("PORT", 7860))
+def main():
+    """Main entry point for all deployment modes"""
 
-    print(f"🌐 Starting server on port {port}")
-
-    # For HF Spaces with Gradio app, launch directly
-    if os.environ.get("HF_SPACES_MODE") == "1" and hasattr(app, 'launch'):
-        print("🤗 Launching Gradio app for HF Spaces")
-        app.launch(
-            server_name="0.0.0.0",
-            server_port=port,
-            share=False,
-            show_error=True
-        )
+    if is_modal_mode():
+        print("☁️ Modal mode: Use 'modal deploy src.app::gradio_mcp_app'")
+        return None
     else:
-        # For other environments, use uvicorn
+        print("🏠 Starting in local mode")
+        print("💡 GPU functions will be routed to Modal endpoints")
+
+        app = create_app()
+        return app
+
+def run_local():
+    """Run local server with uvicorn (for direct execution)"""
+    app = main()
+    if app:
+        # Use port 7860 for HF Spaces compatibility, 8000 for local
+        port = int(os.environ.get("PORT", 7860))  # HF Spaces uses port 7860
        uvicorn.run(
            app,
            host="0.0.0.0",
@@ -170,5 +154,17 @@ def run_local():
             reload=False
         )
 
+# ==================== Hugging Face Spaces Support ====================
+
+# For Hugging Face Spaces, directly create the app
+def get_app():
+    """Get app instance for HF Spaces"""
+    if "DEPLOYMENT_MODE" not in os.environ:
+        os.environ["DEPLOYMENT_MODE"] = "local"
+    return main()
+
+# Create app for HF Spaces when imported
+app = get_app()  # Always create app for HF Spaces
+
 if __name__ == "__main__":
     run_local()
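Taken together, the src/app.py changes replace the HF_SPACES_MODE branch with a single app object that every deployment mode consumes. A minimal sketch of the two consumption paths, assuming the module path src/app.py shown in the diff:

# Hugging Face Spaces imports the module; `app = get_app()` runs at import
# time, so the platform finds a ready module-level `app`. Direct execution
# (`python -m src.app`) serves the same object through run_local()/uvicorn.
import importlib

app_module = importlib.import_module("src.app")  # triggers get_app() on import
server_app = app_module.app  # None only in Modal mode, per main() above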
src/services/__pycache__/distributed_transcription_service.cpython-310.pyc CHANGED
Binary files a/src/services/__pycache__/distributed_transcription_service.cpython-310.pyc and b/src/services/__pycache__/distributed_transcription_service.cpython-310.pyc differ
 
src/services/distributed_transcription_service.py CHANGED
@@ -598,32 +598,54 @@ class DistributedTranscriptionService:
         known_speaker_segments = [seg for seg in all_segments if seg["speaker"] != "UNKNOWN"]
         unknown_speaker_segments = [seg for seg in all_segments if seg["speaker"] == "UNKNOWN"]
 
-        print(f"📊 Segment distribution:")
-        print(f"   Known speakers: {len(known_speaker_segments)} segments")
-        print(f"   Unknown speakers: {len(unknown_speaker_segments)} segments (will be filtered)")
+        # Only filter UNKNOWN speakers if:
+        # 1. Speaker diarization is enabled, AND
+        # 2. There are some known speakers (meaning diarization was successful)
+        should_filter_unknown = enable_speaker_diarization and len(known_speaker_segments) > 0
+
+        if should_filter_unknown:
+            print(f"📊 Segment distribution (diarization enabled, filtering UNKNOWN):")
+            print(f"   Known speakers: {len(known_speaker_segments)} segments")
+            print(f"   Unknown speakers: {len(unknown_speaker_segments)} segments (will be filtered)")
+
+            # Use only known speaker segments
+            segments_for_output = known_speaker_segments
+        else:
+            # When diarization is disabled OR no speakers were successfully identified,
+            # use all segments regardless of speaker label
+            if enable_speaker_diarization:
+                print(f"📊 Segment distribution (diarization enabled, but no speakers identified):")
+                print(f"   All segments: {len(all_segments)} segments (no speaker filtering - diarization failed)")
+            else:
+                print(f"📊 Segment distribution (diarization disabled):")
+                print(f"   All segments: {len(all_segments)} segments (no speaker filtering)")
+
+            # Use all segments
+            segments_for_output = all_segments
+            unknown_speaker_segments = []  # Don't count as filtered if we're not filtering
 
-        # Generate output files (excluding UNKNOWN speakers)
+        # Generate output files
         output_files = self._generate_output_files(
-            known_speaker_segments,  # Only include segments with known speakers
+            segments_for_output,
             output_format,
-            enable_speaker_diarization
+            should_filter_unknown
         )
 
         # Collect speaker information based on filtered segments
         speaker_info = self._collect_speaker_information_from_segments(
-            known_speaker_segments, enable_speaker_diarization
+            segments_for_output, enable_speaker_diarization
         )
 
         # Determine language (use most common language from chunks)
         languages = [chunk.get("language_detected", "unknown") for chunk in successful_chunks]
         most_common_language = max(set(languages), key=languages.count) if languages else "unknown"
 
-        # Combine text from known speaker segments only
-        full_text = " ".join([seg.get("text", "").strip() for seg in known_speaker_segments if seg.get("text", "").strip()])
+        # Combine text from segments used for output
+        full_text = " ".join([seg.get("text", "").strip() for seg in segments_for_output if seg.get("text", "").strip()])
 
         print(f"🔗 merge_chunk_results completion summary:")
         print(f"   Total segments collected: {len(all_segments)}")
-        print(f"   Known speaker segments: {len(known_speaker_segments)}")
+        print(f"   Output segments: {len(segments_for_output)}")
         print(f"   Unknown speaker segments filtered: {len(unknown_speaker_segments)}")
         print(f"   Final text length: {len(full_text)} characters")
         print(f"   Language detected: {most_common_language}")
@@ -634,9 +656,9 @@ class DistributedTranscriptionService:
             "txt_file_path": output_files.get("txt_file_path"),
             "srt_file_path": output_files.get("srt_file_path"),
             "audio_duration": total_duration,
-            "segment_count": len(known_speaker_segments),  # Count only known speaker segments
-            "total_segments_collected": len(all_segments),  # Total including UNKNOWN
-            "unknown_segments_filtered": len(unknown_speaker_segments),  # UNKNOWN segments count
+            "segment_count": len(segments_for_output),  # Count segments used for output
+            "total_segments_collected": len(all_segments),  # Total including any filtered segments
+            "unknown_segments_filtered": len(unknown_speaker_segments),  # UNKNOWN segments count (0 if diarization disabled)
             "language_detected": most_common_language,
             "model_used": successful_chunks[0].get("model_used", "turbo") if successful_chunks else "turbo",
             "distributed_processing": True,
@@ -644,8 +666,8 @@ class DistributedTranscriptionService:
             "chunks_failed": len(failed_chunks),
             "speaker_diarization_enabled": enable_speaker_diarization,
             "speaker_embedding_unified": len(speaker_mapping) > 0 if speaker_mapping else False,
-            "text": full_text,  # Add full text for client-side file saving (filtered)
-            "segments": known_speaker_segments,  # Add segments for client-side file saving (filtered)
+            "text": full_text,  # Add full text for client-side file saving
+            "segments": segments_for_output,  # Add segments for client-side file saving
             **speaker_info
         }
 
@@ -665,9 +687,9 @@ class DistributedTranscriptionService:
         self,
         segments: List[Dict],
         output_format: str,
-        enable_speaker_diarization: bool
+        should_filter_unknown: bool
     ) -> Dict[str, str]:
-        """Generate output files from merged segments (excluding UNKNOWN speakers)"""
+        """Generate output files from merged segments (filter UNKNOWN speakers only if should_filter_unknown is True)"""
         try:
             # Create output directory
             output_dir = Path(self.cache_dir) / "transcribe"
@@ -685,9 +707,15 @@ class DistributedTranscriptionService:
                 text = segment.get("text", "").strip()
                 speaker = segment.get("speaker", "UNKNOWN")
 
-                # Skip segments with no text or UNKNOWN speaker
-                if text and speaker != "UNKNOWN":
-                    valid_segments.append(segment)
+                # Skip segments with no text
+                if not text:
+                    continue
+
+                # Only skip UNKNOWN speakers if filtering is enabled
+                if should_filter_unknown and speaker == "UNKNOWN":
+                    continue
+
+                valid_segments.append(segment)
 
             print(f"📝 Generating output files with {len(valid_segments)} valid segments (filtered from {len(segments)} total)")
 
@@ -696,7 +724,7 @@ class DistributedTranscriptionService:
             with open(txt_path, "w", encoding="utf-8") as f:
                 for segment in valid_segments:
                     text = segment.get("text", "").strip()
-                    if enable_speaker_diarization and "speaker" in segment:
+                    if should_filter_unknown and "speaker" in segment and segment["speaker"] != "UNKNOWN":
                         f.write(f"[{segment['speaker']}] {text}\n")
                     else:
                         f.write(f"{text}\n")
@@ -712,7 +740,7 @@ class DistributedTranscriptionService:
                     end_time = self._format_srt_time(segment.get("end", 0))
                     text = segment.get("text", "").strip()
 
-                    if enable_speaker_diarization and "speaker" in segment:
+                    if should_filter_unknown and "speaker" in segment and segment["speaker"] != "UNKNOWN":
                         text = f"[{segment['speaker']}] {text}"
 
                     f.write(f"{srt_index}\n")