Upload folder using huggingface_hub
src/app.py
CHANGED
@@ -24,7 +24,7 @@ except ImportError:
 # ==================== Application Creation Function ====================
 
 def create_app():
-    """Create and return Gradio application"""
+    """Create and return complete Gradio + MCP application"""
 
     print("🚀 Starting Gradio + FastMCP server")
 
@@ -68,20 +68,7 @@ def create_app():
     ):
         return await mcp_tools.read_text_file_segments(file_path, chunk_size, start_position)
 
-    # Create Gradio interface
-    print("🎨 Creating Gradio interface...")
-    gradio_app = create_gradio_interface()
-
-    # For HF Spaces, return Gradio app directly
-    if os.environ.get("HF_SPACES_MODE") == "1":
-        print("🤗 HF Spaces mode: returning Gradio app directly")
-        print("✅ Server startup completed")
-        print("🎨 Gradio UI: /")
-        print(f"📛 Server name: {mcp.name}")
-        return gradio_app
-
-    # For other environments, create FastAPI wrapper with MCP
-    print("🔧 Creating FastAPI wrapper...")
+    # Create FastAPI wrapper
     fastapi_wrapper = FastAPI(
         title="Modal AudioTranscriber MCP",
         description="Gradio UI + FastMCP Tool + Modal Integration AudioTranscriber MCP",
@@ -95,10 +82,13 @@ def create_app():
     # Mount FastMCP application to /api path
     fastapi_wrapper.mount("/api", mcp_app)
 
+    # Create Gradio interface
+    ui_app = create_gradio_interface()
+
     # Use Gradio's standard mounting approach
     final_app = mount_gradio_app(
         app=fastapi_wrapper,
-        blocks=gradio_app,
+        blocks=ui_app,
         path="",
         app_kwargs={
             "docs_url": "/docs",
@@ -138,31 +128,25 @@ if _modal_available:
 
 # ==================== Main Entry Point ====================
 
-def run_local():
-    """Run in local mode"""
-    print("🚀 Starting in local mode")
-
-    # Set default environment
-    os.environ.setdefault("DEPLOYMENT_MODE", "local")
-
-    app = create_app()
-
-    # Use port 7860 for HF Spaces compatibility
-    port = int(os.environ.get("PORT", 7860))
+def main():
+    """Main entry point for all deployment modes"""
 
-
-
-
-    if os.environ.get("HF_SPACES_MODE") == "1" and hasattr(app, 'launch'):
-        print("🤗 Launching Gradio app for HF Spaces")
-        app.launch(
-            server_name="0.0.0.0",
-            server_port=port,
-            share=False,
-            show_error=True
-        )
+    if is_modal_mode():
+        print("☁️ Modal mode: Use 'modal deploy src.app::gradio_mcp_app'")
+        return None
     else:
-
+        print("🚀 Starting in local mode")
+        print("💡 GPU functions will be routed to Modal endpoints")
+
+        app = create_app()
+        return app
+
+def run_local():
+    """Run local server with uvicorn (for direct execution)"""
+    app = main()
+    if app:
+        # Use port 7860 for HF Spaces compatibility, 8000 for local
+        port = int(os.environ.get("PORT", 7860))  # HF Spaces uses port 7860
         uvicorn.run(
             app,
             host="0.0.0.0",
@@ -170,5 +154,17 @@ def run_local():
         reload=False
     )
 
+# ==================== Hugging Face Spaces Support ====================
+
+# For Hugging Face Spaces, directly create the app
+def get_app():
+    """Get app instance for HF Spaces"""
+    if "DEPLOYMENT_MODE" not in os.environ:
+        os.environ["DEPLOYMENT_MODE"] = "local"
+    return main()
+
+# Create app for HF Spaces when imported
+app = get_app()  # Always create app for HF Spaces
+
 if __name__ == "__main__":
     run_local()
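For orientation, the wrapper-and-mount pattern that the new create_app() settles on reduces to the minimal sketch below. It is hypothetical: build_demo and the commented-out mcp_app mount are placeholders, not names from this repo. A plain FastAPI app hosts the API sub-app, and gr.mount_gradio_app() attaches the Gradio UI at the root path, returning the combined ASGI app.

# Minimal sketch of the mount pattern above; `build_demo` and the
# commented-out `mcp_app` mount are placeholders, not from this repo.
import gradio as gr
import uvicorn
from fastapi import FastAPI

def build_demo() -> gr.Blocks:
    with gr.Blocks() as demo:
        gr.Markdown("AudioTranscriber MCP (placeholder UI)")
    return demo

wrapper = FastAPI(title="Modal AudioTranscriber MCP")
# wrapper.mount("/api", mcp_app)  # the FastMCP ASGI app is mounted here in the real code
app = gr.mount_gradio_app(app=wrapper, blocks=build_demo(), path="")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)  # 7860 is the HF Spaces default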
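The entry-point refactor follows a common pattern for code that must run both under an ASGI host (such as HF Spaces, which imports the module and serves the module-level `app`) and as a script via run_local(). A hedged, self-contained sketch of that contract, with toy stand-ins for the repo's functions:

import os
from fastapi import FastAPI

def create_app() -> FastAPI:
    # Toy stand-in for the diff's create_app(): returns the wrapped ASGI app
    return FastAPI(title="toy app")

def main():
    # Dispatch on deployment mode, as the diff's main() does via is_modal_mode()
    if os.environ.get("DEPLOYMENT_MODE", "local") == "modal":
        print("Modal mode: deploy with the Modal CLI instead of serving here")
        return None
    return create_app()

def get_app():
    os.environ.setdefault("DEPLOYMENT_MODE", "local")
    return main()

# Created at import time so an external host can serve the module-level `app`
app = get_app()

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))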
src/services/__pycache__/distributed_transcription_service.cpython-310.pyc
CHANGED
Binary files a/src/services/__pycache__/distributed_transcription_service.cpython-310.pyc and b/src/services/__pycache__/distributed_transcription_service.cpython-310.pyc differ
src/services/distributed_transcription_service.py
CHANGED
@@ -598,32 +598,54 @@ class DistributedTranscriptionService:
         known_speaker_segments = [seg for seg in all_segments if seg["speaker"] != "UNKNOWN"]
         unknown_speaker_segments = [seg for seg in all_segments if seg["speaker"] == "UNKNOWN"]
 
-
-
-
+        # Only filter UNKNOWN speakers if:
+        # 1. Speaker diarization is enabled, AND
+        # 2. There are some known speakers (meaning diarization was successful)
+        should_filter_unknown = enable_speaker_diarization and len(known_speaker_segments) > 0
+
+        if should_filter_unknown:
+            print(f"📊 Segment distribution (diarization enabled, filtering UNKNOWN):")
+            print(f"   Known speakers: {len(known_speaker_segments)} segments")
+            print(f"   Unknown speakers: {len(unknown_speaker_segments)} segments (will be filtered)")
+
+            # Use only known speaker segments
+            segments_for_output = known_speaker_segments
+        else:
+            # When diarization is disabled OR no speakers were successfully identified,
+            # use all segments regardless of speaker label
+            if enable_speaker_diarization:
+                print(f"📊 Segment distribution (diarization enabled, but no speakers identified):")
+                print(f"   All segments: {len(all_segments)} segments (no speaker filtering - diarization failed)")
+            else:
+                print(f"📊 Segment distribution (diarization disabled):")
+                print(f"   All segments: {len(all_segments)} segments (no speaker filtering)")
+
+            # Use all segments
+            segments_for_output = all_segments
+            unknown_speaker_segments = []  # Don't count as filtered if we're not filtering
 
-        # Generate output files
+        # Generate output files
         output_files = self._generate_output_files(
-            known_speaker_segments,
+            segments_for_output,
             output_format,
-            enable_speaker_diarization
+            should_filter_unknown
         )
 
         # Collect speaker information based on filtered segments
         speaker_info = self._collect_speaker_information_from_segments(
-            known_speaker_segments, enable_speaker_diarization
+            segments_for_output, enable_speaker_diarization
         )
 
         # Determine language (use most common language from chunks)
         languages = [chunk.get("language_detected", "unknown") for chunk in successful_chunks]
         most_common_language = max(set(languages), key=languages.count) if languages else "unknown"
 
-        # Combine text from known speaker segments
-        full_text = " ".join([seg.get("text", "").strip() for seg in known_speaker_segments if seg.get("text", "").strip()])
+        # Combine text from segments used for output
+        full_text = " ".join([seg.get("text", "").strip() for seg in segments_for_output if seg.get("text", "").strip()])
 
         print(f"📊 merge_chunk_results completion summary:")
         print(f"   Total segments collected: {len(all_segments)}")
-        print(f"   Known speaker segments: {len(known_speaker_segments)}")
+        print(f"   Output segments: {len(segments_for_output)}")
         print(f"   Unknown speaker segments filtered: {len(unknown_speaker_segments)}")
         print(f"   Final text length: {len(full_text)} characters")
         print(f"   Language detected: {most_common_language}")
@@ -634,9 +656,9 @@ class DistributedTranscriptionService:
             "txt_file_path": output_files.get("txt_file_path"),
             "srt_file_path": output_files.get("srt_file_path"),
             "audio_duration": total_duration,
-            "segment_count": len(known_speaker_segments),
-            "total_segments_collected": len(all_segments),  # Total including UNKNOWN
-            "unknown_segments_filtered": len(unknown_speaker_segments),  # UNKNOWN segments count
+            "segment_count": len(segments_for_output),  # Count segments used for output
+            "total_segments_collected": len(all_segments),  # Total including any filtered segments
+            "unknown_segments_filtered": len(unknown_speaker_segments),  # UNKNOWN segments count (0 if diarization disabled)
             "language_detected": most_common_language,
             "model_used": successful_chunks[0].get("model_used", "turbo") if successful_chunks else "turbo",
             "distributed_processing": True,
@@ -644,8 +666,8 @@ class DistributedTranscriptionService:
             "chunks_failed": len(failed_chunks),
             "speaker_diarization_enabled": enable_speaker_diarization,
             "speaker_embedding_unified": len(speaker_mapping) > 0 if speaker_mapping else False,
-            "text": full_text,  # Add full text for client-side file saving
-            "segments": known_speaker_segments,
+            "text": full_text,  # Add full text for client-side file saving
+            "segments": segments_for_output,  # Add segments for client-side file saving
             **speaker_info
         }
 
@@ -665,9 +687,9 @@ class DistributedTranscriptionService:
         self,
         segments: List[Dict],
         output_format: str,
-        enable_speaker_diarization: bool
+        should_filter_unknown: bool
     ) -> Dict[str, str]:
-        """Generate output files from merged segments (excluding UNKNOWN speakers)"""
+        """Generate output files from merged segments (filter UNKNOWN speakers only if should_filter_unknown is True)"""
         try:
             # Create output directory
             output_dir = Path(self.cache_dir) / "transcribe"
@@ -685,9 +707,15 @@ class DistributedTranscriptionService:
                 text = segment.get("text", "").strip()
                 speaker = segment.get("speaker", "UNKNOWN")
 
-                # Skip segments with no text
-                if text and speaker != "UNKNOWN":
-                    valid_segments.append(segment)
+                # Skip segments with no text
+                if not text:
+                    continue
+
+                # Only skip UNKNOWN speakers if filtering is enabled
+                if should_filter_unknown and speaker == "UNKNOWN":
+                    continue
+
+                valid_segments.append(segment)
 
             print(f"📝 Generating output files with {len(valid_segments)} valid segments (filtered from {len(segments)} total)")
 
@@ -696,7 +724,7 @@ class DistributedTranscriptionService:
             with open(txt_path, "w", encoding="utf-8") as f:
                 for segment in valid_segments:
                     text = segment.get("text", "").strip()
-                    if "speaker" in segment and segment["speaker"] != "UNKNOWN":
+                    if should_filter_unknown and "speaker" in segment and segment["speaker"] != "UNKNOWN":
                         f.write(f"[{segment['speaker']}] {text}\n")
                     else:
                         f.write(f"{text}\n")
@@ -712,7 +740,7 @@ class DistributedTranscriptionService:
                     end_time = self._format_srt_time(segment.get("end", 0))
                     text = segment.get("text", "").strip()
 
-                    if "speaker" in segment and segment["speaker"] != "UNKNOWN":
+                    if should_filter_unknown and "speaker" in segment and segment["speaker"] != "UNKNOWN":
                         text = f"[{segment['speaker']}] {text}"
 
                     f.write(f"{srt_index}\n")
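The new merge-time rule is small enough to state as a pure function. Below is a sketch under the same assumptions as the diff (segment dicts with "speaker" and "text" keys); it is an illustration, not the repo's actual helper: UNKNOWN segments are dropped only when diarization ran AND identified at least one real speaker.

from typing import Dict, List, Tuple

def select_segments_for_output(
    all_segments: List[Dict], enable_speaker_diarization: bool
) -> Tuple[List[Dict], bool]:
    # Keep UNKNOWN segments unless diarization ran AND found at least one speaker
    known = [s for s in all_segments if s.get("speaker", "UNKNOWN") != "UNKNOWN"]
    should_filter_unknown = enable_speaker_diarization and len(known) > 0
    return (known if should_filter_unknown else all_segments), should_filter_unknown

segments = [{"speaker": "SPEAKER_00", "text": "hello"},
            {"speaker": "UNKNOWN", "text": "world"}]
print(select_segments_for_output(segments, True))   # UNKNOWN segment dropped
print(select_segments_for_output(segments, False))  # everything kept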
|