File size: 23,724 Bytes
e1bae5b
4c0be85
e368f8b
fb79caf
e368f8b
 
 
 
 
8cc69ea
0427f41
8cc69ea
fb79caf
e368f8b
 
 
 
 
 
fb79caf
e368f8b
e8ce33d
e368f8b
e8ce33d
 
 
 
 
 
 
 
 
 
 
 
e368f8b
e8ce33d
 
 
 
341746e
e368f8b
e8ce33d
 
 
 
 
 
 
 
 
e368f8b
e8ce33d
e368f8b
 
 
b815c4a
e8ce33d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e368f8b
 
 
 
 
 
 
 
 
 
 
 
f427fe9
8cc69ea
1a539e2
e368f8b
 
0427f41
3fc92f2
 
0427f41
 
 
e368f8b
0427f41
 
 
e368f8b
0427f41
 
 
 
e368f8b
0427f41
 
 
 
 
 
 
 
 
 
 
 
2caaec7
 
 
6d2ca12
8cc69ea
1a539e2
f4644ed
e368f8b
 
1a539e2
e368f8b
1a539e2
e368f8b
 
1a539e2
ef745e1
f890d9b
ef745e1
e368f8b
 
 
 
6d2ca12
8cc69ea
1a539e2
e368f8b
 
 
 
 
 
0427f41
 
c1cd1f5
e368f8b
ef745e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e499054
9eda364
e368f8b
 
 
 
 
 
 
7f5deab
8cc69ea
e368f8b
 
8cc69ea
e368f8b
 
 
 
 
 
 
 
470c6fe
 
 
e368f8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470c6fe
 
7fc8342
f4644ed
e368f8b
 
470c6fe
 
e368f8b
 
 
 
 
9eda364
e368f8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f5deab
e368f8b
375457e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
import os
import json
import gradio as gr
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import logging
import traceback
import sys
from audio_processing import AudioProcessor
import spaces 
from chunkedTranscriber import ChunkedTranscriber


# Root logging setup: INFO and above goes to stdout through a single stream
# handler, using a timestamp / logger name / level / message layout.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[_stdout_handler],
    format=_LOG_FORMAT,
    level=logging.INFO,
)

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

def load_qa_model():
    """Build the text-generation pipeline used for question answering.

    Loads the tokenizer and the AWQ INT4 Llama 3.1 8B Instruct checkpoint,
    then wraps both in a ``text-generation`` pipeline. The Hugging Face access
    token is read from the ``HF_TOKEN`` environment variable.

    Returns:
        The configured pipeline, or ``None`` if anything fails while loading
        (the failure is logged together with its traceback).
    """
    try:
        from transformers import AutoModelForCausalLM, AutoTokenizer

        model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
        # Read the token once; `token=` replaces the deprecated
        # `use_auth_token=` keyword of `from_pretrained`.
        hf_token = os.getenv("HF_TOKEN")

        tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
        tokenizer.model_max_length = 8192  # Let the tokenizer accept up to 8192 tokens

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            # NOTE(review): this overrides whatever rope_scaling the checkpoint
            # config ships with - confirm dynamic scaling with factor 8.0 is
            # intended for this model.
            rope_scaling={
                "type": "dynamic",
                "factor": 8.0,
            },
            token=hf_token,
        )

        return pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=4096,  # Default generation budget; callers may override per call
        )

    except Exception as e:
        # Deliberate best-effort boundary: callers treat None as "model
        # unavailable". logger.exception keeps the traceback in the log.
        logger.exception("Failed to load Q&A model: %s", e)
        return None

# def load_qa_model():
#     """Load question-answering model"""
#     try:
#         model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
#         qa_pipeline = pipeline(
#             "text-generation",
#             model="hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
#             model_kwargs={"torch_dtype": torch.bfloat16},
#             device_map="auto",
#             use_auth_token=os.getenv("HF_TOKEN")
#         )
#         return qa_pipeline
#     except Exception as e:
#         logger.error(f"Failed to load Q&A model: {str(e)}")
#         return None

def load_summarization_model():
    """Create the DistilBART CNN summarization pipeline.

    The pipeline is placed on CUDA device 0 when CUDA is available and is
    given device ``-1`` otherwise.

    Returns:
        The summarization pipeline, or ``None`` if construction fails (the
        error message is logged).
    """
    try:
        if torch.cuda.is_available():
            device_index = 0
        else:
            device_index = -1

        return pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=device_index,
        )
    except Exception as exc:
        logger.error(f"Failed to load summarization model: {str(exc)}")
        return None


@spaces.GPU(duration=120)
def process_audio(audio_file, translate=False):
    """Transcribe an audio file with ``ChunkedTranscriber``.

    Args:
        audio_file: Audio input handed over by the UI (the audio component is
            configured with ``type="filepath"``).
        translate: Forwarded to ``transcribe_audio``. Previously this value
            was ignored and translation was always requested, so the
            "Enable Translation" checkbox had no effect.

    Returns:
        The two values returned by ``transcribe_audio``, in the same order.
    """
    transcriber = ChunkedTranscriber(chunk_size=5, overlap=1)
    # NOTE(review): assumes transcribe_audio returns the same two-value result
    # when translate is False - confirm against ChunkedTranscriber.
    translation, output = transcriber.transcribe_audio(audio_file, translate=translate)
    return translation, output
    # try:
    #     processor = AudioProcessor()
    #     language_segments, final_segments = processor.process_audio(audio_file, translate)
        
    #     # Format output
    #     transcription = ""
    #     full_text = ""
        
    #     # Add language detection information
    #     for segment in language_segments:
    #         transcription += f"Language: {segment['language']}\n"
    #         transcription += f"Time: {segment['start']:.2f}s - {segment['end']:.2f}s\n\n"
        
    #     # Add transcription/translation information
    #     transcription += "Transcription with language detection:\n\n"
    #     for segment in final_segments:
    #         transcription += f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}):\n"
    #         transcription += f"Original: {segment['text']}\n"
    #         if translate and 'translated' in segment:
    #             transcription += f"Translated: {segment['translated']}\n"
    #             full_text += segment['translated'] + " "
    #         else:
    #             full_text += segment['text'] + " "
    #         transcription += "\n"
    #     return transcription, full_text
    # except Exception as e:
    #     logger.error(f"Audio processing failed: {str(e)}")
    #     raise gr.Error(f"Processing failed: {str(e)}")


@spaces.GPU(duration=120)
def summarize_text(text):
    """Summarize ``text`` with the summarization pipeline.

    Args:
        text: The text to summarize (the UI passes the contents of a textbox).

    Returns:
        The summary string, or a human-readable message when there is nothing
        to summarize, the model cannot be loaded, or summarization fails.
    """
    try:
        # Nothing to do for missing or whitespace-only input; skip the model load.
        if not text or not text.strip():
            return "No text available to summarize."

        summarizer = load_summarization_model()
        if summarizer is None:
            return "Summarization model could not be loaded."
        logger.info("Successfully loaded summarization Model")

        # truncation=True clips inputs longer than the model's maximum input
        # length instead of letting the pipeline fail on long transcripts.
        result = summarizer(
            text,
            max_length=150,
            min_length=50,
            do_sample=False,
            truncation=True,
        )
        return result[0]['summary_text']
    except Exception as e:
        # Best-effort boundary for the UI: log with traceback, return a message.
        logger.exception("Summarization failed: %s", e)
        return "Error occurred during summarization."


# System prompt for the Q&A model. It is sent once per request; the original
# inline prompt contained this same text twice in a row and had mis-encoded
# em dashes in the threat-level list.
_QA_SYSTEM_PROMPT = """
Analyze a translated transcript of a conversation that may contain multiple speakers and summarize the information in a structured intelligence document.

The input format will include word-level or sentence-level timestamps, each indicating the speaker ID, language, and translated text.

# Input Format Overview

Word-Level Timestamps Example:
```
[Start Time - End Time] - Speaker <ID> - Language: <Translated Language> - Translated Text: "<Word>"
```
Example:
```
0.01-0.02 - Speaker 1 - Language: English - Translated Text: "Proceed"
0.02-0.025 - Speaker 1 - Language: English - Translated Text: "with"
0.025-0.032 - Speaker 2 - Language: English - Translated Text: "caution"
```

Optional Sentence-Level Structure Example:
```
[Start Time - End Time] - Speaker <ID> - Language: <Translated Language> - Translated Text: "<Sentence>"
```
Example with Sentence Grouping:
```
0.01-0.05 - Speaker 1 - Language: English - Translated Text: "Proceed with caution."
0.06-0.12 - Speaker 2 - Language: English - Translated Text: "All systems are ready."
```

# Intelligence Summary Document Structure

Use the format below to create a structured summary for each conversation transcript received:

### 1. Top-Level Status & Assessment:
- **Threat Level Assessment**:
  - Choose one:
    - Completely Innocuous
    - Likely Innocuous
    - Unclear — Requires Investigation
    - Likely Dangerous — Immediate Action
    - Likely Dangerous — Delayed Action
    - 100% Dangerous — Immediate Action
    - 100% Dangerous — Delayed Action
- **Humanitarian Alert**: Identify any indications of distress, coercion, or need for assistance, such as signs of duress or requests for help.

### 2. Basic Metadata:
- **Number of Speakers**: Total and unique speakers detected.
- **Languages**: List of languages used, with indication of who spoke which language.
- **Location**: Actual or inferred locations of participants.
- **Communication Medium**: Identify the method of interaction (e.g., phone call, direct conversation).

### 3. Conversation Overview:
- **Summary**: Concise breakdown of the main points and context.
- **Alarming Keywords**: Identify any concerning words, including but not limited to keywords like "kill," "attack," "weapon," etc.
- **Suspicious or Cryptic Phrases**: Statements that appear coded or unclear in the context of the discussion.

### 4. In-Depth Analysis:
- **Network Connections**: Identify mentions of additional individuals or groups involved.
- **Intent & Emotional Tone Detection**: Analyze emotional cues (e.g., anger, fear, calmness, urgency). Identify signs of deception or tension.
- **Behavioral Patterns**: Highlight repeated themes, phrases, or signals of planning and coordination.
- **Code Words & Cryptic Language**: Detect terms that may indicate hidden or covert meaning.
- **Geolocation References**: Point out any inferences regarding regional language or place names.
- **Sentiment on Strategic Issues**: Identify any indication of radical, dissenting, or anti-national views that could imply unrest or extremism.

### 5. Resource Mentions & Operational Logistics:
- **Resource & Asset Mentions**: List any mention of tools, weapons, vehicles, or supply logistics.
- **Behavioral Deviations**: Identify shifts in tone, speech, or demeanor suggesting stress, coercion, urgency, or preparation.

### 6. Prioritization, Recommendations & Actionables:
- **High-Risk Alert Priority**: Identify whether the conversation should be flagged for further attention.
- **Recommended Actions**:
  - **Surveillance**: Suggest surveillance if concerning patterns or keywords are detected.
  - **Intervention**: Recommend intervention for urgent/high-risk cases.
  - **Humanitarian Assistance**: Suggest immediate support for any signs of distress.
  - **Follow-Up Analysis**: Identify statements that need deeper review for clarity or to understand potential hidden meanings.

# Steps

1. Analyze the input conversation for participant information and context.
2. Fill in each section of the Intelligence Summary Document structure.
3. Ensure all details, especially those related to potential risk factors or alerts, are captured and highlighted clearly.

# Output Format

Provide one structured Intelligence Summary Document for the conversation in either plain text format or structured JSON.

# JSON Format Example:
```json
{
  "Top-Level Status & Assessment": {
    "Threat Level Assessment": "Unclear - Requires Investigation",
    "Humanitarian Alert": "No distress signals detected."
  },
  "Basic Metadata": {
    "Number of Speakers": 2,
    "Languages": {
      "Speaker 1": "English",
      "Speaker 2": "English"
    },
    "Location": "Unknown",
    "Communication Medium": "Direct conversation"
  },
  "Conversation Overview": {
    "Summary": "A cautious approach was suggested by Speaker 1, followed by an assurance from Speaker 2 that systems are ready.",
    "Alarming Keywords": [],
    "Suspicious or Cryptic Phrases": []
  },
  "In-Depth Analysis": {
    "Network Connections": "None identified",
    "Intent & Emotional Tone Detection": "Calm, precautionary tone",
    "Behavioral Patterns": "Speaker 1 expressing concern, Speaker 2 providing assurance",
    "Code Words & Cryptic Language": [],
    "Geolocation References": [],
    "Sentiment on Strategic Issues": "No radical or dissenting sentiment detected"
  },
  "Resource Mentions & Operational Logistics": {
    "Resource & Asset Mentions": [],
    "Behavioral Deviations": "None noted"
  },
  "Prioritization, Recommendations & Actionables": {
    "High-Risk Alert Priority": "Low",
    "Recommended Actions": {
      "Surveillance": "No further surveillance needed.",
      "Intervention": "Not required.",
      "Humanitarian Assistance": "Not required.",
      "Follow-Up Analysis": "No unusual phrases detected requiring review."
    }
  }
}
```

# Notes

- Ensure that you mark any ambiguous segments as requiring further investigation.
- Pay attention to emotional tone shifts or sudden changes in behavior.
- If any direct or implied threat is detected, prioritize appropriately using the provided classifications.
- Err on the side of caution. In case there is even a remote possibility that there might be something that required human attention, flag it.
"""


@spaces.GPU(duration=120)
def answer_question(context, question):
    """Answer a question about the transcript using the Q&A model.

    Args:
        context: Transcript text the answer should be based on.
        question: The user's question.

    Returns:
        The model's reply, or a human-readable message when the question is
        empty, the model cannot be loaded, or generation fails.
    """
    try:
        # Validate the question first so an empty request never triggers the
        # expensive model load.
        if not question or not question.strip():
            return "Please enter your Question"

        qa_pipeline = load_qa_model()
        if qa_pipeline is None:
            return "Q&A model could not be loaded."

        messages = [
            {"role": "system", "content": _QA_SYSTEM_PROMPT},
            # The original interpolated an undefined name (`text`) here, which
            # raised NameError on every call; the transcript is `context`.
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"},
        ]

        generated = qa_pipeline(messages, max_new_tokens=256)[0]['generated_text']

        # For chat-style input the pipeline returns the whole conversation as
        # a list of messages; the reply is the last entry. Plain-string output
        # is passed through unchanged.
        if isinstance(generated, list) and generated:
            reply = generated[-1]
            if isinstance(reply, dict):
                return reply.get("content", "")
            return str(reply)
        return generated
    except Exception as e:
        # Best-effort boundary for the UI: log with traceback, return a message.
        logger.exception("Q&A failed: %s", e)
        return f"Error occurred during Q&A process: {str(e)}"


# Create Gradio interface
# The layout is defined by the nesting and order of the context managers
# below: a title, a row with the audio controls and the two result boxes, a
# row with the summarization and Q&A controls, and a closing info panel.
with gr.Blocks() as iface:
    gr.Markdown("# Automatic Speech Recognition for Indic Languages")
    
    with gr.Row():
        # Left column: audio input (handed to the handler as a file path),
        # the translation toggle and the button that starts processing.
        with gr.Column():
            audio_input = gr.Audio(type="filepath")
            translate_checkbox = gr.Checkbox(label="Enable Translation")
            process_button = gr.Button("Process Audio")
        
        # Right column: the two text results filled in by process_audio.
        with gr.Column():
            # ASR_RESULT = gr.Textbox(label="Output")
            full_text_output = gr.Textbox(label="Full Text", lines=5)
            translation_output = gr.Textbox(label="Transcription/Translation", lines=10)
    
    with gr.Row():
        # Summarization controls.
        with gr.Column():
            summarize_button = gr.Button("Summarize")
            summary_output = gr.Textbox(label="Summary", lines=3)
            
        # Question-answering controls.
        with gr.Column():
            question_input = gr.Textbox(label="Ask a question about the transcription")
            answer_button = gr.Button("Get Answer")
            answer_output = gr.Textbox(label="Answer", lines=3)
    
    # Set up event handlers
    # process_audio returns two values: the first is shown in
    # translation_output, the second in full_text_output.
    process_button.click(
        process_audio,
        inputs=[audio_input, translate_checkbox],
        outputs=[translation_output, full_text_output]
        # outputs=[ASR_RESULT]
    )
    # translated_text = ''.join(item['translated'] for item in ASR_RESULT if 'translated' in item)
    # Summarization reads the contents of translation_output.
    summarize_button.click(
        summarize_text,
        # inputs=[ASR_RESULT],
        inputs=[translation_output],
        outputs=[summary_output]
    )
    
    # Q&A receives full_text_output as the context and question_input as the
    # question, in that order.
    answer_button.click(
        answer_question,
        inputs=[full_text_output, question_input],
        outputs=[answer_output]
    )
    
    # Add system information
    # The f-string is evaluated once, while the interface is being built, so
    # the CUDA status shown reflects that moment.
    gr.Markdown(f"""
    ## System Information
    - Device: {"CUDA" if torch.cuda.is_available() else "CPU"}
    - CUDA Available: {"Yes" if torch.cuda.is_available() else "No"}
    
    ## Features
    - Automatic language detection
    - High-quality transcription using MMS
    - Optional translation to English
    - Text summarization
    - Question answering
    """)

# Start the app only when this file is run as a script.
# NOTE(review): server_port=None presumably lets Gradio choose the port -
# confirm against the Gradio launch() documentation.
if __name__ == "__main__":
    iface.launch(server_port=None)