ASRfr

Paused

File size: 23,724 Bytes

import os
import json
import gradio as gr
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import logging
import traceback
import sys
from audio_processing import AudioProcessor
import spaces 
from chunkedTranscriber import ChunkedTranscriber


# Configure the root logger: INFO and above, timestamped, written to stdout.
logging.basicConfig(
    handlers=[logging.StreamHandler(stream=sys.stdout)],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)

def load_qa_model():
    """Build the text-generation pipeline used for question answering.

    Loads the tokenizer and model for
    ``hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4``, authenticating
    with the ``HF_TOKEN`` environment variable, and wraps them in a
    ``text-generation`` pipeline.

    Returns:
        The pipeline, or ``None`` when any loading step fails (the failure
        is logged together with its traceback).
    """
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM

        model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
        # Read the access token once; `token` replaces the deprecated
        # `use_auth_token` keyword of `from_pretrained`.
        hf_token = os.getenv("HF_TOKEN")

        tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
        # Let the tokenizer accept inputs of up to 8192 tokens.
        tokenizer.model_max_length = 8192

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            # NOTE(review): AWQ checkpoints are commonly run in float16;
            # confirm that bfloat16 is intended here.
            torch_dtype=torch.bfloat16,
            device_map="auto",
            # Overrides the checkpoint's own RoPE configuration to target
            # long contexts.
            # NOTE(review): confirm this override is still wanted for a
            # Llama 3.1 checkpoint.
            rope_scaling={
                "type": "dynamic",
                "factor": 8.0,
            },
            token=hf_token,
        )

        # Default generation budget; callers may pass their own
        # `max_new_tokens` when invoking the pipeline.
        return pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=4096,
        )
    except Exception as exc:
        # logger.exception keeps the traceback, which a plain error message
        # would discard.
        logger.exception("Failed to load Q&A model: %s", exc)
        return None

# def load_qa_model():
#     """Load question-answering model"""
#     try:
#         model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
#         qa_pipeline = pipeline(
#             "text-generation",
#             model="hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
#             model_kwargs={"torch_dtype": torch.bfloat16},
#             device_map="auto",
#             use_auth_token=os.getenv("HF_TOKEN")
#         )
#         return qa_pipeline
#     except Exception as e:
#         logger.error(f"Failed to load Q&A model: {str(e)}")
#         return None

def load_summarization_model():
    """Create the ``sshleifer/distilbart-cnn-12-6`` summarization pipeline.

    The pipeline is placed on device ``0`` when CUDA is available and on
    device ``-1`` otherwise.

    Returns:
        The summarization pipeline, or ``None`` if it cannot be created
        (the failure is logged).
    """
    try:
        if torch.cuda.is_available():
            device_index = 0
        else:
            device_index = -1
        return pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=device_index,
        )
    except Exception as exc:
        logger.error(f"Failed to load summarization model: {str(exc)}")
        return None


@spaces.GPU(duration=120)
def process_audio(audio_file, translate=False):
    """Transcribe an audio file with ``ChunkedTranscriber``.

    Args:
        audio_file: Audio input handed to the transcriber (the UI supplies
            a file path).
        translate: Whether translation is requested. It is forwarded to
            ``transcribe_audio`` instead of being forced to ``True``, so
            the caller's choice takes effect.

    Returns:
        The two values returned by ``transcribe_audio``, in the same order.
    """
    transcriber = ChunkedTranscriber(chunk_size=5, overlap=1)
    # NOTE(review): assumes transcribe_audio returns the same two-element
    # result when translate is False; confirm against ChunkedTranscriber.
    translation, output = transcriber.transcribe_audio(
        audio_file, translate=translate
    )
    return translation, output
    # try:
    #     processor = AudioProcessor()
    #     language_segments, final_segments = processor.process_audio(audio_file, translate)
        
    #     # Format output
    #     transcription = ""
    #     full_text = ""
        
    #     # Add language detection information
    #     for segment in language_segments:
    #         transcription += f"Language: {segment['language']}\n"
    #         transcription += f"Time: {segment['start']:.2f}s - {segment['end']:.2f}s\n\n"
        
    #     # Add transcription/translation information
    #     transcription += "Transcription with language detection:\n\n"
    #     for segment in final_segments:
    #         transcription += f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}):\n"
    #         transcription += f"Original: {segment['text']}\n"
    #         if translate and 'translated' in segment:
    #             transcription += f"Translated: {segment['translated']}\n"
    #             full_text += segment['translated'] + " "
    #         else:
    #             full_text += segment['text'] + " "
    #         transcription += "\n"
    #     return transcription, full_text
    # except Exception as e:
    #     logger.error(f"Audio processing failed: {str(e)}")
    #     raise gr.Error(f"Processing failed: {str(e)}")


@spaces.GPU(duration=120)
def summarize_text(text):
    """Summarize ``text`` with the summarization pipeline.

    Args:
        text: The text to summarize.

    Returns:
        The generated summary, or a short human-readable message when there
        is nothing to summarize, the model cannot be loaded, or
        summarization fails. This function does not raise.
    """
    try:
        # Nothing to do for missing or whitespace-only input; skip the
        # model load entirely.
        if not text or not text.strip():
            return "No text available to summarize."

        summarizer = load_summarization_model()
        if summarizer is None:
            return "Summarization model could not be loaded."
        logger.info("Successfully loaded summarization Model")

        # truncation=True clips inputs longer than the model's maximum
        # length instead of letting the call fail on long transcripts.
        result = summarizer(
            text,
            max_length=150,
            min_length=50,
            do_sample=False,
            truncation=True,
        )
        return result[0]["summary_text"]
    except Exception as exc:
        # Keep the traceback in the log; the caller only sees the message.
        logger.exception("Summarization failed: %s", exc)
        return "Error occurred during summarization."


# System prompt sent with every Q&A request. It holds a single copy of the
# analyst instructions; the indentation inside the literal is part of the
# prompt text.
_QA_SYSTEM_PROMPT = """
            Analyze a translated transcript of a conversation that may contain multiple speakers and summarize the information in a structured intelligence document.

            The input format will include word-level or sentence-level timestamps, each indicating the speaker ID, language, and translated text.
            
            # Input Format Overview
            
            Word-Level Timestamps Example:
            ```
            [Start Time - End Time] - Speaker <ID> - Language: <Translated Language> - Translated Text: "<Word>"
            ```
            Example:
            ```
            0.01-0.02 - Speaker 1 - Language: English - Translated Text: "Proceed"
            0.02-0.025 - Speaker 1 - Language: English - Translated Text: "with"
            0.025-0.032 - Speaker 2 - Language: English - Translated Text: "caution"
            ```
            
            Optional Sentence-Level Structure Example:
            ```
            [Start Time - End Time] - Speaker <ID> - Language: <Translated Language> - Translated Text: "<Sentence>"
            ```
            Example with Sentence Grouping:
            ```
            0.01-0.05 - Speaker 1 - Language: English - Translated Text: "Proceed with caution."
            0.06-0.12 - Speaker 2 - Language: English - Translated Text: "All systems are ready."
            ```
            
            # Intelligence Summary Document Structure
            
            Use the format below to create a structured summary for each conversation transcript received:
            
            ### 1. Top-Level Status & Assessment:
            - **Threat Level Assessment**:
              - Choose one:
                - Completely Innocuous
                - Likely Innocuous
                - Unclear — Requires Investigation
                - Likely Dangerous — Immediate Action
                - Likely Dangerous — Delayed Action
                - 100% Dangerous — Immediate Action
                - 100% Dangerous — Delayed Action
            - **Humanitarian Alert**: Identify any indications of distress, coercion, or need for assistance, such as signs of duress or requests for help.
            
            ### 2. Basic Metadata:
            - **Number of Speakers**: Total and unique speakers detected.
            - **Languages**: List of languages used, with indication of who spoke which language.
            - **Location**: Actual or inferred locations of participants.
            - **Communication Medium**: Identify the method of interaction (e.g., phone call, direct conversation).
            
            ### 3. Conversation Overview:
            - **Summary**: Concise breakdown of the main points and context.
            - **Alarming Keywords**: Identify any concerning words, including but not limited to keywords like "kill," "attack," "weapon," etc.
            - **Suspicious or Cryptic Phrases**: Statements that appear coded or unclear in the context of the discussion.
            
            ### 4. In-Depth Analysis:
            - **Network Connections**: Identify mentions of additional individuals or groups involved.
            - **Intent & Emotional Tone Detection**: Analyze emotional cues (e.g., anger, fear, calmness, urgency). Identify signs of deception or tension.
            - **Behavioral Patterns**: Highlight repeated themes, phrases, or signals of planning and coordination.
            - **Code Words & Cryptic Language**: Detect terms that may indicate hidden or covert meaning.
            - **Geolocation References**: Point out any inferences regarding regional language or place names.
            - **Sentiment on Strategic Issues**: Identify any indication of radical, dissenting, or anti-national views that could imply unrest or extremism.
            
            ### 5. Resource Mentions & Operational Logistics:
            - **Resource & Asset Mentions**: List any mention of tools, weapons, vehicles, or supply logistics.
            - **Behavioral Deviations**: Identify shifts in tone, speech, or demeanor suggesting stress, coercion, urgency, or preparation.
            
            ### 6. Prioritization, Recommendations & Actionables:
            - **High-Risk Alert Priority**: Identify whether the conversation should be flagged for further attention.
            - **Recommended Actions**:
              - **Surveillance**: Suggest surveillance if concerning patterns or keywords are detected.
              - **Intervention**: Recommend intervention for urgent/high-risk cases.
              - **Humanitarian Assistance**: Suggest immediate support for any signs of distress.
              - **Follow-Up Analysis**: Identify statements that need deeper review for clarity or to understand potential hidden meanings.
            
            # Steps
            
            1. Analyze the input conversation for participant information and context.
            2. Fill in each section of the Intelligence Summary Document structure.
            3. Ensure all details, especially those related to potential risk factors or alerts, are captured and highlighted clearly.
            
            # Output Format
            
            Provide one structured Intelligence Summary Document for the conversation in either plain text format or structured JSON.
            
            # JSON Format Example:
            ```json
            {
              "Top-Level Status & Assessment": {
                "Threat Level Assessment": "Unclear - Requires Investigation",
                "Humanitarian Alert": "No distress signals detected."
              },
              "Basic Metadata": {
                "Number of Speakers": 2,
                "Languages": {
                  "Speaker 1": "English",
                  "Speaker 2": "English"
                },
                "Location": "Unknown",
                "Communication Medium": "Direct conversation"
              },
              "Conversation Overview": {
                "Summary": "A cautious approach was suggested by Speaker 1, followed by an assurance from Speaker 2 that systems are ready.",
                "Alarming Keywords": [],
                "Suspicious or Cryptic Phrases": []
              },
              "In-Depth Analysis": {
                "Network Connections": "None identified",
                "Intent & Emotional Tone Detection": "Calm, precautionary tone",
                "Behavioral Patterns": "Speaker 1 expressing concern, Speaker 2 providing assurance",
                "Code Words & Cryptic Language": [],
                "Geolocation References": [],
                "Sentiment on Strategic Issues": "No radical or dissenting sentiment detected"
              },
              "Resource Mentions & Operational Logistics": {
                "Resource & Asset Mentions": [],
                "Behavioral Deviations": "None noted"
              },
              "Prioritization, Recommendations & Actionables": {
                "High-Risk Alert Priority": "Low",
                "Recommended Actions": {
                  "Surveillance": "No further surveillance needed.",
                  "Intervention": "Not required.",
                  "Humanitarian Assistance": "Not required.",
                  "Follow-Up Analysis": "No unusual phrases detected requiring review."
                }
              }
            }
            ```
            
            # Notes
            
            - Ensure that you mark any ambiguous segments as requiring further investigation.
            - Pay attention to emotional tone shifts or sudden changes in behavior.  
            - If any direct or implied threat is detected, prioritize appropriately using the provided classifications.
            - Err on the side of caution. In case there is even a remote possibility that there might be something that required human attention, flag it.

            """


@spaces.GPU(duration=120)
def answer_question(context, question):
    """Answer ``question`` about ``context`` using the Q&A pipeline.

    Args:
        context: The transcript text the model should analyze.
        question: The user's question; an empty value yields a prompt to
            enter one.

    Returns:
        The model's reply text, or a short human-readable message when the
        question is missing, the model cannot be loaded, or generation
        fails. This function does not raise.
    """
    try:
        # Validate the cheap input first so an empty question never
        # triggers a model load.
        if not question:
            return "Please enter your Question"

        qa_pipeline = load_qa_model()
        if qa_pipeline is None:
            return "Q&A model could not be loaded."

        messages = [
            {"role": "system", "content": _QA_SYSTEM_PROMPT},
            # Uses the `context` parameter; the name `text` is not defined
            # in this function.
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"},
        ]

        generated = qa_pipeline(messages, max_new_tokens=256)[0]["generated_text"]
        # For chat-style input the pipeline returns the whole conversation
        # as a list of messages; the reply is the last entry.
        if isinstance(generated, list):
            return generated[-1]["content"]
        return generated
    except Exception as exc:
        # Keep the traceback in the log; the caller only sees the message.
        logger.exception("Q&A failed: %s", exc)
        return f"Error occurred during Q&A process: {str(exc)}"


# Create Gradio interface
# Layout: the first row holds the audio controls and the two transcription
# text boxes; the second row holds the summarization and Q&A controls.
with gr.Blocks() as iface:
    gr.Markdown("# Automatic Speech Recognition for Indic Languages")
    
    with gr.Row():
        with gr.Column():
            # The audio component hands the handler a file path.
            audio_input = gr.Audio(type="filepath")
            translate_checkbox = gr.Checkbox(label="Enable Translation")
            process_button = gr.Button("Process Audio")
        
        with gr.Column():
            # ASR_RESULT = gr.Textbox(label="Output")
            full_text_output = gr.Textbox(label="Full Text", lines=5)
            translation_output = gr.Textbox(label="Transcription/Translation", lines=10)
    
    with gr.Row():
        with gr.Column():
            summarize_button = gr.Button("Summarize")
            summary_output = gr.Textbox(label="Summary", lines=3)
            
        with gr.Column():
            question_input = gr.Textbox(label="Ask a question about the transcription")
            answer_button = gr.Button("Get Answer")
            answer_output = gr.Textbox(label="Answer", lines=3)
    
    # Set up event handlers
    # process_audio receives (audio path, checkbox state); its two return
    # values fill translation_output and full_text_output, in that order.
    process_button.click(
        process_audio,
        inputs=[audio_input, translate_checkbox],
        outputs=[translation_output, full_text_output]
        # outputs=[ASR_RESULT]
    )
    # translated_text = ''.join(item['translated'] for item in ASR_RESULT if 'translated' in item)
    # Summarization reads the "Transcription/Translation" box.
    summarize_button.click(
        summarize_text,
        # inputs=[ASR_RESULT],
        inputs=[translation_output],
        outputs=[summary_output]
    )
    
    # Q&A uses the "Full Text" box as context plus the user's question.
    answer_button.click(
        answer_question,
        inputs=[full_text_output, question_input],
        outputs=[answer_output]
    )
    
    # Add system information
    # The CUDA checks are evaluated once, when the interface is built.
    gr.Markdown(f"""
    ## System Information
    - Device: {"CUDA" if torch.cuda.is_available() else "CPU"}
    - CUDA Available: {"Yes" if torch.cuda.is_available() else "No"}
    
    ## Features
    - Automatic language detection
    - High-quality transcription using MMS
    - Optional translation to English
    - Text summarization
    - Question answering
    """)

# Launch the app only when this file is run as a script; no explicit
# server port is requested.
if __name__ == "__main__":
    iface.launch(server_port=None)