Spaces:

Lap1official
/

API

Running

API

File size: 7,164 Bytes

import gradio as gr
from huggingface_hub import InferenceClient
from deep_translator import GoogleTranslator
from indic_transliteration import sanscript
from indic_transliteration.detect import detect as detect_script
from indic_transliteration.sanscript import transliterate
import langdetect
import re

# Module-level inference client, created once at import time and used by
# respond() to stream chat completions for the model id given here.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

def detect_language_script(text: str) -> tuple[str, str]:
    """
    Detect language and script of the input text.

    Returns (language_code, script_type).

    language_code is the first candidate reported by
    ``langdetect.detect_langs`` when its probability is above 0.8, and
    ``'en'`` otherwise (low confidence, blank input, or detector failure).
    script_type is whatever ``detect_script`` returns, or ``None`` when
    script detection fails or the input is blank.
    """
    # Blank input carries no signal; answer with the defaults instead of
    # relying on the detectors to raise.
    if not text or not text.strip():
        return 'en', None

    try:
        candidates = langdetect.detect_langs(text)
    except Exception:
        # Best effort: a failed detection is treated as English.
        # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
        # and SystemExit still propagate.
        return 'en', None

    # Only accept high confidence detections; default to English if unsure.
    if candidates and candidates[0].prob > 0.8:
        lang = candidates[0].lang
    else:
        lang = 'en'

    try:
        script = detect_script(text)
    except Exception:
        # Script detection is optional; the language result is still useful.
        script = None

    return lang, script

def is_romanized_indic(text: str) -> bool:
    """
    Heuristically decide whether text is Bengali written in Latin letters.

    The lower-cased text is tested against four groups of whole-word
    patterns (pronouns, verbs, adjectives, question words). A group counts
    once no matter how many of its words occur, and at least two different
    groups must be present for the function to return True.
    """
    lowered = text.lower()

    word_groups = (
        r'\b(ami|tumi|apni)\b',            # pronouns
        r'\b(ache|achen|thako|thaken)\b',  # verbs
        r'\b(kemon|bhalo|kharap)\b',       # adjectives
        r'\b(ki|kothay|keno)\b',           # question words
    )

    groups_hit = 0
    for group in word_groups:
        if re.search(group, lowered) is not None:
            groups_hit += 1

    # A single group is too weak a signal; require two or more.
    return groups_hit >= 2

def translate_text(text: str, target_lang: str = 'en') -> tuple[str, str, bool]:
    """
    Translate text to target language, with conservative translation logic.

    Returns (result_text, source_language_code, was_transliterated):

    * Inputs of two words or fewer are returned untouched as
      ``(text, 'en', False)``.
    * When ``detect_language_script`` reports a non-English language, the
      text is translated and that language code is returned.
    * When the text is reported as English but ``is_romanized_indic``
      recognises it, it is converted with ``romanized_to_bengali``,
      translated once, and returned as ``(translated, 'bn', True)``.
    * Anything else, including any translation failure, yields
      ``(text, 'en', False)``.
    """
    # Skip translation for very short inputs (this also covers one-word
    # greetings such as "hello", "hi" and "hey").
    if len(text.split()) <= 2:
        return text, 'en', False

    original_lang, _script = detect_language_script(text)

    if original_lang != 'en':
        payload, source_lang, is_transliterated = text, original_lang, False
    elif is_romanized_indic(text):
        converted = romanized_to_bengali(text)
        # The converter hands the text back unchanged when it cannot convert
        # anything. Feeding that back into this function would repeat the
        # exact same steps forever, so translate at most once and never
        # recurse.
        if converted == text:
            return text, 'en', False
        payload, source_lang, is_transliterated = converted, 'bn', True
    else:
        return text, 'en', False

    try:
        translator = GoogleTranslator(source='auto', target=target_lang)
        translated = translator.translate(payload)
    except Exception as e:
        print(f"Translation error: {e}")
        return text, 'en', False

    # Make sure callers always receive a usable string.
    if not translated:
        return text, 'en', False

    return translated, source_lang, is_transliterated

def check_custom_responses(message: str) -> str:
    """
    Look up a canned reply for the given message.

    The message is lower-cased and scanned for each known phrase as a
    substring, in the order listed below. The reply of the first phrase
    found is returned; when no phrase occurs the result is None.
    """
    canned_replies = (
        ("what is ur name?", "xylaria"),
        ("what is your name?", "xylaria"),
        ("what's your name?", "xylaria"),
        ("whats your name", "xylaria"),
        ("how many 'r' is in strawberry?", "3"),
        ("who is your developer?", "sk md saad amin"),
        ("how many r is in strawberry", "3"),
        ("who is ur dev", "sk md saad amin"),
        ("who is ur developer", "sk md saad amin"),
    )

    lowered = message.lower()
    return next(
        (reply for phrase, reply in canned_replies if phrase in lowered),
        None,
    )

def romanized_to_bengali(text: str) -> str:
    """
    Convert romanized Bengali text to Bengali script.

    Known whole words are replaced from a small lookup table; when at least
    one word is replaced, the lower-cased text with those replacements is
    returned (unmapped words stay in Latin letters, lower-cased).

    When no table word occurs, the original text is passed to
    ``transliterate`` (ITRANS -> Bengali). If that call fails, the original
    text is returned unchanged.
    """
    bengali_mappings = {
        'ami': 'আমি',
        'tumi': 'তুমি',
        'apni': 'আপনি',
        'kemon': 'কেমন',
        'achen': 'আছেন',
        'acchen': 'আছেন',
        'bhalo': 'ভালো',
        'achi': 'আছি',
        'ki': 'কি',
        'kothay': 'কোথায়',
        'keno': 'কেন',
    }

    # One whole-word alternation over every key; keys are escaped so a
    # future entry containing regex metacharacters cannot corrupt the pattern.
    word_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(word) for word in bengali_mappings) + r')\b'
    )

    converted, replaced_count = word_pattern.subn(
        lambda match: bengali_mappings[match.group(0)],
        text.lower(),
    )
    if replaced_count:
        return converted

    # Nothing from the table matched: fall back to generic transliteration.
    try:
        return transliterate(text, sanscript.ITRANS, sanscript.BENGALI)
    except Exception:
        # Best effort: hand the input back rather than fail the caller.
        # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
        # and SystemExit still propagate.
        return text

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """
    Stream a chat reply for ``message``.

    This is a generator: each yielded value is the full reply accumulated so
    far (not just the newest token).

    Steps:
      1. If ``check_custom_responses`` has a canned reply, yield it and stop.
      2. Translate the message and the user turns of ``history`` with
         ``translate_text`` and build the chat message list, starting with
         ``system_message``.
      3. Stream the completion from ``client``. When the user's message was
         detected as non-English and is longer than two words, the
         accumulated reply is translated back to that language before being
         yielded; if that translation fails the untranslated reply is
         yielded instead.

    ``history`` is read as a sequence of (user_text, assistant_text) pairs.
    ``max_tokens``, ``temperature`` and ``top_p`` are forwarded unchanged to
    ``client.chat_completion``.
    """
    # First check for custom responses
    custom_response = check_custom_responses(message)
    if custom_response:
        yield custom_response
        return

    # Handle translation with more conservative approach
    translated_msg, original_lang, _was_transliterated = translate_text(message)

    # Prepare conversation history. translate_text returns inputs of two
    # words or fewer unchanged, so every user turn can go through it.
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        if turn[0]:
            user_text, _, _ = translate_text(turn[0])
            messages.append({"role": "user", "content": user_text})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": translated_msg})

    # Decide once, from the user's own message, whether replies are translated
    # back. This must happen before the streaming loop and must not be
    # re-evaluated against the stream chunks.
    back_translator = None
    if original_lang != 'en' and len(message.split()) > 2:
        try:
            back_translator = GoogleTranslator(source='en', target=original_lang)
        except Exception as e:
            # Fall back to untranslated replies if the translator cannot be
            # constructed for this language.
            print(f"Translation error: {e}")

    # Get response from model
    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if not token:
            # Chunks without text (None or "") add nothing to the reply.
            continue
        response += token

        if back_translator is None:
            yield response
            continue

        # Keep the yield outside the try block so that closing the generator
        # is never mistaken for a translation failure.
        try:
            translated_response = back_translator.translate(response)
        except Exception:
            translated_response = None
        yield translated_response or response

# Extra controls shown alongside the chat box. Their values are passed to
# respond() after (message, history), in the order listed in
# `additional_inputs` below.
_system_message_box = gr.Textbox(
    label="System message",
    value="You are a friendly Chatbot who always responds in English unless the user specifically uses another language.",
)
_max_tokens_slider = gr.Slider(
    label="Max new tokens",
    minimum=1,
    maximum=2048,
    step=1,
    value=512,
)
_temperature_slider = gr.Slider(
    label="Temperature",
    minimum=0.1,
    maximum=4.0,
    step=0.1,
    value=0.7,
)
_top_p_slider = gr.Slider(
    label="Top-p (nucleus sampling)",
    minimum=0.1,
    maximum=1.0,
    step=0.05,
    value=0.95,
)

# Chat UI wired to the streaming respond() generator.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        _system_message_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
)

# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    # NOTE(review): share=True presumably requests a public share link from
    # Gradio — confirm against the Gradio launch() documentation.
    demo.launch(share=True)