File size: 6,956 Bytes
ef37daa
 
305d245
 
 
 
 
 
ef37daa
 
f147126
305d245
f147126
305d245
 
f147126
 
305d245
 
 
 
 
 
 
f147126
305d245
f147126
305d245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f147126
305d245
f147126
 
305d245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef37daa
a387258
 
 
 
 
 
 
 
e4af908
 
 
 
 
a387258
 
 
 
 
 
464da3a
305d245
 
 
 
 
 
 
 
 
 
 
 
 
 
ef37daa
a387258
 
 
 
 
ef37daa
 
a387258
 
 
 
 
ef37daa
305d245
 
a387258
f147126
 
ef37daa
464da3a
305d245
 
f147126
464da3a
 
a387258
f147126
a387258
f147126
ef37daa
 
 
 
 
 
 
 
 
 
f147126
 
305d245
 
f147126
 
 
ef37daa
464da3a
 
 
a387258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef37daa
 
 
 
 
a387258
ef37daa
a387258
ef37daa
 
 
464da3a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import gradio as gr
from huggingface_hub import InferenceClient
from deep_translator import GoogleTranslator
from indic_transliteration import sanscript
from indic_transliteration.detect import detect as detect_script
from indic_transliteration.sanscript import transliterate
import langdetect
import re

# Shared Hugging Face inference client, created once at import time and used
# by respond() for every streamed chat completion request.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

def detect_language_script(text: str) -> tuple[str, str]:
    """
    Detect the language and the writing script of the input text.

    Args:
        text: The text to inspect.

    Returns:
        A ``(language_code, script_type)`` pair. ``language_code`` is whatever
        ``langdetect.detect`` reports and falls back to ``'en'`` when the text
        is empty, is not a string, or language detection fails (in which case
        ``script_type`` is ``None``). ``script_type`` is the result of
        ``detect_script`` and is ``None`` when script detection fails.
    """
    # Nothing to analyse: use the same fallback as a failed detection without
    # calling into the detectors at all.
    if not isinstance(text, str) or not text.strip():
        return 'en', None

    try:
        lang = langdetect.detect(text)
    except Exception:
        # Language detection failed; default to English with unknown script.
        return 'en', None

    # Script detection is best-effort: a failure here must not discard the
    # language that was already detected.
    try:
        script = detect_script(text)
    except Exception:
        script = None
    return lang, script

def is_romanized_indic(text: str) -> bool:
    """
    Heuristically decide whether the text is an Indic language written in Latin letters.

    Only a handful of common romanized Bengali words are recognised (matched
    as whole words, case-insensitively), so this is a rough first-pass check
    that you may want to extend with more patterns.
    """
    lowered = text.lower()

    # One whole-word alternation per category of common romanized Bengali words.
    word_groups = (
        r'\b(ami|tumi|apni)\b',  # pronouns
        r'\b(ache|achen|thako|thaken)\b',  # verbs
        r'\b(kemon|bhalo|kharap)\b',  # adjectives
        r'\b(ki|kothay|keno)\b',  # question words
    )

    # A single hit in any category is enough.
    for group in word_groups:
        if re.search(group, lowered) is not None:
            return True
    return False

def romanized_to_bengali(text: str) -> str:
    """
    Convert romanized Bengali text to Bengali script.

    Known words are replaced from a small lookup table, matching whole words
    case-insensitively. When none of the known words occur, the whole text is
    passed to ITRANS -> Bengali transliteration instead.

    Args:
        text: Romanized (Latin-script) input.

    Returns:
        The lower-cased text with every known word replaced by its Bengali
        spelling, or the transliterated text when no known word was found.
        Empty input is returned as-is, and if transliteration raises, the
        input is returned unchanged.
    """
    # Common romanized Bengali words and their Bengali-script spelling.
    bengali_mappings = {
        'ami': 'আমি',
        'tumi': 'তুমি',
        'apni': 'আপনি',
        'kemon': 'কেমন',
        'achen': 'আছেন',
        'acchen': 'আছেন',
        'bhalo': 'ভালো',
        'achi': 'আছি',
        'ki': 'কি',
        'kothay': 'কোথায়',
        'keno': 'কেন',
        # Add more mappings as needed
    }

    if not text:
        return text

    # A single whole-word alternation over all keys; keys are escaped so a
    # future entry containing regex metacharacters cannot break the pattern.
    word_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(word) for word in bengali_mappings) + r')\b'
    )

    # Matching is done on the lower-cased text, so the result is lower-cased too.
    converted, replaced = word_pattern.subn(
        lambda match: bengali_mappings[match.group(0)], text.lower()
    )
    if replaced:
        return converted

    # No known word found: fall back to transliterating the original text.
    try:
        return transliterate(text, sanscript.ITRANS, sanscript.BENGALI)
    except Exception:
        return text

def translate_text(text: str, target_lang='en') -> tuple[str, str, bool]:
    """
    Translate text into the target language, coping with romanized Bengali input.

    Text detected as English that matches the romanized-Indic heuristic is
    first converted to Bengali script and treated as language ``'bn'``.
    Returns (translated_text, original_lang, is_transliterated). If the text
    is already in the target language, or the translation call raises, the
    (possibly transliterated) text is returned untranslated.
    """
    original_lang, _script = detect_language_script(text)

    # Romanized Indic text is only looked for when the detector said English.
    is_transliterated = original_lang == 'en' and is_romanized_indic(text)
    if is_transliterated:
        text = romanized_to_bengali(text)
        original_lang = 'bn'

    # Already in the requested language: nothing to translate.
    if original_lang == target_lang:
        return text, original_lang, is_transliterated

    try:
        translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
    except Exception as e:
        print(f"Translation error: {e}")
        return text, original_lang, is_transliterated
    return translated, original_lang, is_transliterated

def check_custom_responses(message: str) -> str:
    """
    Return the canned reply for a message containing a known question.

    The message is lower-cased and each known question is looked up as a
    substring, in table order; the first hit wins. Returns None when no
    known question occurs in the message.
    """
    canned_replies = {
        "what is ur name?": "xylaria",
        "what is your name?": "xylaria",
        "what's your name?": "xylaria",
        "whats your name": "xylaria",
        "how many 'r' is in strawberry?": "3",
        "who is your developer?": "sk md saad amin",
        "how many r is in strawberry": "3",
        "who is ur dev": "sk md saad amin",
        "who is ur developer": "sk md saad amin",
    }

    lowered = message.lower()
    return next(
        (reply for trigger, reply in canned_replies.items() if trigger in lowered),
        None,
    )

def translate_to_original(text: str, original_lang: str, was_transliterated: bool) -> str:
    """
    Translate an English response back into the user's original language.

    English (``'en'``) responses are returned untouched. If the translation
    call raises, the error is printed and the untranslated text is returned.
    ``was_transliterated`` is accepted but not currently used.
    """
    # Nothing to do when the user wrote in English.
    if original_lang == 'en':
        return text

    try:
        return GoogleTranslator(source='en', target=original_lang).translate(text)
    except Exception as e:
        print(f"Translation error: {e}")
        return text

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """
    Generate a streamed chat reply for the Gradio chat interface.

    The user message is first checked against the canned responses. Otherwise
    it is translated to English, sent to the model together with the system
    message and the prior conversation, and the growing reply is yielded
    after each streamed chunk, translated back to the user's language when
    that language is not English.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_message, assistant_message)`` pairs.
        system_message: Content of the system prompt.
        max_tokens: Forwarded to ``client.chat_completion``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Yields:
        The canned response (once), or the accumulated reply so far after
        every chunk that carries text.
    """
    # First check for custom responses
    custom_response = check_custom_responses(message)
    if custom_response:
        yield custom_response
        return

    # Handle translation and transliteration
    translated_msg, original_lang, was_transliterated = translate_text(message)

    # Prepare conversation history
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        if turn[0]:
            # User messages from history are translated again on every call;
            # assistant messages are passed through as stored.
            trans_user_msg, _, _ = translate_text(turn[0])
            messages.append({"role": "user", "content": trans_user_msg})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": translated_msg})

    # Get response from model. The loop variable is deliberately not called
    # `message`, so the user's message is not overwritten by stream chunks.
    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if not token:
            # A chunk may carry no text (content can be None); appending it
            # would raise TypeError, and there is nothing new to show.
            continue
        response += token

        # Translate accumulated response if original message wasn't in English.
        # NOTE(review): this re-translates the whole accumulated reply for
        # every chunk, i.e. one translation request per streamed chunk.
        if original_lang != 'en':
            yield translate_to_original(response, original_lang, was_transliterated)
        else:
            yield response

# Chat UI wired to respond(). The additional inputs below are expected to
# arrive as respond()'s system_message, max_tokens, temperature and top_p
# arguments, in this order.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        # System prompt sent as the first message of every conversation.
        gr.Textbox(
            value="You are a friendly Chatbot.",
            label="System message"
        ),
        # Upper bound on the number of generated tokens.
        gr.Slider(
            minimum=1,
            maximum=2048,
            value=512,
            step=1,
            label="Max new tokens"
        ),
        # Sampling temperature.
        gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        ),
        # Nucleus-sampling probability mass.
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)"
        ),
    ]
)

# Start the app only when run as a script; share=True asks Gradio for a
# publicly shareable link in addition to the local server.
if __name__ == "__main__":
    demo.launch(share=True)