Reality123b committed on
Commit
69bd0b3
·
verified ·
1 Parent(s): 2f75d7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -74
app.py CHANGED
@@ -15,7 +15,13 @@ def detect_language_script(text: str) -> tuple[str, str]:
15
  Returns (language_code, script_type)
16
  """
17
  try:
18
- lang = langdetect.detect(text)
 
 
 
 
 
 
19
  script = None
20
  try:
21
  script = detect_script(text)
@@ -28,9 +34,9 @@ def detect_language_script(text: str) -> tuple[str, str]:
28
  def is_romanized_indic(text: str) -> bool:
29
  """
30
  Check if text appears to be romanized Indic language.
31
- This is a basic implementation - you may want to enhance the patterns.
32
  """
33
- # Common Bengali romanized patterns
34
  bengali_patterns = [
35
  r'\b(ami|tumi|apni)\b', # Common pronouns
36
  r'\b(ache|achen|thako|thaken)\b', # Common verbs
@@ -38,71 +44,38 @@ def is_romanized_indic(text: str) -> bool:
38
  r'\b(ki|kothay|keno)\b' # Common question words
39
  ]
40
 
 
41
  text_lower = text.lower()
42
- return any(re.search(pattern, text_lower) for pattern in bengali_patterns)
43
-
44
- def romanized_to_bengali(text: str) -> str:
45
- """
46
- Convert romanized Bengali text to Bengali script.
47
- """
48
- # Define common Bengali word mappings
49
- bengali_mappings = {
50
- 'ami': 'আমি',
51
- 'tumi': 'তুমি',
52
- 'apni': 'আপনি',
53
- 'kemon': 'কেমন',
54
- 'achen': 'আছেন',
55
- 'acchen': 'আছেন',
56
- 'bhalo': 'ভালো',
57
- 'achi': 'আছি',
58
- 'ki': 'কি',
59
- 'tumi': 'তুমি',
60
- 'kothay': 'কোথায়',
61
- 'keno': 'কেন',
62
- # Add more mappings as needed
63
- }
64
-
65
- # Convert to lowercase for matching
66
- text_lower = text.lower()
67
-
68
- # Replace words based on mappings
69
- for roman, bengali in bengali_mappings.items():
70
- text_lower = re.sub(r'\b' + roman + r'\b', bengali, text_lower)
71
-
72
- # If no direct mapping found, try using transliteration
73
- if text_lower == text.lower():
74
- try:
75
- return transliterate(text, sanscript.ITRANS, sanscript.BENGALI)
76
- except:
77
- return text
78
-
79
- return text_lower
80
 
81
  def translate_text(text: str, target_lang='en') -> tuple[str, str, bool]:
82
  """
83
- Translate text to target language, handling both script and romanized text.
84
- Returns (translated_text, original_lang, is_transliterated)
85
  """
 
 
 
 
86
  original_lang, script = detect_language_script(text)
87
  is_transliterated = False
88
 
89
- # Handle potential romanized Indic text
90
- if original_lang == 'en' and is_romanized_indic(text):
91
- text = romanized_to_bengali(text)
92
- original_lang = 'bn'
93
- is_transliterated = True
94
-
95
- # Only translate if not already in target language
96
- if original_lang != target_lang:
97
  try:
98
  translator = GoogleTranslator(source='auto', target=target_lang)
99
  translated = translator.translate(text)
100
  return translated, original_lang, is_transliterated
101
  except Exception as e:
102
  print(f"Translation error: {e}")
103
- return text, original_lang, is_transliterated
104
 
105
- return text, original_lang, is_transliterated
 
 
 
 
 
106
 
107
  def check_custom_responses(message: str) -> str:
108
  """Check for specific patterns and return custom responses."""
@@ -124,19 +97,33 @@ def check_custom_responses(message: str) -> str:
124
  return response
125
  return None
126
 
127
- def translate_to_original(text: str, original_lang: str, was_transliterated: bool) -> str:
128
- """
129
- Translate response back to original language and script if needed.
130
- """
131
- if original_lang != 'en':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  try:
133
- translator = GoogleTranslator(source='en', target=original_lang)
134
- translated = translator.translate(text)
135
- return translated
136
- except Exception as e:
137
- print(f"Translation error: {e}")
138
  return text
139
- return text
 
140
 
141
  def respond(
142
  message,
@@ -152,16 +139,19 @@ def respond(
152
  yield custom_response
153
  return
154
 
155
- # Handle translation and transliteration
156
  translated_msg, original_lang, was_transliterated = translate_text(message)
157
 
158
- # Prepare conversation history
159
  messages = [{"role": "system", "content": system_message}]
160
  for val in history:
161
  if val[0]:
162
- # Translate user message from history
163
- trans_user_msg, _, _ = translate_text(val[0])
164
- messages.append({"role": "user", "content": trans_user_msg})
 
 
 
165
  if val[1]:
166
  messages.append({"role": "assistant", "content": val[1]})
167
 
@@ -179,10 +169,14 @@ def respond(
179
  token = message.choices[0].delta.content
180
  response += token
181
 
182
- # Translate accumulated response if original message wasn't in English
183
- if original_lang != 'en':
184
- translated_response = translate_to_original(response, original_lang, was_transliterated)
185
- yield translated_response
 
 
 
 
186
  else:
187
  yield response
188
 
@@ -190,7 +184,7 @@ demo = gr.ChatInterface(
190
  respond,
191
  additional_inputs=[
192
  gr.Textbox(
193
- value="You are a friendly Chatbot.",
194
  label="System message"
195
  ),
196
  gr.Slider(
 
15
  Returns (language_code, script_type)
16
  """
17
  try:
18
+ # Use confidence threshold to avoid false detections
19
+ lang_detect = langdetect.detect_langs(text)
20
+ if lang_detect[0].prob > 0.8: # Only accept high confidence detections
21
+ lang = lang_detect[0].lang
22
+ else:
23
+ lang = 'en' # Default to English if unsure
24
+
25
  script = None
26
  try:
27
  script = detect_script(text)
 
34
  def is_romanized_indic(text: str) -> bool:
35
  """
36
  Check if text appears to be romanized Indic language.
37
+ More strict pattern matching.
38
  """
39
+ # Common Bengali romanized patterns with word boundaries
40
  bengali_patterns = [
41
  r'\b(ami|tumi|apni)\b', # Common pronouns
42
  r'\b(ache|achen|thako|thaken)\b', # Common verbs
 
44
  r'\b(ki|kothay|keno)\b' # Common question words
45
  ]
46
 
47
+ # Require multiple matches to confirm it's actually Bengali
48
  text_lower = text.lower()
49
+ matches = sum(1 for pattern in bengali_patterns if re.search(pattern, text_lower))
50
+ return matches >= 2 # Require at least 2 matches to consider it Bengali
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  def translate_text(text: str, target_lang='en') -> tuple[str, str, bool]:
53
  """
54
+ Translate text to target language, with more conservative translation logic.
 
55
  """
56
+ # Skip translation for very short inputs or basic greetings
57
+ if len(text.split()) <= 2 or text.lower() in ['hello', 'hi', 'hey']:
58
+ return text, 'en', False
59
+
60
  original_lang, script = detect_language_script(text)
61
  is_transliterated = False
62
 
63
+ # Only process if confident it's non-English
64
+ if original_lang != 'en' and len(text.split()) > 2:
 
 
 
 
 
 
65
  try:
66
  translator = GoogleTranslator(source='auto', target=target_lang)
67
  translated = translator.translate(text)
68
  return translated, original_lang, is_transliterated
69
  except Exception as e:
70
  print(f"Translation error: {e}")
71
+ return text, 'en', False
72
 
73
+ # Check for romanized Indic text only if it's a longer input
74
+ if original_lang == 'en' and len(text.split()) > 2 and is_romanized_indic(text):
75
+ text = romanized_to_bengali(text)
76
+ return translate_text(text, target_lang) # Recursive call with Bengali script
77
+
78
+ return text, 'en', False
79
 
80
  def check_custom_responses(message: str) -> str:
81
  """Check for specific patterns and return custom responses."""
 
97
  return response
98
  return None
99
 
100
def romanized_to_bengali(text: str) -> str:
    """Convert romanized Bengali text to Bengali script.

    Known romanized words are replaced, case-insensitively and on word
    boundaries, with their Bengali spelling; every other part of ``text`` is
    left exactly as given (including its casing). If no known word occurs,
    the whole text is passed to ``transliterate`` using the ITRANS scheme,
    and ``text`` is returned unchanged if that fails.

    Args:
        text: Text that may contain romanized Bengali words.

    Returns:
        The text with known words in Bengali script, the ITRANS
        transliteration, or the unchanged input as a last resort.
    """
    bengali_mappings = {
        'ami': 'আমি',
        'tumi': 'তুমি',
        'apni': 'আপনি',
        'kemon': 'কেমন',
        'achen': 'আছেন',
        'acchen': 'আছেন',
        'bhalo': 'ভালো',
        'achi': 'আছি',
        'ki': 'কি',
        'kothay': 'কোথায়',
        'keno': 'কেন',
    }

    if not text:
        return text

    # One pass over the input with a single alternation instead of one
    # regex pass per mapping; keys are escaped so they are matched literally.
    word_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(word) for word in bengali_mappings) + r')\b',
        re.IGNORECASE,
    )
    converted, replaced = word_pattern.subn(
        lambda match: bengali_mappings[match.group(0).lower()],
        text,
    )
    if replaced:
        return converted

    # No dictionary word matched: fall back to scheme-based transliteration.
    try:
        return transliterate(text, sanscript.ITRANS, sanscript.BENGALI)
    except Exception:
        # Best effort only; never let a transliteration failure break the chat.
        return text
127
 
128
  def respond(
129
  message,
 
139
  yield custom_response
140
  return
141
 
142
+ # Handle translation with more conservative approach
143
  translated_msg, original_lang, was_transliterated = translate_text(message)
144
 
145
+ # Prepare conversation history - only translate if necessary
146
  messages = [{"role": "system", "content": system_message}]
147
  for val in history:
148
  if val[0]:
149
+ # Only translate longer messages
150
+ if len(val[0].split()) > 2:
151
+ trans_user_msg, _, _ = translate_text(val[0])
152
+ messages.append({"role": "user", "content": trans_user_msg})
153
+ else:
154
+ messages.append({"role": "user", "content": val[0]})
155
  if val[1]:
156
  messages.append({"role": "assistant", "content": val[1]})
157
 
 
169
  token = message.choices[0].delta.content
170
  response += token
171
 
172
+ # Only translate back if the original was definitely non-English
173
+ if original_lang != 'en' and len(message.split()) > 2:
174
+ try:
175
+ translator = GoogleTranslator(source='en', target=original_lang)
176
+ translated_response = translator.translate(response)
177
+ yield translated_response
178
+ except:
179
+ yield response
180
  else:
181
  yield response
182
 
 
184
  respond,
185
  additional_inputs=[
186
  gr.Textbox(
187
+ value="You are a friendly Chatbot who always responds in English unless the user specifically uses another language.",
188
  label="System message"
189
  ),
190
  gr.Slider(