guymorlan committed on
Commit
81a260a
·
verified ·
1 Parent(s): 0fbea17

Update translit.py

Browse files
Files changed (1) hide show
  1. translit.py +29 -11
translit.py CHANGED
@@ -94,12 +94,12 @@ arabic_to_english = {
94
  "ُ": "u",
95
  "ِ": "i",
96
  "،": ",",
97
- "ֹ": "o", # holam
98
- "ַ": "a", # patah
99
- "ִ": "i", # hiriq
100
  "ְ": "", # shva
101
- "ֻ": "u", # kubutz
102
- 'ֵ': "e",
103
  "ّ": "SHADDA" # shadda
104
  }
105
 
@@ -155,6 +155,7 @@ def reverse_holam_shadda_vav(input_string):
155
 
156
  return result
157
 
 
158
  def to_taatik(arabic):
159
  taatik = []
160
  for index, letter in enumerate(arabic):
@@ -174,7 +175,23 @@ def to_taatik(arabic):
174
  return reordered
175
 
176
 
177
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  def to_translit(arabic):
180
  translit = []
@@ -184,15 +201,16 @@ def to_translit(arabic):
184
  else:
185
  if arabic_to_english[letter] == "SHADDA":
186
  if translit[-1][0] in vowels:
187
- translit[-2][1] = translit[-2][1].upper()
 
188
  else:
189
- translit[-1][1] = translit[-1][1].upper()
190
-
 
191
  else:
192
  translit.append([letter, arabic_to_english[letter]])
193
 
194
- return "".join([x[1] for x in translit])
195
-
196
 
197
  # %%
198
 
 
94
  "ُ": "u",
95
  "ِ": "i",
96
  "،": ",",
97
+ "ֹ": "", # holam
98
+ "ַ": "", # patah
99
+ "ִ": "", # hiriq
100
  "ְ": "", # shva
101
+ "ֻ": "", # kubutz
102
+ 'ֵ': "" # tzere
103
  "ّ": "SHADDA" # shadda
104
  }
105
 
 
155
 
156
  return result
157
 
158
+
159
  def to_taatik(arabic):
160
  taatik = []
161
  for index, letter in enumerate(arabic):
 
175
  return reordered
176
 
177
 
178
def postprocess_arabic_transliteration(text):
    """Normalize vowel notation in a Latin-script transliteration string.

    The passes run in this fixed order:

    1. A long vowel (ā, ī, ē, ū, ō) that is immediately followed by
       non-word characters, or by the end of the string, is replaced by
       its short counterpart (a, i, e, u, o).
    2. ``iy`` becomes ``ī`` unless another ``y`` follows.
    3. ``uw`` becomes ``ū`` unless another ``w`` follows.
    4. ``ay`` becomes ``ē`` unless another ``y`` follows.
    5. ``aw`` becomes ``ō`` unless another ``w`` follows.

    Because the shortening pass runs first, long vowels produced by the
    later passes are left long even when they end a word.

    Args:
        text: The transliterated string to clean up.

    Returns:
        The string with all five substitutions applied.
    """
    long_to_short = str.maketrans('āīēūō', 'aieuo')

    def _shorten_word_final(match):
        # Group 1 is the long vowel, group 2 the trailing non-word run
        # (possibly empty at end of input), which is kept untouched.
        vowel, trailing = match.groups()
        return vowel.translate(long_to_short) + trailing

    result = re.sub(r'([āīēūō])(\W*$|\W+)', _shorten_word_final, text)

    # Order matters: each pass sees the output of the previous one.
    digraph_passes = (
        (r'iy(?!y)', 'ī'),
        (r'uw(?!w)', 'ū'),
        (r'ay(?!y)', 'ē'),
        (r'aw(?!w)', 'ō'),
    )
    for pattern, long_vowel in digraph_passes:
        result = re.sub(pattern, long_vowel, result)

    return result
195
 
196
  def to_translit(arabic):
197
  translit = []
 
201
  else:
202
  if arabic_to_english[letter] == "SHADDA":
203
  if translit[-1][0] in vowels:
204
+ #translit[-2][1] = translit[-2][1].upper()
205
+ translit[-2][1] = translit[-2][1] + translit[-2][1]
206
  else:
207
+ #translit[-1][1] = translit[-1][1].upper()
208
+ translit[-1][1] = translit[-1][1] + translit[-1][1]
209
+
210
  else:
211
  translit.append([letter, arabic_to_english[letter]])
212
 
213
+ return postprocess_arabic_transliteration("".join([x[1] for x in translit]))
 
214
 
215
  # %%
216