terry-li-hm committed on
Commit
a54fc2d
·
1 Parent(s): 31e1773

Update sv.py

Browse files
Files changed (1) hide show
  1. sv.py +12 -1
sv.py CHANGED
@@ -92,32 +92,39 @@ event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}
92
 
93
 
94
  def clean_and_emoji_annotate_speech(text):
 
95
  def get_emoji(s, emoji_set):
96
  return next((char for char in s if char in emoji_set), None)
97
 
 
98
  def format_text_with_emojis(s):
 
99
  sptk_dict = {sptk: s.count(sptk) for sptk in emoji_dict}
100
 
 
101
  for sptk in emoji_dict:
102
  s = s.replace(sptk, "")
103
 
 
104
  emo = "<|NEUTRAL|>"
105
  for e in emo_dict:
106
  if sptk_dict.get(e, 0) > sptk_dict.get(emo, 0):
107
  emo = e
108
 
 
109
  s = (
110
  "".join(event_dict[e] for e in event_dict if sptk_dict.get(e, 0) > 0)
111
  + s
112
  + emo_dict[emo]
113
  )
114
 
 
115
  for emoji in emo_set.union(event_set):
116
  s = s.replace(f" {emoji}", emoji).replace(f"{emoji} ", emoji)
117
 
118
  return s.strip()
119
 
120
- # Replace special tags
121
  text = text.replace("<|nospeech|><|Event_UNK|>", "❓")
122
  for lang, replacement in lang_dict.items():
123
  text = text.replace(lang, replacement)
@@ -130,6 +137,7 @@ def clean_and_emoji_annotate_speech(text):
130
  formatted_segments = []
131
  prev_event = prev_emotion = None
132
 
 
133
  for segment in segments:
134
  if not segment:
135
  continue
@@ -137,15 +145,18 @@ def clean_and_emoji_annotate_speech(text):
137
  current_event = get_emoji(segment, event_set)
138
  current_emotion = get_emoji(segment, emo_set)
139
 
 
140
  if current_event is not None:
141
  segment = segment[1:] if segment.startswith(current_event) else segment
142
 
 
143
  if current_emotion is not None and current_emotion != prev_emotion:
144
  segment = segment.replace(current_emotion, "") + current_emotion
145
 
146
  formatted_segments.append(segment.strip())
147
  prev_event, prev_emotion = current_event, current_emotion
148
 
 
149
  result = " ".join(formatted_segments).replace("The.", "").strip()
150
  return result
151
 
 
92
 
93
 
94
  def clean_and_emoji_annotate_speech(text):
95
+ # Helper function to get the first emoji from a string that belongs to a given set
96
  def get_emoji(s, emoji_set):
97
  return next((char for char in s if char in emoji_set), None)
98
 
99
+ # Helper function to format text with emojis based on special tokens
100
  def format_text_with_emojis(s):
101
+ # Count occurrences of special tokens
102
  sptk_dict = {sptk: s.count(sptk) for sptk in emoji_dict}
103
 
104
+ # Remove all special tokens from the text
105
  for sptk in emoji_dict:
106
  s = s.replace(sptk, "")
107
 
108
+ # Determine the dominant emotion
109
  emo = "<|NEUTRAL|>"
110
  for e in emo_dict:
111
  if sptk_dict.get(e, 0) > sptk_dict.get(emo, 0):
112
  emo = e
113
 
114
+ # Add event emojis at the beginning and emotion emoji at the end
115
  s = (
116
  "".join(event_dict[e] for e in event_dict if sptk_dict.get(e, 0) > 0)
117
  + s
118
  + emo_dict[emo]
119
  )
120
 
121
+ # Remove spaces around emojis
122
  for emoji in emo_set.union(event_set):
123
  s = s.replace(f" {emoji}", emoji).replace(f"{emoji} ", emoji)
124
 
125
  return s.strip()
126
 
127
+ # Replace special tags and language markers
128
  text = text.replace("<|nospeech|><|Event_UNK|>", "❓")
129
  for lang, replacement in lang_dict.items():
130
  text = text.replace(lang, replacement)
 
137
  formatted_segments = []
138
  prev_event = prev_emotion = None
139
 
140
+ # Combine segments, avoiding duplicate emojis
141
  for segment in segments:
142
  if not segment:
143
  continue
 
145
  current_event = get_emoji(segment, event_set)
146
  current_emotion = get_emoji(segment, emo_set)
147
 
148
+ # Remove the segment's leading event emoji (NOTE: not compared against prev_event, so it is stripped even when it differs from the previous one)
149
  if current_event is not None:
150
  segment = segment[1:] if segment.startswith(current_event) else segment
151
 
152
+ # Move emotion emoji to the end if it's different from the previous one
153
  if current_emotion is not None and current_emotion != prev_emotion:
154
  segment = segment.replace(current_emotion, "") + current_emotion
155
 
156
  formatted_segments.append(segment.strip())
157
  prev_event, prev_emotion = current_event, current_emotion
158
 
159
+ # Join segments and remove every occurrence of "The." (NOTE: str.replace is not limited to the end of the text)
160
  result = " ".join(formatted_segments).replace("The.", "").strip()
161
  return result
162