terry-li-hm
committed on
Commit
·
a54fc2d
1
Parent(s):
31e1773
Update sv.py
Browse files
sv.py
CHANGED
@@ -92,32 +92,39 @@ event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}
|
|
92 |
|
93 |
|
94 |
def clean_and_emoji_annotate_speech(text):
|
|
|
95 |
def get_emoji(s, emoji_set):
|
96 |
return next((char for char in s if char in emoji_set), None)
|
97 |
|
|
|
98 |
def format_text_with_emojis(s):
|
|
|
99 |
sptk_dict = {sptk: s.count(sptk) for sptk in emoji_dict}
|
100 |
|
|
|
101 |
for sptk in emoji_dict:
|
102 |
s = s.replace(sptk, "")
|
103 |
|
|
|
104 |
emo = "<|NEUTRAL|>"
|
105 |
for e in emo_dict:
|
106 |
if sptk_dict.get(e, 0) > sptk_dict.get(emo, 0):
|
107 |
emo = e
|
108 |
|
|
|
109 |
s = (
|
110 |
"".join(event_dict[e] for e in event_dict if sptk_dict.get(e, 0) > 0)
|
111 |
+ s
|
112 |
+ emo_dict[emo]
|
113 |
)
|
114 |
|
|
|
115 |
for emoji in emo_set.union(event_set):
|
116 |
s = s.replace(f" {emoji}", emoji).replace(f"{emoji} ", emoji)
|
117 |
|
118 |
return s.strip()
|
119 |
|
120 |
-
# Replace special tags
|
121 |
text = text.replace("<|nospeech|><|Event_UNK|>", "❓")
|
122 |
for lang, replacement in lang_dict.items():
|
123 |
text = text.replace(lang, replacement)
|
@@ -130,6 +137,7 @@ def clean_and_emoji_annotate_speech(text):
|
|
130 |
formatted_segments = []
|
131 |
prev_event = prev_emotion = None
|
132 |
|
|
|
133 |
for segment in segments:
|
134 |
if not segment:
|
135 |
continue
|
@@ -137,15 +145,18 @@ def clean_and_emoji_annotate_speech(text):
|
|
137 |
current_event = get_emoji(segment, event_set)
|
138 |
current_emotion = get_emoji(segment, emo_set)
|
139 |
|
|
|
140 |
if current_event is not None:
|
141 |
segment = segment[1:] if segment.startswith(current_event) else segment
|
142 |
|
|
|
143 |
if current_emotion is not None and current_emotion != prev_emotion:
|
144 |
segment = segment.replace(current_emotion, "") + current_emotion
|
145 |
|
146 |
formatted_segments.append(segment.strip())
|
147 |
prev_event, prev_emotion = current_event, current_emotion
|
148 |
|
|
|
149 |
result = " ".join(formatted_segments).replace("The.", "").strip()
|
150 |
return result
|
151 |
|
|
|
92 |
|
93 |
|
94 |
def clean_and_emoji_annotate_speech(text):
|
95 |
+
# Helper function to get the first emoji from a string that belongs to a given set
|
96 |
def get_emoji(s, emoji_set):
|
97 |
return next((char for char in s if char in emoji_set), None)
|
98 |
|
99 |
+
# Helper function to format text with emojis based on special tokens
|
100 |
def format_text_with_emojis(s):
|
101 |
+
# Count occurrences of special tokens
|
102 |
sptk_dict = {sptk: s.count(sptk) for sptk in emoji_dict}
|
103 |
|
104 |
+
# Remove all special tokens from the text
|
105 |
for sptk in emoji_dict:
|
106 |
s = s.replace(sptk, "")
|
107 |
|
108 |
+
# Determine the dominant emotion
|
109 |
emo = "<|NEUTRAL|>"
|
110 |
for e in emo_dict:
|
111 |
if sptk_dict.get(e, 0) > sptk_dict.get(emo, 0):
|
112 |
emo = e
|
113 |
|
114 |
+
# Add event emojis at the beginning and emotion emoji at the end
|
115 |
s = (
|
116 |
"".join(event_dict[e] for e in event_dict if sptk_dict.get(e, 0) > 0)
|
117 |
+ s
|
118 |
+ emo_dict[emo]
|
119 |
)
|
120 |
|
121 |
+
# Remove spaces around emojis
|
122 |
for emoji in emo_set.union(event_set):
|
123 |
s = s.replace(f" {emoji}", emoji).replace(f"{emoji} ", emoji)
|
124 |
|
125 |
return s.strip()
|
126 |
|
127 |
+
# Replace special tags and language markers
|
128 |
text = text.replace("<|nospeech|><|Event_UNK|>", "❓")
|
129 |
for lang, replacement in lang_dict.items():
|
130 |
text = text.replace(lang, replacement)
|
|
|
137 |
formatted_segments = []
|
138 |
prev_event = prev_emotion = None
|
139 |
|
140 |
+
# Combine segments, avoiding duplicate emojis
|
141 |
for segment in segments:
|
142 |
if not segment:
|
143 |
continue
|
|
|
145 |
current_event = get_emoji(segment, event_set)
|
146 |
current_emotion = get_emoji(segment, emo_set)
|
147 |
|
148 |
+
# Remove the leading event emoji whenever the segment starts with it (prev_event is not compared here)
|
149 |
if current_event is not None:
|
150 |
segment = segment[1:] if segment.startswith(current_event) else segment
|
151 |
|
152 |
+
# Move emotion emoji to the end if it's different from the previous one
|
153 |
if current_emotion is not None and current_emotion != prev_emotion:
|
154 |
segment = segment.replace(current_emotion, "") + current_emotion
|
155 |
|
156 |
formatted_segments.append(segment.strip())
|
157 |
prev_event, prev_emotion = current_event, current_emotion
|
158 |
|
159 |
+
# Join segments and remove every occurrence of "The." (not only a trailing one)
|
160 |
result = " ".join(formatted_segments).replace("The.", "").strip()
|
161 |
return result
|
162 |
|