Spaces:
Running
Running
Joshua Lochner
committed on
Commit
·
884d564
1
Parent(s):
90d506c
Remove zero-width spaces from text
Browse files- src/preprocess.py +13 -3
src/preprocess.py
CHANGED
@@ -59,9 +59,19 @@ def parse_transcript_json(json_data, granularity):
|
|
59 |
|
60 |
new_segments = []
|
61 |
for seg in segments:
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
if not text:
|
66 |
continue
|
67 |
|
|
|
59 |
|
60 |
new_segments = []
|
61 |
for seg in segments:
|
62 |
+
# Replace \n, \t, etc. with space
|
63 |
+
text = ' '.join(seg['utf8'].split())
|
64 |
+
|
65 |
+
# Remove zero-width spaces and strip trailing and leading whitespace
|
66 |
+
text = text.replace('\u200b', '').replace('\u200c', '').replace(
|
67 |
+
'\u200d', '').replace('\ufeff', '').strip()
|
68 |
+
|
69 |
+
# Alternatively,
|
70 |
+
# text = text.encode('ascii', 'ignore').decode()
|
71 |
+
|
72 |
+
# Needed for auto-generated transcripts
|
73 |
+
text = text.replace(PROFANITY_RAW, PROFANITY_CONVERTED)
|
74 |
+
|
75 |
if not text:
|
76 |
continue
|
77 |
|