Joshua Lochner committed on
Commit
884d564
·
1 Parent(s): 90d506c

Remove zero-width spaces from text

Browse files
Files changed (1) hide show
  1. src/preprocess.py +13 -3
src/preprocess.py CHANGED
@@ -59,9 +59,19 @@ def parse_transcript_json(json_data, granularity):
59
 
60
  new_segments = []
61
  for seg in segments:
62
- text = seg['utf8'].replace('\n', ' ').replace(
63
- PROFANITY_RAW, PROFANITY_CONVERTED, # Needed for auto-generated transcripts
64
- ).strip()
 
 
 
 
 
 
 
 
 
 
65
  if not text:
66
  continue
67
 
 
59
 
60
  new_segments = []
61
  for seg in segments:
62
+ # Replace \n, \t, etc. with space
63
+ text = ' '.join(seg['utf8'].split())
64
+
65
+ # Remove zero-width spaces and strip trailing and leading whitespace
66
+ text = text.replace('\u200b', '').replace('\u200c', '').replace(
67
+ '\u200d', '').replace('\ufeff', '').strip()
68
+
69
+ # Alternatively,
70
+ # text = text.encode('ascii', 'ignore').decode()
71
+
72
+ # Needed for auto-generated transcripts
73
+ text = text.replace(PROFANITY_RAW, PROFANITY_CONVERTED)
74
+
75
  if not text:
76
  continue
77