Soufianesejjari committed on
Commit
6f00151
·
1 Parent(s): 3d1c35c

Remove words shorter than 4 characters

Browse files
Files changed (1) hide show
  1. preprocessor.py +3 -3
preprocessor.py CHANGED
@@ -39,10 +39,10 @@ def preprocess(data):
39
 
40
  def clean_message(message):
41
  # Remove messages containing '<Media ...>'
42
- if '<Media' in message:
43
  return ''
44
- # Remove words with less than 3 characters
45
- words = [word for word in message.split() if len(word) >= 3]
46
  # Remove stopwords
47
  words = [word for word in words if word.lower() not in combined_stop_words]
48
  return ' '.join(words)
 
39
 
40
  def clean_message(message):
41
  # Remove messages containing '<Media ...>'
42
+ if '<Médias omis>' in message:
43
  return ''
44
+ # Remove words with less than 4 characters
45
+ words = [word for word in message.split() if len(word) >= 4]
46
  # Remove stopwords
47
  words = [word for word in words if word.lower() not in combined_stop_words]
48
  return ' '.join(words)