non2013 committed on
Commit
ca3c933
·
1 Parent(s): f5b14b4

edit preprocess

Browse files
Files changed (1) hide show
  1. app.py +1 -5
app.py CHANGED
@@ -28,10 +28,6 @@ with open('lemma_dict.pkl', 'rb') as f:
28
  nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
29
  nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
30
 
31
- OOV_INDEX = 0
32
- word_dict = {"<OOV>": OOV_INDEX} # OOV token at index 0.
33
- word_index = 1
34
-
35
  def preprocess_text(text):
36
  """Preprocess the input text using SpaCy and return word indices."""
37
  docs = nlp.pipe([text], n_process=1)
@@ -40,7 +36,7 @@ def preprocess_text(text):
40
  for token in doc:
41
  if token.pos_ != "PUNCT":
42
  if token.text not in word_dict:
43
- word_dict[token.text] = OOV_INDEX
44
  word_seq.append(word_dict[token.text])
45
  return word_seq
46
 
 
28
  nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
29
  nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
30
 
 
 
 
 
31
  def preprocess_text(text):
32
  """Preprocess the input text using SpaCy and return word indices."""
33
  docs = nlp.pipe([text], n_process=1)
 
36
  for token in doc:
37
  if token.pos_ != "PUNCT":
38
  if token.text not in word_dict:
39
+ word_dict[token.text] = 0 # OOV_INDEX
40
  word_seq.append(word_dict[token.text])
41
  return word_seq
42