Spaces:

non2013
/

SincereQuestions

Sleeping

non2013 committed on Oct 20, 2024

Commit

ca3c933

1 Parent(s): f5b14b4

edit preprocess

Files changed (1) hide show

app.py CHANGED Viewed

@@ -28,10 +28,6 @@ with open('lemma_dict.pkl', 'rb') as f:
 nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
 nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
-OOV_INDEX = 0
-word_dict = {"<OOV>": OOV_INDEX}  # OOV token at index 0.
-word_index = 1
 def preprocess_text(text):
     """Preprocess the input text using SpaCy and return word indices."""
     docs = nlp.pipe([text], n_process=1)
@@ -40,7 +36,7 @@ def preprocess_text(text):
         for token in doc:
             if token.pos_ != "PUNCT":
                 if token.text not in word_dict:
-                    word_dict[token.text] = OOV_INDEX
                 word_seq.append(word_dict[token.text])
     return word_seq

 nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
 nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
 def preprocess_text(text):
     """Preprocess the input text using SpaCy and return word indices."""
     docs = nlp.pipe([text], n_process=1)
         for token in doc:
             if token.pos_ != "PUNCT":
                 if token.text not in word_dict:
+                    word_dict[token.text] = 0 # OOV_INDEX
                 word_seq.append(word_dict[token.text])
     return word_seq