Update app.py
app.py CHANGED
@@ -4,21 +4,56 @@ import sys
 import dataset
 import engine
 from model import BERTBaseUncased
-
+
 import config
 from transformers import pipeline, AutoTokenizer, AutoModel
 import gradio as gr
 
+from ekphrasis.classes.preprocessor import TextPreProcessor
+from ekphrasis.classes.tokenizer import SocialTokenizer
+from ekphrasis.dicts.emoticons import emoticons
+
 device = config.device
 model = BERTBaseUncased()
 model.load_state_dict(torch.load(config.MODEL_PATH, map_location=torch.device(device)),strict=False)
 model.to(device)
 
-
-
+
+
+text_processor = TextPreProcessor(
+    # terms that will be normalized
+    normalize=['url', 'email', 'percent', 'money', 'phone', 'user'],
+    # terms that will be annotated
+    annotate={},
+    fix_html=True,  # fix HTML tokens
+
+    # corpus from which the word statistics are going to be used
+    # for word segmentation
+    segmenter="twitter",
+
+    # corpus from which the word statistics are going to be used
+    # for spell correction
+    corrector="twitter",
+
+    unpack_hashtags=False,  # perform word segmentation on hashtags
+    unpack_contractions=False,  # Unpack contractions (can't -> can not)
+    spell_correct_elong=False,  # spell correction for elongated words
+
+    # select a tokenizer. You can use SocialTokenizer, or pass your own
+    # the tokenizer, should take as input a string and return a list of tokens
+    tokenizer=SocialTokenizer(lowercase=True).tokenize,
+
+    # list of dictionaries, for replacing tokens extracted from the text,
+    # with other expressions. You can pass more than one dictionaries.
+    dicts=[]
+)
+
+# T = tokenizer.TweetTokenizer(
+#     preserve_handles=True, preserve_hashes=True, preserve_case=False, preserve_url=False)
 
 def preprocess(text):
-    tokens = T.tokenize(text)
+    # tokens = T.tokenize(text)
+    tokens = text_processor.pre_process_docs(text)
     print(tokens, file=sys.stderr)
     ptokens = []
     for index, token in enumerate(tokens):
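
For context, the snippet below is a minimal, self-contained sketch of the ekphrasis preprocessing this commit wires in, configured with the same TextPreProcessor options as the diff. It is not part of the commit: the sample tweet is invented for illustration, and it uses pre_process_doc, the single-string entry point shown in the ekphrasis README, rather than the pre_process_docs call used in app.py.

```python
# Standalone sketch (not part of the commit) of the ekphrasis pipeline configured in app.py.
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

text_processor = TextPreProcessor(
    # normalize these entity types to placeholder tokens (<url>, <user>, ...)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user'],
    annotate={},                 # no annotation tags
    fix_html=True,               # unescape HTML entities
    segmenter="twitter",         # word statistics for segmentation
    corrector="twitter",         # word statistics for spell correction
    unpack_hashtags=False,
    unpack_contractions=False,
    spell_correct_elong=False,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[]
)

if __name__ == "__main__":
    sample = "@someuser loving this!!! http://t.co/example"  # hypothetical input
    # pre_process_doc processes one string and, with a tokenizer set, returns a token list.
    tokens = text_processor.pre_process_doc(sample)
    print(tokens)
```

One thing worth double-checking in the committed preprocess: text_processor.pre_process_docs(text) is called with a single string, while in ekphrasis that method is the batch variant that iterates over a collection of documents; the single-document call is pre_process_doc, as sketched above.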