File size: 3,056 Bytes
835b4d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5708132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
835b4d7
 
 
5708132
835b4d7
c4dc8e0
835b4d7
c4dc8e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
import joblib

import re
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')

import streamlit as st 


# Preprocess function
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    r"""Clean raw text before feeding it to the sentiment models.

    Steps: strip URLs, HTML tags, punctuation, and emojis; lowercase;
    tokenize; drop English stopwords. Note that digits are KEPT —
    ``[^\w\s]`` removes only non-word, non-space characters and ``\w``
    matches digits (the original comment claimed otherwise).

    Args:
        text: Raw input string.

    Returns:
        A single space-joined string of the surviving tokens.
    """
    # Remove URLs
    url_pattern = re.compile(r'https?://\S+')
    text = url_pattern.sub(' ', text)

    # Remove HTML Tags
    html_pattern = re.compile(r'<[^<>]+>')
    text = html_pattern.sub(' ', text)

    # Remove punctuation (digits are word characters, so they remain)
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove emojis. Ranges below are the deduplicated union of the
    # original list (several entries were repeated or subsumed by
    # wider ranges); the matched character set is unchanged.
    emoji_pattern = re.compile("["
        u"\U0001F300-\U0001F5FF"   # symbols & pictographs
        u"\U0001F600-\U0001F64F"   # emoticons
        u"\U0001F680-\U0001F6FF"   # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"   # regional indicators (flags)
        u"\U0001F926-\U0001F937"   # gesture emoji
        u"\U00002702-\U000027B0"   # dingbats
        u"\U000024C2-\U0001F251"   # enclosed characters
        u"\u200d"                  # zero-width joiner
        u"\u2640-\u2642"           # gender signs
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(' ', text)

    # Lowercase BEFORE stopword filtering — NLTK stopwords are lowercase
    text = text.lower()

    # Tokenize and remove stopwords (set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in word_tokenize(text)
              if token not in stop_words]

    # Join tokens back into a single string
    return ' '.join(tokens)

# Load the two pre-trained sklearn pipelines from disk at import time.
# Both files are joblib-pickled Pipeline objects (TF-IDF + classifier);
# paths are relative to the working directory the app is launched from.
# NOTE(review): joblib.load unpickles arbitrary code — only load model
# files from a trusted source.
model_NB_path = './model_NB.sav'
model_NB = joblib.load(model_NB_path)

model_LR_path = './model_LR.sav'
model_LR = joblib.load(model_LR_path)

def sentiment_analysis_LR(input):
    """Classify the sentiment of *input* with the Logistic Regression model.

    Args:
        input: Raw, unpreprocessed text.

    Returns:
        0 for negative sentiment, 1 for positive.
    """
    cleaned = preprocess_text(input)

    # model_LR is a full sklearn Pipeline (TF-IDF vectorizer followed by
    # LogisticRegression). Pipeline.predict runs every step in order, so
    # there is no need to pull the steps out of named_steps by their
    # hard-coded names — doing so was fragile (the NB model used
    # different step names) and duplicated the pipeline's own logic.
    user_pred = model_LR.predict([cleaned])

    # Normalize the predicted label to a plain 0/1 int
    return 0 if user_pred[0] == 0 else 1

def sentiment_analysis_NB(input):
    """Classify the sentiment of *input* with the Naive Bayes model.

    Args:
        input: Raw, unpreprocessed text.

    Returns:
        0 for negative sentiment, 1 for positive.
    """
    cleaned = preprocess_text(input)

    # model_NB is a full sklearn Pipeline (TF-IDF vectorizer followed by
    # MultinomialNB). Pipeline.predict runs both steps, so the previous
    # manual lookup of named_steps['tfidf'] / named_steps['nb'] is
    # unnecessary. This also fixes the block's 2-space indentation,
    # which was inconsistent with the rest of the file.
    user_pred = model_NB.predict([cleaned])

    # Normalize the predicted label to a plain 0/1 int
    return 0 if user_pred[0] == 0 else 1

# Streamlit UI: read a sentence from the user and show the LR model's
# verdict. An empty text area is falsy, so nothing runs until the user
# types something.
text = st.text_area('Enter some text !!! (English text : D )')
if text:
    verdict = ('The sentence is negative'
               if sentiment_analysis_LR(text) == 0
               else 'The sentence is positive')
    st.write(verdict)