hamdie committed
Commit b6b63c7 · verified · 1 Parent(s): 79a930e

Update app.py

Update app.py

Files changed (1): app.py +70 -3
app.py CHANGED
@@ -4,9 +4,76 @@ import tensorflow as tf
 
  model = tf.saved_model.load('arabert_pretrained')
 
- def sentiment_analysis(text):
-     prediction = model.predict(text)
-     return prediction
+ 
+ import pandas as pd
+ 
+ # Load the cleaned tweet dataset (raw string so the backslash in the
+ # Windows-style path is not treated as an escape sequence)
+ df = pd.read_csv(r'put\data_cleaned1.csv')
+ 
+ from transformers import AutoTokenizer
+ 
+ arabert_tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabert')
+ 
+ # Split the DataFrame into two parts: label=1 and label=0
+ label_1_df = df[df['data_labels'] == 1]
+ label_0_df = df[df['data_labels'] == 0]
+ 
+ # Sample an equal number of rows from each label
+ sample_size = min(len(label_1_df), len(label_0_df))
+ sample_label_1 = label_1_df.sample(n=sample_size, random_state=42)
+ sample_label_0 = label_0_df.sample(n=sample_size, random_state=42)
+ 
+ # Concatenate the two samples to get the final balanced sample
+ balanced_sample = pd.concat([sample_label_1, sample_label_0])
+ 
+ # Shuffle the rows in the balanced sample and reset the index
+ balanced_sample = balanced_sample.sample(frac=1, random_state=42)
+ balanced_sample.reset_index(inplace=True, drop=True)
+ 
+ from sklearn.model_selection import train_test_split
+ 
+ tweets = balanced_sample['cleaned_text']
+ labels = balanced_sample['data_labels']
+ 
+ # Stratified split preserves the 50/50 label balance in both sets
+ X_train, X_test, y_train, y_test = train_test_split(
+     tweets, labels, stratify=labels, test_size=0.15, random_state=1)
+ 
+ def preprocess_input_data(texts, tokenizer, max_len=120):
+     """Tokenize and preprocess the input data for the AraBERT model.
+ 
+     Args:
+         texts (list): List of text strings.
+         tokenizer (AutoTokenizer): AraBERT tokenizer from the transformers library.
+         max_len (int, optional): Maximum sequence length. Defaults to 120.
+ 
+     Returns:
+         Tuple of lists: input token IDs and attention masks.
+     """
+     # Tokenize the text data, padding/truncating every sequence to max_len
+     # (padding='max_length' is the current transformers API for fixed-length padding)
+     tokenized_data = [tokenizer.encode_plus(
+         t,
+         max_length=max_len,
+         padding='max_length',
+         truncation=True,
+         add_special_tokens=True) for t in texts]
+ 
+     # Extract tokenized input IDs and attention masks
+     input_ids = [data['input_ids'] for data in tokenized_data]
+     attention_mask = [data['attention_mask'] for data in tokenized_data]
+ 
+     return input_ids, attention_mask
+ 
+ def sentiment_analysis(text):
+     # Wrap the single input string in a list so it is tokenized as one
+     # sequence rather than character by character
+     X_input_ids, X_attention_mask = preprocess_input_data([text], arabert_tokenizer)
+     # Run the loaded SavedModel on the batch of token IDs (assumed: the
+     # SavedModel's serving signature accepts a single tensor of token IDs)
+     predictions = model(tf.constant(X_input_ids))
+     return str(predictions.numpy())
+ 
+ import gradio as gr
+ 
 
  iface = gr.Interface(fn=sentiment_analysis, inputs="text", outputs="text")
  iface.launch()
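
For reference, a minimal sketch of what the preprocess_input_data tokenization added above produces for a single sentence; it assumes only that the transformers package and the 'aubmindlab/bert-base-arabert' checkpoint used in app.py are available:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabert')
encoded = tokenizer.encode_plus(
    'مرحبا',                  # any Arabic input string
    max_length=120,
    padding='max_length',
    truncation=True,
    add_special_tokens=True)

print(len(encoded['input_ids']))       # 120: every sequence is padded to max_len
print(sum(encoded['attention_mask']))  # count of real (non-padding) tokens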
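
Once iface.launch() is running, the endpoint can also be exercised programmatically. A minimal sketch, assuming the app is served at Gradio's default local URL and that the gradio_client package is installed ('/predict' is Gradio's default api_name for a single-function Interface):

from gradio_client import Client

client = Client('http://127.0.0.1:7860')                # assumed default local URL
result = client.predict('مرحبا', api_name='/predict')   # hypothetical sample input
print(result)                                           # prediction array rendered as text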