Spaces:
Runtime error
Runtime error
Hector Lopez
committed on
Commit
·
c5b702e
1
Parent(s):
c6d3bd0
Upload application logic
Browse files- app.py +36 -0
- backend.py +44 -0
- requirements.txt +3 -0
- tweet_scraper.py +48 -0
app.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Gradio Twitter analyzer application.
|
3 |
+
|
4 |
+
This module provides a gradio-based web application
|
5 |
+
for the Twitter analyzer project.
|
6 |
+
"""
|
7 |
+
import gradio as gr
|
8 |
+
|
9 |
+
from tweet_scraper import retrieve_tweet_text
|
10 |
+
from backend import predict_positivity
|
11 |
+
|
12 |
+
|
13 |
+
def process_tweet(url: str) -> str:
    """
    Classify the positivity of the tweet found at a URL.

    Args:
        url (str): Address of the tweet to analyze.

    Returns:
        str: Positivity label produced by the predictor.
    """
    # Fetch the tweet body, then hand it straight to the classifier.
    return predict_positivity(retrieve_tweet_text(url))
|
27 |
+
|
28 |
+
|
29 |
+
# Web UI: a single text box taking the tweet URL, and a text output showing
# the predicted positivity label.
app = gr.Interface(
    fn=process_tweet,
    # `gr.inputs.Textbox` is the legacy namespace (deprecated in Gradio 3 and
    # removed in Gradio 4); the top-level component is the supported spelling.
    inputs=gr.Textbox(lines=2, placeholder="Tweet url..."),
    outputs="text",
)

if __name__ == "__main__":
    # The return value of launch() is intentionally not unpacked into `app`:
    # doing so would rebind the module-level Interface to the server object.
    app.launch()
|
backend.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Positivity predictor.
|
3 |
+
|
4 |
+
This module provides the functionality to predict
|
5 |
+
a tweet's positivity using a BERT model.
|
6 |
+
"""
|
7 |
+
import torch
|
8 |
+
from transformers import BertForSequenceClassification, BertTokenizer
|
9 |
+
|
10 |
+
# Tokenizer matching the base checkpoint the classifier was fine-tuned from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Build the 5-class classification architecture from the base checkpoint.
# `local_files_only=True` was dropped: the tokenizer above is already fetched
# from the hub, and with an empty cache the model load would otherwise fail.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5,
    output_attentions=False,
    output_hidden_states=False,
)

# Replace the base weights with the fine-tuned ones. `map_location` lets a
# checkpoint that was saved on a GPU be deserialized on a CPU-only host.
model.load_state_dict(
    torch.load("data/BERT_ft_epoch5.model", map_location=torch.device("cpu"))
)

# Inference mode: disables dropout.
model.eval()
|
20 |
+
|
21 |
+
|
22 |
+
def predict_positivity(text: str) -> str:
    """
    Predict the positivity of a given tweet.

    Args:
        text (str): Tweet's text.

    Returns:
        str: Predicted positivity, one of "Extremely Negative", "Negative",
            "Neutral", "Positive" or "Extremely Positive".
    """
    label_dict = {
        0: "Extremely Negative",
        1: "Negative",
        2: "Neutral",
        3: "Positive",
        4: "Extremely Positive",
    }
    # Truncate to the tokenizer's maximum length so an unexpectedly long
    # input cannot overflow the model's position embeddings.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)

    # Pure inference: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Single input sequence, so the arg-max over the class axis is one id.
    predicted_class_id = logits.argmax(dim=-1).item()

    return label_dict[predicted_class_id]
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
transformers
|
3 |
+
gradio
|
tweet_scraper.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Twitter scraper.
|
3 |
+
|
4 |
+
This module provides the functionality to retrieve
|
5 |
+
a tweet's text given a tweet's URL.
|
6 |
+
"""
|
7 |
+
import re
|
8 |
+
|
9 |
+
import requests
|
10 |
+
|
11 |
+
|
12 |
+
def _extract_tweet_text(raw_html: str) -> str:
    """Reduce the oEmbed HTML snippet to the tweet's own text."""
    # Remove links (anchor tags together with their visible text)
    pattern = r"<[a][^>]*>(.+?)</[a]>"
    html = re.sub(pattern, "", raw_html)

    # Remove the remaining HTML tags and keep the first non-blank line;
    # fall back to an empty string instead of raising when nothing is left.
    stripped = (line.strip() for line in re.sub("<.*?>", "", html).splitlines())
    text = next((line for line in stripped if line), "")

    # Cut at a picture link if one survived, then at the "&mdash" that
    # separates the tweet from its author. Matching the full
    # "pic.twitter.com" host avoids truncating on ordinary words that merely
    # contain "pic" (e.g. "topic", "epic").
    for marker in ("pic.twitter.com", "&mdash"):
        idx = text.find(marker)
        if idx != -1:
            text = text[:idx]

    return text


def retrieve_tweet_text(tweet_url: str) -> str:
    """
    Retrieve a tweet's text.

    Args:
        tweet_url (str): Tweet's URL.

    Returns:
        str: Tweet's parsed text (empty if no text could be extracted).

    Raises:
        requests.HTTPError: If the oEmbed endpoint answers with an error
            status (e.g. the tweet does not exist).
        requests.RequestException: On connection problems or timeout.
    """
    # Query the oEmbed endpoint for the tweet-related data. Passing the tweet
    # URL through `params` percent-encodes it, so any "?" or "&" it contains
    # cannot corrupt the query string.
    response = requests.get(
        "https://publish.twitter.com/oembed",
        params={"dnt": "true", "omit_script": "true", "url": tweet_url},
        timeout=10,  # never hang the UI on an unresponsive endpoint
    )
    response.raise_for_status()

    # Get the raw html containing the tweet text
    raw_html = response.json()["html"]

    return _extract_tweet_text(raw_html)
|