Sam Chaudry committed
Commit · 35311c2
1 Parent(s): 0ed4968
Optimisations

Files changed: media_trust.py (+27 -10)
media_trust.py
CHANGED
@@ -5,10 +5,15 @@ import datetime
 import nltk
 from datetime import datetime, timedelta
 from nltk.sentiment.vader import SentimentIntensityAnalyzer
-
+
+try:
+    nltk.data.find('sentiment/vader_lexicon')
+except LookupError:
+    nltk.download('vader_lexicon')
 
 from transformers import pipeline
 summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+from concurrent.futures import ThreadPoolExecutor
 
 from dotenv import load_dotenv
 import os
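With this hunk the VADER lexicon is only fetched when nltk.data.find() raises LookupError, rather than being downloaded on every start-up, and ThreadPoolExecutor is imported for the parallelised steps below. If more NLTK resources are added later, the same guard could be factored into a small helper; a minimal sketch (the helper name is illustrative, not part of this commit):

import nltk

def ensure_nltk_resource(path, name):
    # Download an NLTK resource only if it is not already installed locally.
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(name)

# The lexicon needed by SentimentIntensityAnalyzer
ensure_nltk_resource('sentiment/vader_lexicon', 'vader_lexicon')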
@@ -60,14 +65,14 @@ def query(topic, sort_by="popularity", max_tokens=100):
         return None
 
     today = datetime.today()
-    last_week = today - timedelta(days=
+    last_week = today - timedelta(days=7)
     from_date = last_week.strftime('%Y-%m-%d')
     to_date = today.strftime('%Y-%m-%d')
 
     base_url = "https://newsapi.org/v2/everything"
     url = (
         f"{base_url}?q={topic}&from={from_date}&to={to_date}"
-        f"&sortBy={sort_by}&pageSize=
+        f"&sortBy={sort_by}&pageSize=10&apiKey={api_key}"
     )
 
     try:
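The lookback window is now a fixed 7 days and the page size is capped at 10 results, with the API key appended to the hand-built URL. Because the URL is assembled with an f-string, the topic is interpolated without URL-encoding; if topics can contain spaces or special characters, letting requests build the query string handles that. A hedged sketch of the equivalent request (it assumes api_key is already in scope and that the surrounding function fetches the URL with requests):

import requests

params = {
    "q": topic,            # encoded automatically by requests
    "from": from_date,
    "to": to_date,
    "sortBy": sort_by,
    "pageSize": 10,
    "apiKey": api_key,
}
response = requests.get("https://newsapi.org/v2/everything", params=params, timeout=10)
response.raise_for_status()
articles = response.json().get("articles", [])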
@@ -103,6 +108,10 @@ def query(topic, sort_by="popularity", max_tokens=100):
 
 
 def process_data(df):
+    if df is None or df.empty or not all(col in df.columns for col in ["title", "description"]):
+        print("Invalid or empty DataFrame passed to process_data()")
+        return pd.DataFrame()
+
     df_cleaned = df.dropna(subset=["title", "description"])
     df_cleaned = df_cleaned[df_cleaned["title"].str.strip() != ""]
     df_cleaned = df_cleaned[df_cleaned["description"].str.strip() != ""]
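process_data() now rejects None, empty, or mis-shaped input up front and returns an empty DataFrame instead of failing later with a KeyError. A quick illustration of the guard (the inputs are made up for the example):

import pandas as pd

# Missing the required columns, so the guard returns an empty DataFrame
print(process_data(pd.DataFrame({"url": ["https://example.com"]})).empty)   # True

# Well-formed input passes through to the cleaning steps
df = pd.DataFrame({"title": ["Headline"], "description": ["Some text."]})
print(len(process_data(df)))                                                # 1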
@@ -110,15 +119,20 @@ def process_data(df):
     df_cleaned["text"] = df_cleaned["title"] + df_cleaned["description"].str.lower()
     return df_cleaned
 
+
 def analyse_sentiment(df):
     analyser = SentimentIntensityAnalyzer()
 
-
-
-
-
+    def get_scores(text):
+        scores = analyser.polarity_scores(text)
+        return scores['compound'], scores['neg'], scores['neu'], scores['pos']
+
+    with ThreadPoolExecutor(max_workers=4) as executor:
+        results = list(executor.map(get_scores, df['text']))
+
+    df[['compound', 'neg', 'neu', 'pos']] = results
 
-    def
+    def label_sentiment(score):
         if score >= 0.05:
             return "positive"
         elif score <= -0.05:
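executor.map preserves input order, so results lines up row-for-row with df['text'] and the four-column assignment is positional. Worth noting that polarity_scores is pure-Python and CPU-bound, so the thread pool gains little past the GIL; a plain pandas version (reusing analyser and df from the function above) should behave the same and is sketched here only for comparison, not as part of the commit:

# Single-threaded alternative: expand each VADER score dict into columns
scores = df['text'].apply(analyser.polarity_scores).apply(pd.Series)
df[['compound', 'neg', 'neu', 'pos']] = scores[['compound', 'neg', 'neu', 'pos']]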
@@ -126,7 +140,7 @@ def analyse_sentiment(df):
         else:
             return "neutral"
 
-    df['sentiment_label'] = df['compound'].apply(
+    df['sentiment_label'] = df['compound'].apply(label_sentiment)
     return df
 
 def get_bias_label(source_name):
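The ±0.05 cut-offs on the compound score are the thresholds conventionally used with VADER. A tiny worked check of the labelling (conceptual, since label_sentiment is local to analyse_sentiment; the values are illustrative and it assumes the branch elided between the hunks returns "negative"):

assert label_sentiment(0.62) == "positive"    # compound >= 0.05
assert label_sentiment(-0.41) == "negative"   # compound <= -0.05
assert label_sentiment(0.0) == "neutral"      # everything in between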
@@ -175,7 +189,10 @@ def summarise_text(row, max_tokens=512):
         return pd.Series({'summary': 'Summary unavailable', 'bias_score': 'unknown', 'source': 'unknown'})
 
 def add_article_summaries(df, max_tokens=512):
-
+    with ThreadPoolExecutor(max_workers=4) as executor:
+        summaries = list(executor.map(lambda row: summarise_text(row, max_tokens), df.to_dict('records')))
+
+    summary_df = pd.DataFrame(summaries)
     df[['summary', 'bias_score', 'source']] = summary_df
     return df
 
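summarise_text() returns a pd.Series per row, so executor.map yields results in input order and pd.DataFrame(summaries) keeps that order with a fresh 0..n-1 index. One thing to watch: assigning a DataFrame to df[['summary', 'bias_score', 'source']] aligns on index, so if df reaches this function with a filtered, non-consecutive index the values can land as NaN. A hedged guard for that case (not in the commit):

summary_df = pd.DataFrame(summaries)
summary_df.index = df.index   # align with df before the multi-column assignment
df[['summary', 'bias_score', 'source']] = summary_df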