ihaveaplan66 committed
Commit bdfd7d2 · verified · 1 Parent(s): 64a6581

Update main.py

Files changed (1):
  1. main.py +99 -95
main.py CHANGED
@@ -1,95 +1,99 @@
 import requests
 from collections import Counter
 from transformers import pipeline
 import nltk
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 import string
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import torch
+import os
 
+nltk.data.path.append('/app/nltk_data')
+os.environ['TRANSFORMERS_CACHE'] = '/app/transformers_cache'
+
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('punkt_tab')
 
 
 # 1. Function for getting news via NewsAPI
 def get_news(query, api_key, num_articles=5):
     url = f'https://newsapi.org/v2/everything?q={query}&apiKey={api_key}&language=en&pageSize={num_articles}'
     response = requests.get(url)
     if response.status_code == 200:
         return response.json()['articles']
     return []
 
 
 # 2. Analyzing tone with Hugging Face
 tone_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", revision="714eb0f")
 
 def analyze_sentiment(text):
     return tone_analyzer(text)[0]
 
 
 # 3. Define category
 
 category_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/tweet-topic-21-multi")
 category_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/tweet-topic-21-multi")
 labels = ['art', 'business', 'entertainment', 'environment', 'fashion', 'finance', 'food',
           'health', 'law', 'media', 'military', 'music', 'politics', 'religion', 'sci/tech',
           'sports', 'travel', 'weather', 'world news', 'none']
 
 def classify_category(text):
     inputs = category_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     outputs = category_model(**inputs)
     predicted_class = torch.argmax(outputs.logits, dim=1).item()
     return labels[predicted_class]
 
 
 # 4. Summarization
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
 def split_text(text, max_tokens=512):
     words = text.split()
     return [' '.join(words[i:i+max_tokens]) for i in range(0, len(words), max_tokens)]
 
 def summarize_text(text):
     chunks = split_text(text)
     summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text'] for chunk in chunks]
     return ' '.join(summaries)
 
 
 # 5. Search for trending words
 def extract_trending_words(texts):
     text = ' '.join(texts).lower()
     words = word_tokenize(text)
     words = [word for word in words if word not in stopwords.words('english') and word not in string.punctuation and len(word) > 1]
     word_freq = Counter(words)
     return word_freq.most_common(10)
 
 # 6. The main process of analyzing news
 def analyze_news(query, api_key, num_articles=5):
     articles = get_news(query, api_key, num_articles)
 
     if not articles:
         return []
 
     news_results = []
     for article in articles:
         title = article.get('title', 'No Title')
         description = article.get('description', '') or ''
         url = article.get('url', '#')
 
         sentiment = analyze_sentiment(title + " " + description)['label']
         category = classify_category(title + " " + description)
         summary = summarize_text(title + " " + description)
 
         news_results.append({
             "title": title,
             "url": url,
             "sentiment": sentiment,
             "category": category,
             "summary": summary
         })
 
     return news_results
 
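For context, a minimal usage sketch of the updated module (not part of this commit): it assumes the functions above are importable from main and that a valid NewsAPI key is supplied via a hypothetical NEWS_API_KEY environment variable.

import os
from main import analyze_news, extract_trending_words

# NEWS_API_KEY is an assumed convention, not something this commit defines.
api_key = os.environ['NEWS_API_KEY']

# Fetch, classify, and summarize a few articles.
results = analyze_news('climate change', api_key, num_articles=3)
for item in results:
    print(item['title'], item['sentiment'], item['category'])
    print(item['summary'])

# Top-10 most frequent non-stopword tokens across the generated summaries.
print(extract_trending_words([item['summary'] for item in results]))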