updating analyzers to return flagged_phrases list for each.
updating scraper to maintain article formatting on return to front end.
app/routers/analyze.py
CHANGED
@@ -53,7 +53,6 @@ class AnalysisResponse(BaseModel):
     bias: str
     bias_score: float
     bias_percentage: float
-    flagged_phrases: List[str]
     media_score: MediaScore

 @router.post("/analyze", response_model=AnalysisResponse)
@@ -109,14 +108,13 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
         "bias": str(analysis['details']['bias_analysis']['bias']),
         "bias_score": float(analysis['details']['bias_analysis']['bias_score']),
         "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage']),
-        "flagged_phrases": list(analysis['details']['sentiment_analysis']['flagged_phrases']),
         "media_score": {
             "media_unmasked_score": float(analysis['media_unmasked_score']),
             "rating": str(analysis['rating']),
             "details": {
                 "headline_analysis": {
                     "headline_vs_content_score": float(analysis['details']['headline_analysis']['headline_vs_content_score']),
-                    "
+                    "flagged_phrases": analysis['details']['headline_analysis'].get('flagged_phrases', [])
                 },
                 "sentiment_analysis": {
                     "sentiment": str(analysis['details']['sentiment_analysis']['sentiment']),
@@ -126,10 +124,12 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
                 "bias_analysis": {
                     "bias": str(analysis['details']['bias_analysis']['bias']),
                     "bias_score": float(analysis['details']['bias_analysis']['bias_score']),
-                    "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage'])
+                    "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage']),
+                    "flagged_phrases": list(analysis['details']['bias_analysis']['flagged_phrases'])
                 },
                 "evidence_analysis": {
-                    "evidence_based_score": float(analysis['details']['evidence_analysis']['evidence_based_score'])
+                    "evidence_based_score": float(analysis['details']['evidence_analysis']['evidence_based_score']),
+                    "flagged_phrases": list(analysis['details']['evidence_analysis']['flagged_phrases'])
                 }
             }
         }
@@ -144,7 +144,6 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
         'bias': response_dict['bias'],
         'bias_score': response_dict['bias_score'],
         'bias_percentage': response_dict['bias_percentage'],
-        'flagged_phrases': response_dict['flagged_phrases'],
         'media_score': response_dict['media_score']
     }).execute()

@@ -157,39 +156,3 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
             status_code=500,
             detail=f"Analysis failed: {str(e)}"
         )
-
-@router.get("/debug")
-async def debug_response():
-    mock_analysis = {
-        "headline": "Test Headline",
-        "content": "Test content",
-        "sentiment": "Neutral",
-        "bias": "Neutral",
-        "bias_score": 0.75,  # Note: 0-1 scale
-        "bias_percentage": 0,
-        "flagged_phrases": ["test phrase"],
-        "media_score": {
-            "media_unmasked_score": 75.5,
-            "rating": "Some Bias Present",
-            "details": {
-                "headline_analysis": {
-                    "headline_vs_content_score": 20,
-                    "contradictory_phrases": ["Sample contradiction"]
-                },
-                "sentiment_analysis": {
-                    "sentiment": "Neutral",
-                    "manipulation_score": 30,
-                    "flagged_phrases": ["Sample manipulative phrase"]
-                },
-                "bias_analysis": {
-                    "bias": "Neutral",
-                    "bias_score": 0.75,
-                    "bias_percentage": 0
-                },
-                "evidence_analysis": {
-                    "evidence_based_score": 80
-                }
-            }
-        }
-    }
-    return AnalysisResponse.parse_obj(mock_analysis)
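With this change, flagged_phrases moves off the top level of the response and into each analyzer's block under media_score.details. A minimal client-side sketch of reading the reshaped payload, assuming ArticleRequest takes a url field (the host and article URL are placeholders, not from the diff):

import requests

# Placeholder host; the route and payload shape follow the diff above.
API = "http://localhost:8000"

resp = requests.post(f"{API}/analyze", json={"url": "https://example.com/article"})
resp.raise_for_status()
data = resp.json()

# flagged_phrases is no longer top-level; read it per analyzer instead.
details = data["media_score"]["details"]
for section in ("headline_analysis", "sentiment_analysis", "bias_analysis", "evidence_analysis"):
    print(section, details[section].get("flagged_phrases", []))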
mediaunmasked/analyzers/bias_analyzer.py
CHANGED
@@ -25,16 +25,21 @@ class BiasAnalyzer:
         try:
             text_lower = text.lower()

-
+            flagged_phrases = []
+
+            # Count matches and collect flagged phrases
             left_count = sum(1 for word in self.left_keywords if word in text_lower)
+            flagged_phrases.extend([word for word in self.left_keywords if word in text_lower])
             right_count = sum(1 for word in self.right_keywords if word in text_lower)
+            flagged_phrases.extend([word for word in self.right_keywords if word in text_lower])

             total_words = left_count + right_count
             if total_words == 0:
                 return {
                     "bias": "Neutral",
                     "bias_score": 0.0,  # True neutral
-                    "bias_percentage": 0  # Neutral percentage
+                    "bias_percentage": 0,  # Neutral percentage
+                    "flagged_phrases": []
                 }

             # New bias score formula (-1.0 left, 0.0 neutral, 1.0 right)
@@ -63,7 +68,8 @@ class BiasAnalyzer:
             return {
                 "bias": bias,
                 "bias_score": round(bias_score, 2),  # Keep 2 decimal places
-                "bias_percentage": abs(round(bias_percentage, 1))
+                "bias_percentage": abs(round(bias_percentage, 1)),
+                "flagged_phrases": flagged_phrases
             }

         except Exception as e:
@@ -71,5 +77,6 @@ class BiasAnalyzer:
             return {
                 "bias": "Error",
                 "bias_score": 0.0,
-                "bias_percentage": 0
+                "bias_percentage": 0,
+                "flagged_phrases": []
             }
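The new code scans each keyword list twice, once for the count and once to extend flagged_phrases. A single pass per list keeps the count and the flags consistent by construction; a minimal standalone sketch (the keyword lists here are illustrative, not the analyzer's real ones):

from typing import List, Tuple

def keyword_matches(keywords: List[str], text_lower: str) -> Tuple[int, List[str]]:
    """One pass: the matched words drive both the count and the flagged list."""
    matches = [word for word in keywords if word in text_lower]
    return len(matches), matches

left_count, left_flags = keyword_matches(["progressive", "welfare"], "a progressive tax plan")
right_count, right_flags = keyword_matches(["deregulation"], "a progressive tax plan")
flagged_phrases = left_flags + right_flags
print(left_count, right_count, flagged_phrases)  # 1 0 ['progressive']

Since matching is plain substring containment, a keyword like "war" would also flag "warden"; word-boundary regexes are a common tightening if that matters.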
mediaunmasked/analyzers/headline_analyzer.py
CHANGED
@@ -3,6 +3,8 @@ from typing import Dict, Any, List
 from transformers import pipeline
 from transformers import AutoTokenizer
 import numpy as np
+import nltk
+from nltk.tokenize import sent_tokenize

 logger = logging.getLogger(__name__)

@@ -54,19 +56,35 @@ class HeadlineAnalyzer:

     def _analyze_section(self, headline: str, section: str) -> Dict[str, float]:
         """Analyze a single section of content."""
-
-
-
-
-
-
+        # Use a more robust method for sentence splitting
+        nltk.download('punkt')
+        sentences = sent_tokenize(section)
+
+        flagged_phrases = []
+        for sentence in sentences:
+            input_text = f"{headline} [SEP] {sentence}"
+            result = self.nli_pipeline(input_text, top_k=None)
+            scores = {item['label']: item['score'] for item in result}
+
+            # Log the model output for debugging
+            logger.info(f"Sentence: {sentence}")
+            logger.info(f"Scores: {scores}")
+
+            # Set the threshold for contradiction to anything higher than 0.1
+            if scores.get('CONTRADICTION', 0) > 0.1:  # Threshold set to > 0.1
+                flagged_phrases.append(sentence)
+
+        # Adjust the headline_vs_content_score based on contradictions
+        contradiction_penalty = len(flagged_phrases) * 0.1  # Example penalty per contradiction
+        adjusted_score = max(0, scores.get('ENTAILMENT', 0) - contradiction_penalty)
+
         logger.info("\nSection Analysis:")
         logger.info("-"*30)
         logger.info(f"Section preview: {section[:100]}...")
         for label, score in scores.items():
             logger.info(f"Label: {label:<12} Score: {score:.3f}")

-        return scores
+        return {"scores": scores, "flagged_phrases": flagged_phrases, "adjusted_score": adjusted_score}

     def analyze(self, headline: str, content: str) -> Dict[str, Any]:
         """Analyze how well the headline matches the content using an AI model."""
@@ -146,7 +164,7 @@ class HeadlineAnalyzer:
             "headline_vs_content_score": round(final_score, 1),
             "entailment_score": round(entailment_score, 2),
             "contradiction_score": round(contradiction_score, 2),
-            "contradictory_phrases": []
+            "contradictory_phrases": scores.get('flagged_phrases', [])
         }

         except Exception as e:
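One practical wrinkle in the new _analyze_section: nltk.download('punkt') runs on every call, i.e. for every section of every article. The download is a no-op after the first run but still costs a check; a common pattern is to resolve the tokenizer once at startup instead. A minimal sketch, assuming only that NLTK is installed:

import nltk

# Resolve the punkt tokenizer once (e.g. at module import), not per section.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", quiet=True)

from nltk.tokenize import sent_tokenize

print(sent_tokenize("Headline checks run per sentence. Each one is scored."))
# ['Headline checks run per sentence.', 'Each one is scored.']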
mediaunmasked/analyzers/scoring.py
CHANGED
@@ -75,10 +75,25 @@ class MediaScorer:
             "media_unmasked_score": round(final_score, 1),
             "rating": rating,
             "details": {
-                "headline_analysis":
-
-
-
+                "headline_analysis": {
+                    "headline_vs_content_score": headline_analysis["headline_vs_content_score"],
+                    "flagged_phrases": headline_analysis.get("flagged_phrases", [])
+                },
+                "sentiment_analysis": {
+                    "sentiment": sentiment_analysis["sentiment"],
+                    "manipulation_score": sentiment_analysis["manipulation_score"],
+                    "flagged_phrases": sentiment_analysis.get("flagged_phrases", [])
+                },
+                "bias_analysis": {
+                    "bias": bias_analysis["bias"],
+                    "bias_score": bias_analysis["bias_score"],
+                    "bias_percentage": bias_analysis["bias_percentage"],
+                    "flagged_phrases": bias_analysis.get("flagged_phrases", [])
+                },
+                "evidence_analysis": {
+                    "evidence_based_score": evidence_analysis["evidence_based_score"],
+                    "flagged_phrases": evidence_analysis.get("flagged_phrases", [])
+                }
             }
         }

@@ -93,9 +108,9 @@ class MediaScorer:
             "media_unmasked_score": 0,
             "rating": "Error",
             "details": {
-                "headline_analysis": {"headline_vs_content_score": 0, "
+                "headline_analysis": {"headline_vs_content_score": 0, "flagged_phrases": []},
                 "sentiment_analysis": {"sentiment": "Error", "manipulation_score": 0, "flagged_phrases": []},
-                "bias_analysis": {"bias": "Error", "bias_score": 0.0, "bias_percentage": 0},
-                "evidence_analysis": {"evidence_based_score": 0}
+                "bias_analysis": {"bias": "Error", "bias_score": 0.0, "bias_percentage": 0, "flagged_phrases": []},
+                "evidence_analysis": {"evidence_based_score": 0, "flagged_phrases": []}
             }
         }
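The .get(..., []) reads make the scorer tolerant of analyzers that predate the new key. Note, though, that headline_analyzer.py returns its flagged sentences under contradictory_phrases, so headline_analysis.get("flagged_phrases", []) here appears to always fall back to the empty list until the key names agree. A small self-contained illustration of the fallback (the dict contents are made up):

# Hypothetical analyzer outputs: one has the new key, one has the old name.
bias_analysis = {"bias": "Leaning Right", "flagged_phrases": ["deregulation"]}
headline_analysis = {"headline_vs_content_score": 20.0,
                     "contradictory_phrases": ["Sample contradiction"]}

print(bias_analysis.get("flagged_phrases", []))      # ['deregulation']
print(headline_analysis.get("flagged_phrases", []))  # [] because the key name differs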
mediaunmasked/scrapers/article_scraper.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Dict, Optional
+from typing import Dict, Optional
 import logging
 from urllib.parse import urlparse
 import requests
@@ -25,17 +25,15 @@ class ArticleScraper:
             response = self.session.get(url)
             response.raise_for_status()
             return response.text
-
         except Exception as e:
             self.logger.error(f"Error fetching {url}: {str(e)}")
             return None

     def _process_element(self, element) -> str:
-        """Process an HTML element while preserving
+        """Process an HTML element while preserving structure and formatting."""
         if isinstance(element, NavigableString):
             return str(element)
-
-        # Handle different types of elements
+
         tag_name = element.name

         if tag_name in ['p', 'div']:
@@ -64,90 +62,63 @@ class ArticleScraper:

         elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
             level = int(tag_name[1])
-            prefix = '#' * (level + 1)  # Add one more #
+            prefix = '#' * (level + 1)  # Add one more # for clarity
             return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'

-        # For other elements, just process their children
         return ''.join(self._process_element(child) for child in element.children)

     def _extract_content(self, container) -> str:
         """Extract and format content from a container element."""
         if not container:
             return ''
-
-        # Remove unwanted elements
+
         for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
             unwanted.decompose()
-
-        # Process the container
+
         content = self._process_element(container)

-        # Clean up extra whitespace and newlines
         content = '\n'.join(line.strip() for line in content.split('\n'))
         content = '\n'.join(filter(None, content.split('\n')))

         return content.strip()

-    def
-        """Extract content from
+    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
+        """Extract content from PolitiFact articles."""
         try:
-
-            headline =
-            headline_selectors = {
-                'politifact.com': ['h1.article__title'],
-                'snopes.com': ['header h1', 'article h1']
-            }
-
-            # Try domain-specific headline selectors
-            if domain in headline_selectors:
-                for selector in headline_selectors[domain]:
-                    headline = soup.select_one(selector)
-                    if headline:
-                        break
-
-            # Fallback to any h1 if no domain-specific headline found
-            if not headline:
-                headline = soup.find('h1')
-
-            headline_text = headline.get_text().strip() if headline else "No headline found"
-            self.logger.info(f"Found headline: {headline_text}")
-
-            # Find content - try domain-specific selectors first, then fallback to generic
-            content_div = None
-            content_selectors = {
-                'politifact.com': ['article.article', '.article__text', '.m-textblock'],
-                'snopes.com': ['article']
-            }
-
-            # Try domain-specific content selectors
-            if domain in content_selectors:
-                for selector in content_selectors[domain]:
-                    content_div = soup.select_one(selector)
-                    if content_div:
-                        break
-
-            # Fallback to generic content selectors
-            if not content_div:
-                for selector in ['article', 'main', '.content', '.article-content']:
-                    content_div = soup.select_one(selector)
-                    if content_div:
-                        break
-
-            content = self._extract_content(content_div) if content_div else "No content found"
-
-            self.logger.warning("No content found in article")
-            self.logger.debug(f"Domain: {domain}")
-
-            return {"headline": headline_text, "content": content}
+            headline = soup.find('h1', class_='article__title') or soup.find('h1')
+            headline = headline.get_text(strip=True) if headline else "No headline found"
+
+            self.logger.info(f"Found headline: {headline}")
+
+            content_div = soup.find('article', class_='article') or soup.select_one('.article__text, .m-textblock')
+            content = self._extract_content(content_div) if content_div else "No content found"
+
+            return {"headline": headline, "content": content}
+
         except Exception as e:
-            self.logger.error(f"Error extracting
+            self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
             return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}

+    def _extract_generic(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
+        """Fallback extraction method for unknown domains."""
+        headline = soup.find('h1')
+        headline_text = headline.get_text().strip() if headline else "No headline found"
+
+        content_div = None
+        common_selectors = ['article', 'main', '.content', '.article-content']
+
+        for selector in common_selectors:
+            content_div = soup.select_one(selector)
+            if content_div:
+                break
+
+        content = self._extract_content(content_div) if content_div else "No content found"
+
+        return {"headline": headline_text, "content": content}
+
     def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
         """
-        Main function to scrape
+        Main function to scrape articles while maintaining structure.
         Returns a dictionary with headline and content.
         """
         html_content = self._fetch_page(url)
@@ -159,4 +130,8 @@ class ArticleScraper:
         domain = self._get_domain(url)

         self.logger.info(f"Scraping article from domain: {domain}")
-
+
+        if 'politifact.com' in domain:
+            return self._extract_politifact(soup)
+
+        return self._extract_generic(soup, domain)
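A quick way to exercise the refactored dispatch, assuming ArticleScraper takes no constructor arguments and the package is importable as laid out above (the article URL is a placeholder):

import logging
from mediaunmasked.scrapers.article_scraper import ArticleScraper

logging.basicConfig(level=logging.INFO)

scraper = ArticleScraper()
# politifact.com domains route to _extract_politifact; everything else
# falls through to the generic h1/article/main selector fallback.
article = scraper.scrape_article("https://www.politifact.com/factchecks/example/")

if article:
    print(article["headline"])
    print(article["content"][:200])  # markdown-style text with preserved headings

One consequence of the rewrite: the old snopes.com selectors ('header h1', 'article h1') are gone, so Snopes pages now go through _extract_generic, whose 'article' selector still covers them.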