wozwize committed
Commit 55cdb25 · 1 Parent(s): a9d5552

Updating analyzers to return a flagged_phrases list for each analyzer.

Updating the scraper to maintain article formatting on return to the front end.

app/routers/analyze.py CHANGED
@@ -53,7 +53,6 @@ class AnalysisResponse(BaseModel):
     bias: str
     bias_score: float
     bias_percentage: float
-    flagged_phrases: List[str]
     media_score: MediaScore
 
 @router.post("/analyze", response_model=AnalysisResponse)
@@ -109,14 +108,13 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
         "bias": str(analysis['details']['bias_analysis']['bias']),
         "bias_score": float(analysis['details']['bias_analysis']['bias_score']),
         "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage']),
-        "flagged_phrases": list(analysis['details']['sentiment_analysis']['flagged_phrases']),
         "media_score": {
             "media_unmasked_score": float(analysis['media_unmasked_score']),
             "rating": str(analysis['rating']),
             "details": {
                 "headline_analysis": {
                     "headline_vs_content_score": float(analysis['details']['headline_analysis']['headline_vs_content_score']),
-                    "contradictory_phrases": analysis['details']['headline_analysis'].get('contradictory_phrases', [])
+                    "flagged_phrases": analysis['details']['headline_analysis'].get('flagged_phrases', [])
                 },
                 "sentiment_analysis": {
                     "sentiment": str(analysis['details']['sentiment_analysis']['sentiment']),
@@ -126,10 +124,12 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
                 "bias_analysis": {
                     "bias": str(analysis['details']['bias_analysis']['bias']),
                     "bias_score": float(analysis['details']['bias_analysis']['bias_score']),
-                    "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage'])
+                    "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage']),
+                    "flagged_phrases": list(analysis['details']['bias_analysis']['flagged_phrases'])
                 },
                 "evidence_analysis": {
-                    "evidence_based_score": float(analysis['details']['evidence_analysis']['evidence_based_score'])
+                    "evidence_based_score": float(analysis['details']['evidence_analysis']['evidence_based_score']),
+                    "flagged_phrases": list(analysis['details']['evidence_analysis']['flagged_phrases'])
                 }
             }
         }
@@ -144,7 +144,6 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
         'bias': response_dict['bias'],
         'bias_score': response_dict['bias_score'],
         'bias_percentage': response_dict['bias_percentage'],
-        'flagged_phrases': response_dict['flagged_phrases'],
         'media_score': response_dict['media_score']
     }).execute()
 
@@ -157,39 +156,3 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
             status_code=500,
             detail=f"Analysis failed: {str(e)}"
         )
-
-@router.get("/debug")
-async def debug_response():
-    mock_analysis = {
-        "headline": "Test Headline",
-        "content": "Test content",
-        "sentiment": "Neutral",
-        "bias": "Neutral",
-        "bias_score": 0.75,  # Note: 0-1 scale
-        "bias_percentage": 0,
-        "flagged_phrases": ["test phrase"],
-        "media_score": {
-            "media_unmasked_score": 75.5,
-            "rating": "Some Bias Present",
-            "details": {
-                "headline_analysis": {
-                    "headline_vs_content_score": 20,
-                    "contradictory_phrases": ["Sample contradiction"]
-                },
-                "sentiment_analysis": {
-                    "sentiment": "Neutral",
-                    "manipulation_score": 30,
-                    "flagged_phrases": ["Sample manipulative phrase"]
-                },
-                "bias_analysis": {
-                    "bias": "Neutral",
-                    "bias_score": 0.75,
-                    "bias_percentage": 0
-                },
-                "evidence_analysis": {
-                    "evidence_based_score": 80
-                }
-            }
-        }
-    }
-    return AnalysisResponse.parse_obj(mock_analysis)
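
With this change, flagged_phrases moves off the top level of AnalysisResponse and into each analyzer's sub-object under media_score.details. A minimal sketch of how a front end might consume the new shape (the host, port, and request payload here are assumptions for illustration, not part of this commit):

    import requests

    # Hypothetical deployment URL and request body; the diff only shows that
    # /analyze accepts an ArticleRequest and returns an AnalysisResponse.
    resp = requests.post(
        "http://localhost:8000/analyze",
        json={"url": "https://example.com/some-article"},
    )
    details = resp.json()["media_score"]["details"]

    # After this commit, every analyzer section carries its own flagged_phrases.
    for name in ("headline_analysis", "sentiment_analysis",
                 "bias_analysis", "evidence_analysis"):
        print(name, details[name].get("flagged_phrases", []))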
mediaunmasked/analyzers/bias_analyzer.py CHANGED
@@ -25,16 +25,21 @@ class BiasAnalyzer:
         try:
             text_lower = text.lower()
 
-            # Count matches
+            flagged_phrases = []
+
+            # Count matches and collect flagged phrases
             left_count = sum(1 for word in self.left_keywords if word in text_lower)
+            flagged_phrases.extend([word for word in self.left_keywords if word in text_lower])
             right_count = sum(1 for word in self.right_keywords if word in text_lower)
+            flagged_phrases.extend([word for word in self.right_keywords if word in text_lower])
 
             total_words = left_count + right_count
             if total_words == 0:
                 return {
                     "bias": "Neutral",
                     "bias_score": 0.0,  # True neutral
-                    "bias_percentage": 0  # Neutral percentage
+                    "bias_percentage": 0,  # Neutral percentage
+                    "flagged_phrases": []
                 }
 
             # New bias score formula (-1.0 left, 0.0 neutral, 1.0 right)
@@ -63,7 +68,8 @@ class BiasAnalyzer:
             return {
                 "bias": bias,
                 "bias_score": round(bias_score, 2),  # Keep 2 decimal places
-                "bias_percentage": abs(round(bias_percentage, 1))
+                "bias_percentage": abs(round(bias_percentage, 1)),
+                "flagged_phrases": flagged_phrases
             }
 
         except Exception as e:
@@ -71,5 +77,6 @@ class BiasAnalyzer:
             return {
                 "bias": "Error",
                 "bias_score": 0.0,
-                "bias_percentage": 0
+                "bias_percentage": 0,
+                "flagged_phrases": []
             }
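
A quick sketch of what the analyzer now returns, assuming the method shown above is BiasAnalyzer.analyze(text) (the diff omits the signature) and that the keyword lists contain terms like the ones below:

    from mediaunmasked.analyzers.bias_analyzer import BiasAnalyzer

    analyzer = BiasAnalyzer()
    result = analyzer.analyze("Critics called the plan a radical, far-left giveaway.")

    # Keys added by this commit:
    print(result["bias"], result["bias_score"], result["bias_percentage"])
    print(result["flagged_phrases"])  # every left/right keyword found in the text

Since matching is a plain substring test against the lowercased text, a short keyword can also fire inside a longer word; word-boundary matching would be the obvious refinement if that proves noisy.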
mediaunmasked/analyzers/headline_analyzer.py CHANGED
@@ -3,6 +3,8 @@ from typing import Dict, Any, List
 from transformers import pipeline
 from transformers import AutoTokenizer
 import numpy as np
+import nltk
+from nltk.tokenize import sent_tokenize
 
 logger = logging.getLogger(__name__)
 
@@ -54,19 +56,35 @@ class HeadlineAnalyzer:
 
     def _analyze_section(self, headline: str, section: str) -> Dict[str, float]:
         """Analyze a single section of content."""
-        input_text = f"{headline} [SEP] {section}"
-        result = self.nli_pipeline(input_text, top_k=None)
-
-        # Extract scores
-        scores = {item['label']: item['score'] for item in result}
-
+        # Use a more robust method for sentence splitting
+        nltk.download('punkt')
+        sentences = sent_tokenize(section)
+
+        flagged_phrases = []
+        for sentence in sentences:
+            input_text = f"{headline} [SEP] {sentence}"
+            result = self.nli_pipeline(input_text, top_k=None)
+            scores = {item['label']: item['score'] for item in result}
+
+            # Log the model output for debugging
+            logger.info(f"Sentence: {sentence}")
+            logger.info(f"Scores: {scores}")
+
+            # Set the threshold for contradiction to anything higher than 0.1
+            if scores.get('CONTRADICTION', 0) > 0.1:  # Threshold set to > 0.1
+                flagged_phrases.append(sentence)
+
+        # Adjust the headline_vs_content_score based on contradictions
+        contradiction_penalty = len(flagged_phrases) * 0.1  # Example penalty per contradiction
+        adjusted_score = max(0, scores.get('ENTAILMENT', 0) - contradiction_penalty)
+
         logger.info("\nSection Analysis:")
         logger.info("-"*30)
         logger.info(f"Section preview: {section[:100]}...")
         for label, score in scores.items():
             logger.info(f"Label: {label:<12} Score: {score:.3f}")
 
-        return scores
+        return {"scores": scores, "flagged_phrases": flagged_phrases, "adjusted_score": adjusted_score}
 
     def analyze(self, headline: str, content: str) -> Dict[str, Any]:
         """Analyze how well the headline matches the content using an AI model."""
@@ -146,7 +164,7 @@ class HeadlineAnalyzer:
                 "headline_vs_content_score": round(final_score, 1),
                 "entailment_score": round(entailment_score, 2),
                 "contradiction_score": round(contradiction_score, 2),
-                "contradictory_phrases": []
+                "contradictory_phrases": scores.get('flagged_phrases', [])
             }
 
         except Exception as e:
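
One side effect of the new _analyze_section: nltk.download('punkt') runs on every call. A common alternative (a sketch, not part of this commit) is to check for the tokenizer data once, e.g. at import or in __init__:

    import nltk

    # Fetch the punkt tokenizer data only if it is missing; no-op otherwise.
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)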
mediaunmasked/analyzers/scoring.py CHANGED
@@ -75,10 +75,25 @@ class MediaScorer:
             "media_unmasked_score": round(final_score, 1),
             "rating": rating,
             "details": {
-                "headline_analysis": headline_analysis,
-                "sentiment_analysis": sentiment_analysis,
-                "bias_analysis": bias_analysis,
-                "evidence_analysis": evidence_analysis
+                "headline_analysis": {
+                    "headline_vs_content_score": headline_analysis["headline_vs_content_score"],
+                    "flagged_phrases": headline_analysis.get("flagged_phrases", [])
+                },
+                "sentiment_analysis": {
+                    "sentiment": sentiment_analysis["sentiment"],
+                    "manipulation_score": sentiment_analysis["manipulation_score"],
+                    "flagged_phrases": sentiment_analysis.get("flagged_phrases", [])
+                },
+                "bias_analysis": {
+                    "bias": bias_analysis["bias"],
+                    "bias_score": bias_analysis["bias_score"],
+                    "bias_percentage": bias_analysis["bias_percentage"],
+                    "flagged_phrases": bias_analysis.get("flagged_phrases", [])
+                },
+                "evidence_analysis": {
+                    "evidence_based_score": evidence_analysis["evidence_based_score"],
+                    "flagged_phrases": evidence_analysis.get("flagged_phrases", [])
+                }
             }
         }
 
@@ -93,9 +108,9 @@ class MediaScorer:
             "media_unmasked_score": 0,
             "rating": "Error",
             "details": {
-                "headline_analysis": {"headline_vs_content_score": 0, "contradictory_phrases": []},
+                "headline_analysis": {"headline_vs_content_score": 0, "flagged_phrases": []},
                 "sentiment_analysis": {"sentiment": "Error", "manipulation_score": 0, "flagged_phrases": []},
-                "bias_analysis": {"bias": "Error", "bias_score": 0.0, "bias_percentage": 0},
-                "evidence_analysis": {"evidence_based_score": 0}
+                "bias_analysis": {"bias": "Error", "bias_score": 0.0, "bias_percentage": 0, "flagged_phrases": []},
+                "evidence_analysis": {"evidence_based_score": 0, "flagged_phrases": []}
             }
         }
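
The scorer now whitelists exactly which keys each analyzer contributes instead of passing the raw dicts through, so every section of details exposes a flagged_phrases list even on the error path. A small shape check with stand-in values mirroring the diff:

    # Stand-in payload with the post-commit shape of media_score["details"].
    details = {
        "headline_analysis": {"headline_vs_content_score": 20.0, "flagged_phrases": []},
        "sentiment_analysis": {"sentiment": "Neutral", "manipulation_score": 30,
                               "flagged_phrases": []},
        "bias_analysis": {"bias": "Neutral", "bias_score": 0.0, "bias_percentage": 0,
                          "flagged_phrases": []},
        "evidence_analysis": {"evidence_based_score": 80, "flagged_phrases": []},
    }

    # Invariant established by this commit: every section carries flagged_phrases.
    assert all("flagged_phrases" in section for section in details.values())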
mediaunmasked/scrapers/article_scraper.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Dict, Optional, List
+from typing import Dict, Optional
 import logging
 from urllib.parse import urlparse
 import requests
@@ -25,17 +25,15 @@ class ArticleScraper:
             response = self.session.get(url)
             response.raise_for_status()
             return response.text
-
         except Exception as e:
             self.logger.error(f"Error fetching {url}: {str(e)}")
             return None
 
     def _process_element(self, element) -> str:
-        """Process an HTML element while preserving its structure and formatting."""
+        """Process an HTML element while preserving structure and formatting."""
         if isinstance(element, NavigableString):
             return str(element)
-
-        # Handle different types of elements
+
         tag_name = element.name
 
         if tag_name in ['p', 'div']:
@@ -64,90 +62,63 @@ class ArticleScraper:
 
         elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
             level = int(tag_name[1])
-            prefix = '#' * (level + 1)  # Add one more # to match test expectations
+            prefix = '#' * (level + 1)  # Add one more # for clarity
             return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'
 
-        # For other elements, just process their children
         return ''.join(self._process_element(child) for child in element.children)
 
     def _extract_content(self, container) -> str:
         """Extract and format content from a container element."""
         if not container:
             return ''
-
-        # Remove unwanted elements
+
         for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
             unwanted.decompose()
-
-        # Process the container
+
         content = self._process_element(container)
 
-        # Clean up extra whitespace and newlines
         content = '\n'.join(line.strip() for line in content.split('\n'))
         content = '\n'.join(filter(None, content.split('\n')))
 
         return content.strip()
 
-    def _extract_article(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
-        """Extract content from any article, with special handling for known domains."""
+    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
+        """Extract content from PolitiFact articles."""
         try:
-            # Find headline - try domain-specific selectors first, then fallback to generic
-            headline = None
-            headline_selectors = {
-                'politifact.com': ['h1.article__title'],
-                'snopes.com': ['header h1', 'article h1']
-            }
-
-            # Try domain-specific headline selectors
-            if domain in headline_selectors:
-                for selector in headline_selectors[domain]:
-                    headline = soup.select_one(selector)
-                    if headline:
-                        break
-
-            # Fallback to any h1 if no domain-specific headline found
-            if not headline:
-                headline = soup.find('h1')
-
-            headline_text = headline.get_text().strip() if headline else "No headline found"
-            self.logger.info(f"Found headline: {headline_text}")
-
-            # Find content - try domain-specific selectors first, then fallback to generic
-            content_div = None
-            content_selectors = {
-                'politifact.com': ['article.article', '.article__text', '.m-textblock'],
-                'snopes.com': ['article']
-            }
-
-            # Try domain-specific content selectors
-            if domain in content_selectors:
-                for selector in content_selectors[domain]:
-                    content_div = soup.select_one(selector)
-                    if content_div:
-                        break
-
-            # Fallback to generic content selectors
-            if not content_div:
-                for selector in ['article', 'main', '.content', '.article-content']:
-                    content_div = soup.select_one(selector)
-                    if content_div:
-                        break
-
-            content = self._extract_content(content_div) if content_div else "No content found"
-
-            if not content:
-                self.logger.warning("No content found in article")
-                self.logger.debug(f"Domain: {domain}")
-
-            return {"headline": headline_text, "content": content}
+            headline = soup.find('h1', class_='article__title') or soup.find('h1')
+            headline = headline.get_text(strip=True) if headline else "No headline found"
+
+            self.logger.info(f"Found headline: {headline}")
+
+            content_div = soup.find('article', class_='article') or soup.select_one('.article__text, .m-textblock')
+            content = self._extract_content(content_div) if content_div else "No content found"
+
+            return {"headline": headline, "content": content}
 
         except Exception as e:
-            self.logger.error(f"Error extracting article content: {str(e)}")
+            self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
             return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}
 
+    def _extract_generic(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
+        """Fallback extraction method for unknown domains."""
+        headline = soup.find('h1')
+        headline_text = headline.get_text().strip() if headline else "No headline found"
+
+        content_div = None
+        common_selectors = ['article', 'main', '.content', '.article-content']
+
+        for selector in common_selectors:
+            content_div = soup.select_one(selector)
+            if content_div:
+                break
+
+        content = self._extract_content(content_div) if content_div else "No content found"
+
+        return {"headline": headline_text, "content": content}
+
     def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
         """
-        Main function to scrape fact-checking articles.
+        Main function to scrape articles while maintaining structure.
         Returns a dictionary with headline and content.
         """
         html_content = self._fetch_page(url)
@@ -159,4 +130,8 @@ class ArticleScraper:
         domain = self._get_domain(url)
 
         self.logger.info(f"Scraping article from domain: {domain}")
-        return self._extract_article(soup, domain)
+
+        if 'politifact.com' in domain:
+            return self._extract_politifact(soup)
+
+        return self._extract_generic(soup, domain)
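
A usage sketch of the reworked scraper; the URL is a placeholder, and the no-argument constructor is assumed from the methods shown:

    from mediaunmasked.scrapers.article_scraper import ArticleScraper

    scraper = ArticleScraper()
    article = scraper.scrape_article("https://www.politifact.com/factchecks/example/")

    if article:
        print(article["headline"])
        # Headings and paragraphs survive as markdown-style text.
        print(article["content"][:300])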