from difflib import SequenceMatcher import difflib import string from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image from src.application.text.entity import apply_highlight, highlight_entities from src.application.text.model_detection import detect_text_by_ai_model from src.application.text.preprocessing import split_into_sentences from src.application.text.search_detection import check_human, detect_text_by_relative_search class NewsVerification(): def __init__(self): self.news_text = "" self.news_title = "" self.news_content = "" self.news_image = "" self.text_prediction_label:list[str] = [] self.text_prediction_score:list[float] = [] self.text_referent_url:list[str] = [] self.image_prediction_label:list[str] = [] self.image_prediction_score:list[str] = [] self.image_referent_url:list[str] = [] self.news_prediction_label = "" self.news_prediction_score = -1 self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"] self.aligned_sentences:list[dict] = [] self.is_paraphrased:list[bool] = [] self.analyzed_table:list[list] = [] def load_news(self, news_title, news_content, news_image): self.news_text = news_title + "\n\n" + news_content self.news_title = news_title self.news_content = news_content self.news_image = news_image def determine_text_origin(self): """ Determines the origin of the given text based on paraphrasing detection and human authorship analysis. Args: text: The input text to be analyzed. Returns: str: The predicted origin of the text: - "HUMAN": If the text is likely written by a human. - "MACHINE": If the text is likely generated by a machine. """ print("CHECK TEXT:") print("\tFrom search engine:") # Classify by search engine input_sentences = split_into_sentences(self.news_text) current_index = 0 previous_paraphrase = None ai_sentence = { "input_sentence": "", "matched_sentence": "", "label": "", "similarity": None, "paraphrase": False, "url": "", } for index, sentence in enumerate(input_sentences): print(f"-------index = {index}-------") print(f"current_sentence = {input_sentences[index]}") if current_index >= len(input_sentences): break if current_index >= index and index != 0 and index != len(input_sentences) - 1: continue paraphrase, text_url, searched_sentences, img_urls, current_index = detect_text_by_relative_search(input_sentences, index) if paraphrase is False: # add sentence to ai_sentence if ai_sentence["input_sentence"] != "": ai_sentence["input_sentence"] += "
" ai_sentence["input_sentence"] += sentence if index == len(input_sentences) - 1: # add ai_sentences to align_sentences text_prediction_label, text_prediction_score = detect_text_by_ai_model(ai_sentence["input_sentence"]) ai_sentence["label"] = text_prediction_label ai_sentence["similarity"] = text_prediction_score self.aligned_sentences.append(ai_sentence) else: if previous_paraphrase is False or previous_paraphrase is None: # add ai_sentences to align_sentences if ai_sentence["input_sentence"] != "": text_prediction_label, text_prediction_score = detect_text_by_ai_model(ai_sentence["input_sentence"]) ai_sentence["label"] = text_prediction_label ai_sentence["similarity"] = text_prediction_score self.aligned_sentences.append(ai_sentence) # reset ai_sentence = { "input_sentence": "", "matched_sentence": "", "label": "", "similarity": None, "paraphrase": False, "url": "", } # add searched_sentences to align_sentences if searched_sentences["input_sentence"] != "": self.found_img_url.extend(img_urls) if check_human(searched_sentences): searched_sentences["label"] = "HUMAN" else: searched_sentences["label"] = "MACHINE" self.aligned_sentences.append(searched_sentences) previous_paraphrase = paraphrase def detect_image_origin(self): print("CHECK IMAGE:") if self.news_image is None: self.image_prediction_label = "UNKNOWN" self.image_prediction_score = 0.0 self.image_referent_url = None return for image in self.found_img_url: print(f"\tfound_img_url: {image}") matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url) if matched_url is not None: print(f"matching image: {matched_url}\nsimilarity: {similarity}\n") self.image_prediction_label = "HUMAN" self.image_prediction_score = similarity self.image_referent_url = matched_url return matched_url, similarity = detect_image_by_reverse_search(self.news_image) if matched_url is not None: print(f"matching image: {matched_url}\nsimilarity: {similarity}\n") self.image_prediction_label = "HUMAN" self.image_prediction_score = similarity self.image_referent_url = matched_url return detected_label, score = detect_image_by_ai_model(self.news_image) if detected_label: print(f"detected_label: {detected_label} ({score})") self.image_prediction_label = detected_label self.image_prediction_score = score self.image_referent_url = None return self.image_prediction_label = "UNKNOWN" self.image_prediction_score = 50 self.image_referent_url = None def determine_news_origin(self): if self.text_prediction_label == "MACHINE": text_prediction_score = 100 - self.text_prediction_score elif self.text_prediction_label == "UNKNOWN": text_prediction_score = 50 else: text_prediction_score = self.text_prediction_score if self.image_prediction_label == "MACHINE": image_prediction_score = 100 - self.image_prediction_score elif self.image_prediction_label == "UNKNOWN": image_prediction_score = 50 else: image_prediction_score = self.image_prediction_score news_prediction_score = (text_prediction_score + image_prediction_score) / 2 if news_prediction_score > 50: self.news_prediction_score = news_prediction_score self.news_prediction_label = "HUMAN" else: self.news_prediction_score = 100 - news_prediction_score self.news_prediction_label = "MACHINE" def generate_analysis_report(self): self.determine_text_origin() self.detect_image_origin() def analyze_details(self): self.analyzed_table = [] for aligned_sentence in self.aligned_sentences: if "input_sentence" not in aligned_sentence: continue # Get index of equal phrases in input and source sentences equal_idx_1, equal_idx_2 = self.extract_equal_text( aligned_sentence["input_sentence"], aligned_sentence["matched_sentence"], ) # Get entity-words (in pair) with colors entities_with_colors = highlight_entities( aligned_sentence["input_sentence"], aligned_sentence["matched_sentence"], ) self.analyzed_table.append( [ aligned_sentence["input_sentence"], aligned_sentence["matched_sentence"], equal_idx_1, equal_idx_2, entities_with_colors, ] ) if len(self.analyzed_table) != 0: html_table = self.create_table() else: html_table = "" return html_table def extract_equal_text(self, text1, text2): def cleanup(text): text = text.lower() text = text.translate(str.maketrans('', '', string.punctuation)) return text splited_text1 = cleanup(text1).split() splited_text2 = cleanup(text2).split() s = SequenceMatcher(None, splited_text1, splited_text2) equal_idx_1 = [] equal_idx_2 = [] text1 = text1.split() text2 = text2.split() for tag, i1, i2, j1, j2 in s.get_opcodes(): if tag == 'equal': equal_idx_1.append({"start": i1, "end": i2}) equal_idx_2.append({"start": j1, "end": j2}) # subtext_1 = " ".join(text1[i1:i2]) # subtext_2 = " ".join(text2[j1:j2]) # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}') return equal_idx_1, equal_idx_2 def get_text_urls(self): return set(self.text_referent_url) def compare_sentences(self, sentence_1, sentence_2, position, color): """ Compares two sentences and identifies common phrases, outputting their start and end positions. Args: sentence_1: The first sentence (string). sentence_2: The second sentence (string). Returns: A list of dictionaries, where each dictionary represents a common phrase and contains: - "phrase": The common phrase (string). - "start_1": The starting index of the phrase in sentence_1 (int). - "end_1": The ending index of the phrase in sentence_1 (int). - "start_2": The starting index of the phrase in sentence_2 (int). - "end_2": The ending index of the phrase in sentence_2 (int). Returns an empty list if no common phrases are found. Handles edge cases like empty strings. """ if not sentence_1 or not sentence_2: # Handle empty strings return [] s = difflib.SequenceMatcher(None, sentence_1, sentence_2) common_phrases = [] for block in s.get_matching_blocks(): if block.size > 0: # Ignore zero-length matches start_1 = block.a end_1 = block.a + block.size start_2 = block.b end_2 = block.b + block.size phrase = sentence_1[start_1:end_1] # Or sentence_2[start_2:end_2], they are the same common_phrases.append({ "phrase": phrase, "start_1": start_1 + position, "end_1": end_1 + position, "start_2": start_2, "end_2": end_2, "color": color, }) position += len(sentence_1) return common_phrases, position def create_table(self): #table_rows = "\n".join([self.format_row(row) for row in self.analyzed_table]) # loop of self.analyzed_table with index: rows = [] max_length = 30 # TODO: put this in configuration rows.append(self.format_image_row(max_length)) for index, row in enumerate(self.analyzed_table): formatted_row = self.format_text_row(row, index, max_length) rows.append(formatted_row) table = "\n".join(rows) return f"""

Comparison between input news and source news

{table}

Input news	Source (URL provided in Originality column correspondingly)	Forensic	Originality