Spaces:
Sleeping
Sleeping
import difflib
import html
from difflib import SequenceMatcher

from src.application.highlight_text import generate_color
from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.preprocessing import split_into_sentences
from src.application.text.search_detection import check_human, detect_text_by_relative_search
class NewsVerification:
    """Collect per-sentence and per-image origin predictions for one news item.

    Typical use: ``load_news(...)``, then ``generate_analysis_report()``,
    then ``analyze_details()`` to obtain an HTML comparison table.

    The ``text_*``, ``aligned_sentences`` and ``is_paraphrased`` lists are
    index-aligned: entry ``i`` of each describes the ``i``-th input sentence.
    """

    def __init__(self):
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""

        # Per-sentence text results (index-aligned with each other).
        self.text_prediction_label: list[str] = []
        self.text_prediction_score: list[float] = []
        self.text_referent_url: list[str] = []

        # Single image result. detect_image_origin() always stores scalars
        # here, so the defaults are scalars too (the "no image" values).
        self.image_prediction_label: str = "UNKNOWN"
        self.image_prediction_score: float = 0.0
        self.image_referent_url: str | None = None

        self.news_prediction_label = ""
        self.news_prediction_score = -1

        # Image URLs collected while searching for the text; consumed by
        # detect_image_origin().
        self.found_img_url: list[str] = []
        self.aligned_sentences: list[dict] = []
        self.is_paraphrased: list[bool] = []
        self.analyzed_table: list[list] = []

    def load_news(self, news_title, news_content, news_image):
        """Store the news item to analyse.

        ``news_text`` is the title and the content joined by a blank line.
        """
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """Classify every sentence of ``self.news_text``.

        Each sentence is first looked up with ``detect_text_by_relative_search``.
        When no paraphrase is found the sentence is scored by
        ``detect_text_by_ai_model``; otherwise ``check_human`` decides between
        "HUMAN" and "MACHINE" and the match similarity is used as the score.

        Results are stored on the instance (``text_prediction_label``,
        ``text_prediction_score``, ``text_referent_url``, ``aligned_sentences``,
        ``is_paraphrased``, ``found_img_url``); nothing is returned.
        """
        # The result lists are index-aligned per sentence, so a re-run must
        # start from empty lists instead of appending to stale results.
        self.text_prediction_label = []
        self.text_prediction_score = []
        self.text_referent_url = []
        self.aligned_sentences = []
        self.is_paraphrased = []
        self.found_img_url = []

        print("CHECK TEXT:")
        print("\tFrom search engine:")
        for sentence in split_into_sentences(self.news_text):
            paraphrase, text_url, aligned_sentence, img_urls = (
                detect_text_by_relative_search(sentence)
            )

            if not paraphrase:
                # No source found on the web: fall back to the AI detector.
                print("\tFrom AI model:")
                text_prediction_label, text_prediction_score = (
                    detect_text_by_ai_model(sentence)
                )
                if not aligned_sentence:
                    # Placeholder so the sentence still gets a table row.
                    aligned_sentence = {
                        "input_sentence": sentence,
                        "matched_sentence": "",
                        "similarity": text_prediction_score,
                        "is_paraphrase_sentence": False,
                        "url": "",
                    }
            else:
                self.found_img_url.extend(img_urls or [])
                text_prediction_score = aligned_sentence["similarity"]
                if check_human(aligned_sentence):
                    text_prediction_label = "HUMAN"
                else:
                    text_prediction_label = "MACHINE"

            print(f"\ttext_prediction_label: {text_prediction_label}\n")
            self.text_prediction_label.append(text_prediction_label)
            self.aligned_sentences.append(aligned_sentence)
            self.is_paraphrased.append(paraphrase)
            self.text_referent_url.append(text_url)
            self.text_prediction_score.append(text_prediction_score)

    def _set_image_result(self, label, score, url):
        """Store the image prediction triple on the instance."""
        self.image_prediction_label = label
        self.image_prediction_score = score
        self.image_referent_url = url

    def detect_image_origin(self):
        """Classify ``self.news_image`` and store the result on the instance.

        Tries, in order: matching against the image URLs found during the
        text search, a reverse image search, and finally the AI image
        detector. A match from either search is labelled "HUMAN". Without an
        image, or when every step fails, the label is "UNKNOWN".
        """
        print("CHECK IMAGE:")
        image = self.news_image
        # None and the empty-string default both mean "no image supplied".
        if image is None or (isinstance(image, str) and not image):
            self._set_image_result("UNKNOWN", 0.0, None)
            return

        print(f"\t: Img path: {image}")

        matched_url, similarity = detect_image_from_news_image(
            image,
            self.found_img_url,
        )
        if matched_url is None:
            # Only pay for the reverse search when the cheap match failed.
            matched_url, similarity = detect_image_by_reverse_search(image)

        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self._set_image_result("HUMAN", similarity, matched_url)
            return

        detected_label, score = detect_image_by_ai_model(image)
        if detected_label:
            self._set_image_result(detected_label, score, None)
            return

        self._set_image_result("UNKNOWN", 50, None)

    @staticmethod
    def _human_likelihood(label, score, scale=1):
        """Map a (label, score) pair to a 0-100 "written by a human" value.

        "UNKNOWN" maps to 50, "MACHINE" to ``100 - score * scale`` and any
        other label to ``score * scale``.
        """
        if label == "UNKNOWN":
            return 50
        percent = score * scale
        if label == "MACHINE":
            return 100 - percent
        return percent

    def determine_news_origin(self):
        """Combine text and image predictions into one news-level verdict.

        The text value is the mean of the per-sentence human-likelihoods
        (50 when there are no sentences); it is averaged with the image
        value. The result is stored in ``news_prediction_label`` ("HUMAN"
        when the mean is above 50, else "MACHINE") and
        ``news_prediction_score`` (the confidence in that label).
        """
        labels = self.text_prediction_label
        scores = self.text_prediction_score

        if isinstance(labels, str):
            # Scalar label/score pair: used as-is.
            text_score = self._human_likelihood(labels, scores)
        else:
            # NOTE(review): per-sentence scores are scaled by 100 because
            # format_text_row renders them as ``score * 100`` percent, while
            # the image score is rendered as a percentage directly - confirm
            # the value ranges returned by the detection helpers.
            per_sentence = [
                self._human_likelihood(label, score, scale=100)
                for label, score in zip(labels, scores)
            ]
            if per_sentence:
                text_score = sum(per_sentence) / len(per_sentence)
            else:
                text_score = 50

        image_score = self._human_likelihood(
            self.image_prediction_label,
            self.image_prediction_score,
        )

        news_prediction_score = (text_score + image_score) / 2
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"

    def generate_analysis_report(self):
        """Run the text analysis, then the image analysis.

        The order matters: the image step reads ``found_img_url``, which the
        text step fills.
        """
        self.determine_text_origin()
        self.detect_image_origin()

    def analyze_details(self):
        """Build the word-level comparison table for all aligned sentences.

        Returns:
            The HTML table as a string, or "" when there is nothing to show.
        """
        self.analyzed_table = []
        # Position of each table row in aligned_sentences. Pairs without an
        # "input_sentence" are skipped, so row number and sentence index can
        # differ; the labels, scores and URLs are looked up by sentence index.
        row_indexes = []

        for index, pair in enumerate(self.aligned_sentences):
            if not pair or "input_sentence" not in pair:
                continue
            self.analyzed_table.append(
                self.highlight_overlap_by_word_to_list(
                    pair["input_sentence"],
                    pair["matched_sentence"],
                ),
            )
            row_indexes.append(index)

        if not self.analyzed_table:
            return ""
        return self.create_table(row_indexes)

    def highlight_overlap_by_word_to_list(self, text1, text2):
        """Find the words shared by two texts.

        Both texts are split on whitespace and compared word by word.

        Returns:
            A 4-tuple:
            - list of words in text1
            - list of words in text2
            - indexes of the matching words in text1
            - indexes of the matching words in text2
        """
        words1 = text1.split()
        words2 = text2.split()
        index1 = []
        index2 = []

        # autojunk=False: keep frequent words as match candidates even when a
        # text is 200+ words long.
        matcher = SequenceMatcher(None, words1, words2, autojunk=False)
        for start1, start2, length in matcher.get_matching_blocks():
            # The final sentinel block has length 0 and adds nothing.
            index1.extend(range(start1, start1 + length))
            index2.extend(range(start2, start2 + length))

        return words1, words2, index1, index2

    def get_text_urls(self):
        """Return the distinct reference URLs found for the text."""
        return set(self.text_referent_url)

    def generate_colors_list(self, set_urls):
        """Return one colour per URL, keyed by the integers 0..len(set_urls)-1."""
        num_urls = len(set_urls)
        return {i: generate_color(i, num_urls) for i in range(num_urls)}

    def analyze_details_2(self):
        """Character-level comparison of every aligned sentence (work in progress).

        Computes the common phrases of each sentence pair with a colour chosen
        per reference URL. The phrases are not rendered yet, so the method
        currently returns ``("", "")``.

        Returns:
            Tuple ``(html_text, html_table)``.
        """
        html_text = ""
        self.analyzed_table = []

        set_urls = self.get_text_urls()
        color_dict = self.generate_colors_list(set_urls)
        # generate_colors_list keys colours by integer slot; map URL -> slot.
        url_slots = {url: slot for slot, url in enumerate(set_urls)}

        # Offset of the current sentence inside the whole input text.
        position = 0
        for index, pair in enumerate(self.aligned_sentences):
            if not pair or "input_sentence" not in pair:
                continue
            if index < len(self.text_referent_url):
                url = self.text_referent_url[index]
            else:
                url = None
            color = color_dict.get(url_slots.get(url, 0))
            # TODO: render the common phrases into html_text.
            _common_phrases, position = self.compare_sentences(
                pair["input_sentence"],
                pair["matched_sentence"],
                position,
                color,
            )

        html_table = self.create_table() if self.analyzed_table else ""
        return html_text, html_table

    def compare_sentences(self, sentence_1, sentence_2, position, color):
        """Find the common character runs of two sentences.

        Args:
            sentence_1: The first sentence (string).
            sentence_2: The second sentence (string).
            position: Offset of sentence_1 inside the whole input text; added
                to the reported ``start_1`` / ``end_1``.
            color: Value stored under "color" in every returned phrase.

        Returns:
            Tuple ``(common_phrases, next_position)``. ``common_phrases`` is a
            list of dicts with the keys "phrase", "start_1", "end_1",
            "start_2", "end_2" and "color"; it is empty when either sentence
            is empty. ``next_position`` is ``position`` advanced by the length
            of sentence_1.
        """
        next_position = position + len(sentence_1 or "")
        if not sentence_1 or not sentence_2:
            # Same tuple shape as the normal path, so callers can always unpack.
            return [], next_position

        # autojunk=False: sentences of 200+ characters would otherwise have
        # their frequent characters ignored by the matcher.
        matcher = SequenceMatcher(None, sentence_1, sentence_2, autojunk=False)
        common_phrases = [
            {
                "phrase": sentence_1[block.a:block.a + block.size],
                "start_1": block.a + position,
                "end_1": block.a + block.size + position,
                "start_2": block.b,
                "end_2": block.b + block.size,
                "color": color,
            }
            for block in matcher.get_matching_blocks()
            if block.size > 0  # ignore the zero-length sentinel
        ]
        return common_phrases, next_position

    def create_table(self, row_indexes=None, max_length=30):
        """Render the image row plus one row per ``analyzed_table`` entry.

        Args:
            row_indexes: For each entry of ``analyzed_table``, the index of
                its sentence in ``aligned_sentences``. Defaults to
                0..len(analyzed_table)-1.
            max_length: Maximum number of URL characters shown as link text.

        Returns:
            The HTML markup as a string.
        """
        if row_indexes is None:
            row_indexes = range(len(self.analyzed_table))

        rows = [self.format_image_row(max_length)]
        rows.extend(
            self.format_text_row(row, index, max_length)
            for index, row in zip(row_indexes, self.analyzed_table)
        )
        table = "\n".join(rows)

        return f"""
        <h5>Comparison between input news and source news</h5>
        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
            <thead>
                <tr>
                    <th>Input news</th>
                    <th>Source content</th>
                    <th>Forensic</th>
                    <th>Originality</th>
                </tr>
            </thead>
            <tbody>
                {table}
            </tbody>
        </table>
        """

    def _format_link(self, url, max_length):
        """Return an escaped ``<a>`` element for url, or "" when there is none."""
        if not url:
            return ""
        url = str(url)
        href = html.escape(url, quote=True)
        # Shorten first, then escape, so an entity is never cut in half.
        text = html.escape(self.shorten_url(url, max_length))
        return f"""<a href="{href}">{text}</a>"""

    def format_text_row(self, row, index=0, max_length=30):
        """Render one sentence comparison as a ``<tr>``.

        Args:
            row: Tuple ``(input_words, source_words, input_indexes,
                source_indexes)`` as produced by
                ``highlight_overlap_by_word_to_list``.
            index: Index of the sentence in ``aligned_sentences`` and in the
                ``text_prediction_*`` lists.
            max_length: Maximum number of URL characters shown as link text.
        """
        input_sentence = self.highlight_text(row[0], row[2])
        source_sentence = self.highlight_text(row[1], row[3])

        url = self.aligned_sentences[index].get("url")
        source_text_url = self._format_link(url, max_length)

        label = self.text_prediction_label[index]
        score = self.text_prediction_score[index] * 100
        return f"""<tr><td>{input_sentence}</td><td>{source_sentence}</td><td>{label}<br>({score:.2f}%)</td><td>{source_text_url}</td></tr>"""

    def format_image_row(self, max_length=30):
        """Render the image prediction as a ``<tr>``.

        The source image and its link are left empty when no reference URL
        is known.
        """
        url = self.image_referent_url
        if url:
            src = html.escape(str(url), quote=True)
            source_image = f"""<img src="{src}" width="200" height="150">"""
        else:
            source_image = ""
        source_image_url = self._format_link(url, max_length)
        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""

    def shorten_url(self, url, max_length=30):
        """Return url cut to max_length characters plus "..." when it is longer.

        ``None`` yields "".
        """
        if url is None:
            return ""
        if len(url) > max_length:
            return url[:max_length] + "..."
        return url

    def highlight_text(self, words, indexes):
        """Join words into HTML, wrapping the words at ``indexes`` in a green span.

        Every word is HTML-escaped because the text comes from user input and
        web search results. ``words`` itself is left unmodified so the same
        row can be rendered more than once.
        """
        highlighted = set(indexes)
        rendered = []
        for position, word in enumerate(words):
            safe_word = html.escape(word)
            if position in highlighted:
                safe_word = (
                    f"<span style='color:#00FF00; font-weight:bold;'>{safe_word}</span>"
                )
            rendered.append(safe_word)
        return " ".join(rendered)