Spaces:

DeepLearning101
/

TEXT-CORRECT-ANNOTATION

Running

File size: 7,773 Bytes

import gradio as gr
import json
import tempfile
import os
import difflib

def compare_texts(correct_file, wrong_file):
    """Diff two UTF-8 texts and bracket the segments where they disagree.

    The uncorrected text is compared against the corrected text with
    ``difflib.SequenceMatcher``. Segments that are equal are copied through
    unchanged; every other segment is wrapped in ``[...]`` in both outputs
    (an insertion or deletion therefore shows up as ``[]`` on the side that
    has no characters for it).

    Args:
        correct_file: UTF-8 encoded bytes of the corrected text.
        wrong_file: UTF-8 encoded bytes of the uncorrected text.

    Returns:
        A ``(correct_marked, wrong_marked, wrong_ids)`` tuple of strings.
        ``wrong_ids`` is a comma-separated list of character indexes into the
        decoded (unbracketed) uncorrected text that fall inside a differing
        segment; it is ``""`` when there are none.

    Raises:
        ValueError: If either argument is ``None`` (no file supplied).
        UnicodeDecodeError: If either input is not valid UTF-8.
    """
    if correct_file is None or wrong_file is None:
        # Fail with a clear message instead of an AttributeError on .decode.
        raise ValueError(
            "Both the corrected and the uncorrected file must be provided."
        )

    correct_text = correct_file.decode("utf-8")
    wrong_text = wrong_file.decode("utf-8")

    # "a" side (i1:i2) is the uncorrected text, "b" side (j1:j2) the corrected.
    matcher = difflib.SequenceMatcher(None, wrong_text, correct_text)
    correct_parts = []
    wrong_parts = []
    wrong_ids = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        correct_segment = correct_text[j1:j2]
        wrong_segment = wrong_text[i1:i2]
        if tag == "equal":
            correct_parts.append(correct_segment)
            wrong_parts.append(wrong_segment)
        else:
            # Record the positions in the uncorrected text and mark both sides.
            wrong_ids.extend(range(i1, i2))
            correct_parts.append(f"[{correct_segment}]")
            wrong_parts.append(f"[{wrong_segment}]")

    wrong_ids_str = ",".join(map(str, wrong_ids))

    return "".join(correct_parts), "".join(wrong_parts), wrong_ids_str

def generate_json(file_id, correct_text, wrong_text, wrong_ids):
    """Build the annotation JSON for one text pair and save it to a temp file.

    Args:
        file_id: Value stored under the ``"id"`` key.
        correct_text: Value stored under the ``"correct_text"`` key.
        wrong_text: Value stored under the ``"original_text"`` key.
        wrong_ids: Comma-separated integers, e.g. ``"3,4,10"``. Blank items
            are ignored, so an empty string (or ``None``) gives an empty list.

    Returns:
        A ``(json_data, path)`` tuple: the JSON string (a one-element list
        holding the record, non-ASCII kept as-is, 4-space indent) and the path
        of the ``.json`` temporary file it was written to. The file is created
        with ``delete=False`` and is not removed by this function.

    Raises:
        ValueError: If a non-blank item of ``wrong_ids`` is not an integer.
    """
    # "".split(",") yields [""] and int("") raises, so skip blank items; this
    # is the case when the two compared texts were identical.
    wrong_ids_list = [
        int(item) for item in (wrong_ids or "").split(",") if item.strip()
    ]
    data = {
        "id": file_id,
        "original_text": wrong_text,
        "wrong_ids": wrong_ids_list,
        "correct_text": correct_text,
    }
    json_data = json.dumps([data], ensure_ascii=False, indent=4)

    # Persist the JSON so it can be offered as a download; the context manager
    # closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".json", mode="w", encoding="utf-8"
    ) as temp_file:
        temp_file.write(json_data)

    return json_data, temp_file.name

# Static HTML fragments for the top of the page. Each one is passed verbatim
# to a gr.HTML component when the UI is built further down in this file.

# Page heading.
TITLE = """<h1>逐字稿文本內容比對工具</h1>"""
# One-line row of author / project links shown under the heading.
SUBTITLE = """<h2><a href='https://deep-learning-101.github.io' target='_blank'>deep-learning-101.github.io</a> | <a href='https://www.twman.org/AI' target='_blank'> AI </a> | <a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> | <a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a><br></h2>"""
# Longer list of related articles and community links, one entry per line.
# NOTE(review): several entries contain a closing </b> with no matching <b>;
# this looks like leftover markup — confirm before cleaning up.
LINKS = """
<a href='https://github.com/Deep-Learning-101' target='_blank'>Deep Learning 101 Github</a> | <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> | <a href='https://www.facebook.com/groups/525579498272187/' target='_blank'>台灣人工智慧社團 FB</a> | <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a><br>
<a href='https://blog.twman.org/2025/04/AI-Robot.html' target='_blank'>AI 陪伴機器人：2025 趨勢分析技術突破、市場潛力與未來展望</a> | <a href='https://blog.twman.org/2025/04/FinanceGenAI.html' target='_blank'>金融科技新浪潮：生成式 AI (GenAI) 應用場景、效益與導入挑戰</a><br>
<a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>避開 AI Agent 開發陷阱：常見問題、挑戰與解決方案 (實戰經驗)</a>：<a href="https://deep-learning-101.github.io/agent" target="_blank">探討多種 AI 代理人工具的應用經驗與挑戰，分享實用經驗與工具推薦。</a><br>
<a href="https://blog.twman.org/2024/08/LLM.html" target="_blank">白話文手把手帶你科普 GenAI</a></b>：<a href="https://deep-learning-101.github.io/GenAI" target="_blank">淺顯介紹生成式人工智慧核心概念，強調硬體資源和數據的重要性。</a><br>
<a href="https://blog.twman.org/2024/09/LLM.html" target="_blank">大型語言模型直接就打完收工？</a></b>：<a href="https://deep-learning-101.github.io/1010LLM" target="_blank">回顧 LLM 領域探索歷程，討論硬體升級對 AI 開發的重要性。</a><br>
<a href="https://blog.twman.org/2024/07/RAG.html" target="_blank">檢索增強生成(RAG)不是萬靈丹之優化挑戰技巧</a></b>：<a href="https://deep-learning-101.github.io/RAG" target="_blank">探討 RAG 技術應用與挑戰，提供實用經驗分享和工具建議。</a><br>
<a href="https://blog.twman.org/2024/02/LLM.html" target="_blank">大型語言模型 (LLM) 入門完整指南：原理、應用與未來</a></b>：<a href="https://deep-learning-101.github.io/0204LLM" target="_blank">探討多種 LLM 工具的應用與挑戰，強調硬體資源的重要性。</a><br>
<a href="https://blog.twman.org/2023/04/GPT.html" target="_blank">解析探索大型語言模型：模型發展歷史、訓練及微調技術的 VRAM 估算</a></b>：<a href="https://deep-learning-101.github.io/GPU" target="_blank">探討 LLM 的發展與應用，強調硬體資源在開發中的關鍵作用。</a><br>
<a href="https://blog.twman.org/2024/11/diffusion.html" target="_blank">Diffusion Model 完全解析：從原理、應用到實作 (AI 圖像生成)</a></b>；<a href="https://deep-learning-101.github.io/diffusion" target="_blank">深入探討影像生成與分割技術的應用，強調硬體資源的重要性。</a><br>
<a href="https://blog.twman.org/2024/02/asr-tts.html" target="_blank">ASR/TTS 開發避坑指南：語音辨識與合成的常見挑戰與對策</a></b>：<a href="https://deep-learning-101.github.io/asr-tts" target="_blank">探討 ASR 和 TTS 技術應用中的問題，強調數據質量的重要性。</a><br>
<a href="https://blog.twman.org/2021/04/NLP.html" target="_blank">那些 NLP 踩的坑</a></b>：<a href="https://deep-learning-101.github.io/nlp" target="_blank">分享 NLP 領域的實踐經驗，強調數據質量對模型效果的影響。</a><br>
<a href="https://blog.twman.org/2021/04/ASR.html" target="_blank">那些語音處理踩的坑</a></b>：<a href="https://deep-learning-101.github.io/speech" target="_blank">分享語音處理領域的實務經驗，強調資料品質對模型效果的影響。</a><br>
<a href="https://blog.twman.org/2020/05/DeepLearning.html" target="_blank">手把手學深度學習安裝環境</a></b>：<a href="https://deep-learning-101.github.io/101" target="_blank">詳細介紹在 Ubuntu 上安裝深度學習環境的步驟，分享實際操作經驗。</a><br>
<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a><br>
<a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
"""

# Build the Gradio UI. Components are declared in the order they should appear
# on the page: header HTML, inputs, comparison outputs, then JSON outputs.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.HTML(SUBTITLE)
    gr.HTML(LINKS)
    file_id_input = gr.Textbox(label="請輸入文件ID")
    # Use the binary file type so each uploaded file is handed to the callback
    # as raw bytes (compare_texts decodes them as UTF-8).
    correct_file = gr.File(label="上傳校正過的逐字稿文本文件", type="binary")
    wrong_file = gr.File(label="上傳未校正的ASR辨識文本文件", type="binary")
    compare_button = gr.Button("比較文本")
    # Outputs of compare_texts; they also serve as inputs to generate_json.
    correct_text_output = gr.TextArea(label="校正過的逐字稿文本內容")
    wrong_text_output = gr.TextArea(label="未校正的ASR辨識文本內容")
    wrong_ids_output = gr.Textbox(label="錯誤的文字位置")
    
    generate_button = gr.Button("生成 JSON 文件")
    # Outputs of generate_json: the JSON text and the path of the saved file.
    json_output = gr.Text(label="JSON 輸出")
    json_download_link = gr.File(label="下載 JSON 文件")
    
    # Step 1: diff the two uploaded files and fill the three comparison fields.
    compare_button.click(
        compare_texts, 
        inputs=[correct_file, wrong_file], 
        outputs=[correct_text_output, wrong_text_output, wrong_ids_output]
    )
    # Step 2: turn the file ID plus the current contents of the three
    # comparison fields into a JSON record and a downloadable file.
    generate_button.click(
        generate_json, 
        inputs=[file_id_input, correct_text_output, wrong_text_output, wrong_ids_output], 
        outputs=[json_output, json_download_link]
    )

# Start the app when this module is executed (runs at import time as well).
demo.launch()