File size: 4,208 Bytes
50e7434
 
 
 
 
 
 
 
f1f0d5a
 
50e7434
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b219ba8
 
 
 
 
 
50e7434
4a58058
 
 
50e7434
0ae976a
 
 
50e7434
 
 
 
 
 
 
 
 
0ae976a
 
 
 
 
 
 
 
 
 
50e7434
f1f0d5a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import gradio as gr
import json
import tempfile
import os
import difflib

def compare_texts(correct_file: bytes, wrong_file: bytes) -> tuple:
    """Diff two UTF-8 transcripts and mark the segments where they differ.

    The uncorrected text is aligned against the corrected text with
    ``difflib.SequenceMatcher``. Matching segments are copied through
    unchanged; every non-matching segment is wrapped in square brackets in
    both outputs.

    Args:
        correct_file: Raw bytes of the corrected transcript (UTF-8).
        wrong_file: Raw bytes of the uncorrected transcript (UTF-8).

    Returns:
        A 3-tuple ``(correct_marked, wrong_marked, wrong_ids)`` where the
        first two items are the texts with differing segments bracketed and
        ``wrong_ids`` is a comma-separated string of the character indices
        (into the uncorrected text) covered by non-matching segments. It is
        an empty string when no character of the uncorrected text differs.

    Raises:
        UnicodeDecodeError: If either input is not valid UTF-8.
    """
    correct_text = correct_file.decode('utf-8')
    wrong_text = wrong_file.decode('utf-8')

    # Sequence "a" is the uncorrected text, so (i1, i2) index into it and
    # (j1, j2) index into the corrected text.
    matcher = difflib.SequenceMatcher(None, wrong_text, correct_text)
    correct_parts = []
    wrong_parts = []
    wrong_ids = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        correct_segment = correct_text[j1:j2]
        wrong_segment = wrong_text[i1:i2]
        if tag == "equal":
            correct_parts.append(correct_segment)
            wrong_parts.append(wrong_segment)
        else:
            # replace / delete / insert: bracket both sides. For a pure
            # insertion i1 == i2, so no index is recorded and the
            # uncorrected side shows an empty "[]" marker.
            wrong_ids.extend(range(i1, i2))
            correct_parts.append(f"[{correct_segment}]")
            wrong_parts.append(f"[{wrong_segment}]")

    wrong_ids_str = ",".join(map(str, wrong_ids))

    return "".join(correct_parts), "".join(wrong_parts), wrong_ids_str

def generate_json(file_id, correct_text, wrong_text, wrong_ids):
    """Build the JSON record for one transcript pair and save it to a file.

    Args:
        file_id: Identifier stored under the ``"id"`` key.
        correct_text: Corrected text, stored under ``"correct_text"``.
        wrong_text: Uncorrected text, stored under ``"original_text"``.
        wrong_ids: Comma-separated string of integer positions. An empty
            (or blank) string yields an empty list instead of raising.

    Returns:
        A 2-tuple ``(json_data, path)``: the JSON document as a string (a
        one-element list holding the record) and the path of the temporary
        ``.json`` file it was written to. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        ValueError: If a non-blank token in ``wrong_ids`` is not an integer.
    """
    # "".split(',') returns [''], and int('') raises ValueError; skip blank
    # tokens so an empty id string (no differing positions) is accepted.
    wrong_ids_list = [
        int(token) for token in (wrong_ids or "").split(',') if token.strip()
    ]
    data = {
        "id": file_id,
        "original_text": wrong_text,
        "wrong_ids": wrong_ids_list,
        "correct_text": correct_text
    }
    json_data = json.dumps([data], ensure_ascii=False, indent=4)

    # delete=False keeps the file on disk after closing so its path can be
    # handed back to the caller; the context manager guarantees the handle
    # is closed even if the write fails.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".json", mode='w', encoding='utf-8'
    ) as temp_file:
        temp_file.write(json_data)

    return json_data, temp_file.name

# Static HTML fragments rendered at the top of the page via gr.HTML.
# Page heading.
TITLE = """<h1>逐字稿文本內容比對工具</h1>"""
# Centered author / date line linking to the author's site.
SUBTITLE = """<h2 align="center"><a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D. @ 2024/04 </a><br></h2>"""
# Two rows of links to blog posts, separated by " | " and ended with <br>.
LINKS = """<a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a> | <a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a> | <a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a> | <a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大模型開發會踩的坑</a> | <a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型,它是什麼?想要嗎?</a><br>
<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PaddleOCR的PPOCRLabel來微調醫療診斷書和收據</a> | <a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>"""


# Build the Gradio UI. Components are created in display order, so the
# statement order below defines the page layout.
with gr.Blocks() as demo:
    # Static header: title, author line, and related links.
    gr.HTML(TITLE)
    gr.HTML(SUBTITLE)
    gr.HTML(LINKS)
    # Free-text identifier stored under "id" in the generated JSON.
    file_id_input = gr.Textbox(label="請輸入文件ID")
    # Use the "binary" file type so uploaded files are passed to the callback in binary form
    correct_file = gr.File(label="上傳校正過的逐字稿文本文件", type="binary")
    wrong_file = gr.File(label="上傳未校正的ASR辨識文本文件", type="binary")
    compare_button = gr.Button("比較文本")
    # Outputs of compare_texts: the two bracket-marked texts and the
    # comma-separated positions string.
    correct_text_output = gr.TextArea(label="校正過的逐字稿文本內容")
    wrong_text_output = gr.TextArea(label="未校正的ASR辨識文本內容")
    wrong_ids_output = gr.Textbox(label="錯誤的文字位置")
    
    generate_button = gr.Button("生成 JSON 文件")
    # Outputs of generate_json: the JSON string and the saved file's path.
    json_output = gr.Text(label="JSON 輸出")
    json_download_link = gr.File(label="下載 JSON 文件")
    
    # Step 1: diff the two uploaded files and fill the three comparison fields.
    compare_button.click(
        compare_texts, 
        inputs=[correct_file, wrong_file], 
        outputs=[correct_text_output, wrong_text_output, wrong_ids_output]
    )
    # Step 2: build the JSON from the file ID and the current contents of the
    # comparison fields.
    # NOTE(review): the text areas hold the bracket-marked text produced by
    # compare_texts, so the JSON stores the marked text while the positions
    # were computed on the unmarked text — confirm this is intended.
    generate_button.click(
        generate_json, 
        inputs=[file_id_input, correct_text_output, wrong_text_output, wrong_ids_output], 
        outputs=[json_output, json_download_link]
    )

# Start the Gradio server; this runs whenever the module is executed or imported.
demo.launch()