Hyeonsieun committed on
Commit afedaa3 · verified · 1 Parent(s): d63114d

Create app.py

Files changed (1)
app.py +167 -0
app.py ADDED
@@ -0,0 +1,167 @@
import torch
import re

import gradio as gr
from transformers import pipeline
from transformers import T5ForConditionalGeneration, T5Tokenizer
from yt_dlp import YoutubeDL

# Local Whisper is only needed if the line below is re-enabled;
# ASR otherwise runs through the transformers pipeline further down.
# import whisper
# whisper_model = whisper.load_model('small')

path = "Hyeonsieun/NTtoGT_1epoch"
tokenizer = T5Tokenizer.from_pretrained(path)
model = T5ForConditionalGeneration.from_pretrained(path)

MODEL_NAME = "openai/whisper-large-v2"
BATCH_SIZE = 8
#FILE_LIMIT_MB = 1000

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
)


def transcribe(inputs):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
    return text


def remove_spaces_within_dollar(text):
    # Remove the spaces inside spans delimited by dollar signs.
    # The regex \$(.*?)\$ finds the shortest (non-greedy) substring that starts
    # and ends with '$'; passing a function as re.sub's repl argument applies
    # the replacement only inside each matched span.
    result = re.sub(r'\$(.*?)\$', lambda match: match.group(0).replace(' ', ''), text)
    return result


def audio_correction(file):
    ASR_result = transcribe(file)
    text_list = split_text_complex_rules_with_warning(ASR_result)
    whole_text = ''
    for text in text_list:
        input_text = f"translate the text pronouncing the formula to a LaTeX equation: {text}"
        inputs = tokenizer.encode(
            input_text,
            return_tensors='pt',
            max_length=325,
            padding='max_length',
            truncation=True
        )
        # Generate the corrected sentence ids with beam search
        # (num_beams=1 would mean greedy decoding instead).
        corrected_ids = model.generate(
            inputs,
            max_length=325,
            num_beams=5,
            early_stopping=True
        )
        # Decode, dropping special tokens such as <pad> and </s>.
        corrected_sentence = tokenizer.decode(
            corrected_ids[0],
            skip_special_tokens=True
        )
        whole_text += corrected_sentence + ' '

    return remove_spaces_within_dollar(whole_text).strip()


def youtubeASR(link):
    # Temporary file name for the audio track downloaded from YouTube
    out_fn = 'temp1.mp3'

    ydl_opts = {
        'format': 'bestaudio/best',  # download the audio only
        'outtmpl': out_fn,           # save it under the fixed file name
    }

    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([link])

    # Transcribe the downloaded audio file (out_fn)
    result = pipe(out_fn, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)
    script = result['text']  # keep only the transcript text
    return script


def split_text_complex_rules_with_warning(text):
    # Split into sentences on end punctuation (everything except commas)
    parts = re.split(r'(?<=[.?!])\s+', text)

    result = []
    warnings = []  # warning messages (collected for debugging; not returned)
    for part in parts:
        # Parts longer than 256 characters get split further on commas
        if len(part) > 256:
            subparts = re.split(r',\s*', part)
            for subpart in subparts:
                # Keep only non-empty pieces of at most 256 characters
                trimmed_subpart = subpart.strip()
                if trimmed_subpart and len(trimmed_subpart) <= 256:
                    result.append(trimmed_subpart)
                elif trimmed_subpart:
                    # Flag pieces that still exceed 256 characters
                    warnings.append(f"Sentence exceeds 256 characters: {trimmed_subpart[:50]}... (length: {len(trimmed_subpart)})")
        else:
            # Parts of 256 characters or fewer go straight into the result
            result.append(part.strip())

    return result


def youtube_correction(link):
    ASR_result = youtubeASR(link)
    text_list = split_text_complex_rules_with_warning(ASR_result)
    whole_text = ''
    for text in text_list:
        input_text = f"translate the text pronouncing the formula to a LaTeX equation: {text}"
        inputs = tokenizer.encode(
            input_text,
            return_tensors='pt',
            max_length=325,
            padding='max_length',
            truncation=True
        )
        # Generate the corrected sentence ids with beam search
        # (num_beams=1 would mean greedy decoding instead).
        corrected_ids = model.generate(
            inputs,
            max_length=325,
            num_beams=5,
            early_stopping=True
        )
        # Decode, dropping special tokens such as <pad> and </s>.
        corrected_sentence = tokenizer.decode(
            corrected_ids[0],
            skip_special_tokens=True
        )
        whole_text += corrected_sentence + ' '

    return remove_spaces_within_dollar(whole_text).strip()


demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=audio_correction,
    inputs=gr.components.Audio(sources=["upload"], type="filepath"),
    outputs="text"
)

yt_transcribe = gr.Interface(
    fn=youtube_correction,
    inputs="text",
    outputs="text"
)

with demo:
    gr.TabbedInterface([file_transcribe, yt_transcribe], ["Audio file", "YouTube"])

demo.launch()
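
For reference, the two text helpers behave roughly as follows on made-up inputs (illustrative values, not output captured from this Space):

remove_spaces_within_dollar('The equation $x ^ 2 + y ^ 2 = 1$ is a circle.')
# -> 'The equation $x^2+y^2=1$ is a circle.'

split_text_complex_rules_with_warning('First sentence. Second sentence!')
# -> ['First sentence.', 'Second sentence!']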