Spaces:

Hyeonsieun
/

Audio-to-LaTeX

Runtime error

File size: 5,399 Bytes

afedaa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87ce87a
afedaa3

import torch

import gradio as gr
from transformers import pipeline
from transformers import T5ForConditionalGeneration, T5Tokenizer

import re
import os
import json
import requests
import whisper
from yt_dlp import YoutubeDL

import matplotlib as plt

# Disabled loader for the standalone `whisper` package; speech recognition
# in this file goes through the transformers `pipe` defined below instead.
#whisper_model = whisper.load_model('small')

# T5 checkpoint (and its tokenizer) that the *_correction functions use to
# rewrite transcribed text into LaTeX.
path = "Hyeonsieun/NTtoGT_7epoch"
tokenizer = T5Tokenizer.from_pretrained(path)
model = T5ForConditionalGeneration.from_pretrained(path)


# Checkpoint name handed to the automatic-speech-recognition pipeline.
MODEL_NAME = "openai/whisper-large-v2"
# Passed as `batch_size` on every `pipe(...)` call in this file.
BATCH_SIZE = 8
#FILE_LIMIT_MB = 1000

# Shared ASR pipeline, built once at import time and reused by `transcribe`
# and `youtubeASR`; `chunk_length_s=30` is forwarded to the pipeline as-is.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
)


def transcribe(inputs):
    """Run the shared ASR pipeline on ``inputs`` and return the transcript.

    Args:
        inputs: Audio input forwarded unchanged to ``pipe``.

    Returns:
        The value stored under the ``"text"`` key of the pipeline output.

    Raises:
        gr.Error: If ``inputs`` is ``None`` (nothing was submitted).
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    asr_output = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return asr_output["text"]

def remove_spaces_within_dollar(text):
    """Delete space characters inside every ``$...$`` span of ``text``.

    Spans are matched non-greedily, so each ``$`` pairs with the nearest
    following ``$``. Text outside the spans, and any unpaired ``$``, is
    returned unchanged.

    Args:
        text: String that may contain dollar-delimited segments.

    Returns:
        A copy of ``text`` in which the matched spans contain no spaces.
    """
    def _squeeze(found):
        # group(0) is the whole span including both delimiters.
        return found.group(0).replace(' ', '')

    return re.sub(r'\$(.*?)\$', _squeeze, text)


def audio_correction(file):
    """Transcribe an audio file and rewrite its spoken formulas as LaTeX.

    The transcript from ``transcribe`` is split into sentence-sized pieces by
    ``split_text_complex_rules_with_warning``; each piece is run through the
    T5 ``model`` and the decoded outputs are joined with single spaces.

    Special tokens are dropped by the tokenizer while decoding. The previous
    approach (decode with special tokens, then slice ``[5:-4]``) only removed
    the first ``<pad>`` and the last ``</s>``, leaving a literal
    ``</s><pad>`` at every sentence boundary.

    Args:
        file: Audio input forwarded to ``transcribe``.

    Returns:
        The corrected text, with spaces removed inside ``$...$`` spans.

    Raises:
        gr.Error: Propagated from ``transcribe`` when ``file`` is ``None``.
    """
    ASR_result = transcribe(file)

    corrected_sentences = []
    for text in split_text_complex_rules_with_warning(ASR_result):
        if not text:
            # Nothing to translate; avoid running the model on a bare prompt.
            continue
        input_text = f"translate the text pronouncing the formula to a LaTeX equation: {text}"
        inputs = tokenizer.encode(
            input_text,
            return_tensors='pt',
            max_length=325,
            padding='max_length',
            truncation=True
        )
        # Generate the corrected sentence ids with beam search (5 beams).
        corrected_ids = model.generate(
            inputs,
            max_length=325,
            num_beams=5,
            early_stopping=True
        )
        # Decode the best beam, letting the tokenizer drop special tokens.
        corrected_sentence = tokenizer.decode(
            corrected_ids[0],
            skip_special_tokens=True
        ).strip()
        if corrected_sentence:
            corrected_sentences.append(corrected_sentence)

    return remove_spaces_within_dollar(' '.join(corrected_sentences))

def youtubeASR(link):
    """Download the audio track of a YouTube video and transcribe it.

    The audio is saved into a fresh temporary directory that is removed when
    the function returns, so a file left by an earlier request can never be
    reused and concurrent requests do not overwrite each other.

    Args:
        link: URL of the video to transcribe.

    Returns:
        The transcript string (the ``"text"`` entry of the pipeline output).

    Raises:
        gr.Error: If ``link`` is ``None`` or blank.
    """
    import tempfile  # local import: not among the file's top-level imports

    if link is None or not link.strip():
        raise gr.Error("No YouTube link submitted! Please enter a video URL before submitting your request.")

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Temporary file that receives only the audio of the video.
        out_fn = os.path.join(tmp_dir, 'temp1.mp3')

        ydl_opts = {
            'format': 'bestaudio/best',  # download audio only
            # '%' starts a template field in yt-dlp output templates, so
            # escape any literal '%' in the path.
            'outtmpl': out_fn.replace('%', '%%'),
        }

        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([link.strip()])

        # Transcribe while the downloaded file still exists.
        result = pipe(
            out_fn,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": "transcribe"},
            return_timestamps=True,
        )

    # The pipeline output is indexed once; the original indexed ["text"]
    # twice, which raised TypeError on the resulting string.
    return result["text"]

def _chunk_by_words(text, max_len):
    """Greedily pack the whitespace-separated words of ``text`` into chunks of at most ``max_len`` characters."""
    chunks = []
    current = ''
    for word in text.split():
        # A single word longer than the limit has to be cut mid-word.
        while len(word) > max_len:
            if current:
                chunks.append(current)
                current = ''
            chunks.append(word[:max_len])
            word = word[max_len:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_len:
            current = candidate
        else:
            chunks.append(current)
            current = word
    if current:
        chunks.append(current)
    return chunks


def split_text_complex_rules_with_warning(text, max_len=256):
    """Split ``text`` into non-empty pieces of at most ``max_len`` characters.

    Splitting is tried in three stages, moving on only when a piece is still
    too long:

    1. After sentence-ending punctuation (``.``, ``?``, ``!``) followed by
       whitespace.
    2. At commas (the commas themselves are dropped).
    3. At whitespace, packing words greedily; a warning is logged because
       this cut is not at a punctuation boundary.

    Over-long pieces used to be silently discarded at stage 2, which lost
    part of the transcript; they are now kept via stage 3. Empty pieces
    (empty input, trailing whitespace) are omitted from the result.

    Args:
        text: The text to split.
        max_len: Maximum length, in characters, of each returned piece.

    Returns:
        A list of stripped, non-empty strings, each no longer than
        ``max_len``, in their original order.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    import logging  # local import: not among the file's top-level imports

    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    logger = logging.getLogger(__name__)

    result = []
    # Split into sentences on punctuation other than commas.
    for part in re.split(r'(?<=[.?!])\s+', text):
        part = part.strip()
        if not part:
            continue
        if len(part) <= max_len:
            result.append(part)
            continue

        # The sentence is too long: split it further at commas.
        for subpart in re.split(r',\s*', part):
            subpart = subpart.strip()
            if not subpart:
                continue
            if len(subpart) <= max_len:
                result.append(subpart)
                continue
            logger.warning(
                "Segment exceeds %d characters (length %d); splitting on whitespace: %s...",
                max_len, len(subpart), subpart[:50],
            )
            result.extend(_chunk_by_words(subpart, max_len))

    return result


def youtube_correction(link):
    """Transcribe a YouTube video and rewrite its spoken formulas as LaTeX.

    The transcript from ``youtubeASR`` is split into sentence-sized pieces by
    ``split_text_complex_rules_with_warning``; each piece is run through the
    T5 ``model`` and the decoded outputs are joined with single spaces.

    Special tokens are dropped by the tokenizer while decoding. The previous
    approach (decode with special tokens, then slice ``[5:-4]``) only removed
    the first ``<pad>`` and the last ``</s>``, leaving a literal
    ``</s><pad>`` at every sentence boundary.

    Args:
        link: URL of the video, forwarded to ``youtubeASR``.

    Returns:
        The corrected text, with spaces removed inside ``$...$`` spans.
    """
    ASR_result = youtubeASR(link)

    corrected_sentences = []
    for text in split_text_complex_rules_with_warning(ASR_result):
        if not text:
            # Nothing to translate; avoid running the model on a bare prompt.
            continue
        input_text = f"translate the text pronouncing the formula to a LaTeX equation: {text}"
        inputs = tokenizer.encode(
            input_text,
            return_tensors='pt',
            max_length=325,
            padding='max_length',
            truncation=True
        )
        # Generate the corrected sentence ids with beam search (5 beams).
        corrected_ids = model.generate(
            inputs,
            max_length=325,
            num_beams=5,
            early_stopping=True
        )
        # Decode the best beam, letting the tokenizer drop special tokens.
        corrected_sentence = tokenizer.decode(
            corrected_ids[0],
            skip_special_tokens=True
        ).strip()
        if corrected_sentence:
            corrected_sentences.append(corrected_sentence)

    return remove_spaces_within_dollar(' '.join(corrected_sentences))


# Top-level container that hosts the tabbed UI assembled below.
demo = gr.Blocks()

# Tab 1: upload an audio file (handed to the function as a file path) and
# get the corrected text back.
file_transcribe = gr.Interface(
    fn=audio_correction,
    inputs=gr.components.Audio(sources="upload", type="filepath"),
    outputs="text"
    )

# Tab 2: enter a YouTube link as text and get the corrected text back.
yt_transcribe = gr.Interface(
    fn=youtube_correction,
    inputs="text",
    outputs="text"
    )

# Place both interfaces into `demo` as named tabs.
with demo:
    gr.TabbedInterface([file_transcribe, yt_transcribe], ["Audio file", "YouTube"])

# Start the Gradio app; this runs whenever the module is executed.
demo.launch()