import json
import os
import re
import logging
import warnings

from tqdm import tqdm
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

warnings.filterwarnings('ignore')
logging.getLogger('modelscope').setLevel(logging.CRITICAL)

# Paraformer-large Chinese ASR model (16 kHz) with VAD and punctuation, pinned to v1.2.4.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    model_revision='v1.2.4'
)

# out_path = 'ttts/datasets/all_data.jsonl'


def process_file_asr(paths):
    """Run ASR on a single audio file and append the transcript to a JSONL file.

    `paths` is a (file_path, out_path) tuple. Transcripts shorter than 5
    characters or containing Latin letters are skipped, since they would
    otherwise have BERT feature files generated for them later.
    """
    file_path, out_path = paths
    try:
        text = inference_pipeline(audio_in=file_path)['text']
        if len(text) >= 5:
            if re.search(r'[A-Za-z]', text):
                # Transcript contains Latin letters: skip it (or optionally delete the
                # audio file), otherwise a BERT file would also be generated for it later.
                pass
                # os.remove(file_path)
            else:
                print(f'{file_path} ASR result: {text}')
                with open(out_path, 'a', encoding='utf-8') as file:
                    json.dump({'text': text, 'path': file_path}, file, ensure_ascii=False)
                    file.write('\n')
                return file_path, text
        else:
            # Transcript too short: skip it (or optionally delete the audio file).
            pass
            # os.remove(file_path)
    except Exception:
        print(f'ASR failed for sample: {file_path}')
    return None
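

# Usage sketch (an assumption, not part of the original script): drive
# process_file_asr over a directory of wav files. The directory name
# `input_dir` is hypothetical; `out_path` mirrors the commented-out path above.
if __name__ == '__main__':
    import glob

    input_dir = 'ttts/datasets/wavs'           # hypothetical input directory of 16 kHz wavs
    out_path = 'ttts/datasets/all_data.jsonl'  # JSONL file the transcripts are appended to

    wav_files = sorted(glob.glob(os.path.join(input_dir, '*.wav')))

    # Run sequentially with a progress bar. Parallelising with multiprocessing
    # would need extra care, since the ModelScope pipeline is created at module
    # load time.
    for wav in tqdm(wav_files):
        process_file_asr((wav, out_path))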