Spaces:

intelli-zen
/

asr

Running

HoneyTian committed on May 11, 2024

Commit

dcf6c65

1 Parent(s): 085d464

update

Files changed (1) hide show

main.py CHANGED Viewed

@@ -9,6 +9,7 @@ import logging
 from pathlib import Path
 import platform
 import time
 from project_settings import project_path, log_directory
 import log
@@ -77,6 +78,15 @@ def process(
     main_logger.info("num_active_paths: {}".format(num_active_paths))
     main_logger.info("in_filename: {}".format(in_filename))
     m_list = models.model_map.get(language)
     if m_list is None:
         raise AssertionError("language invalid: {}".format(language))
@@ -88,11 +98,8 @@ def process(
     if m_dict is None:
         raise AssertionError("repo_id invalid: {}".format(repo_id))
     local_model_dir = pretrained_model_dir / "huggingface" / repo_id
-    out_filename = io.BytesIO()
-    audio_convert(in_filename, out_filename)
     nn_model_file = local_model_dir / m_dict["nn_model_file"]
     tokens_file = local_model_dir / m_dict["tokens_file"]
@@ -107,6 +114,7 @@ def process(
         num_active_paths=num_active_paths,
     )
     now = datetime.now()
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     logging.info(f"Started at {date_time}")
@@ -119,6 +127,7 @@ def process(
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     end = time.time()
     metadata = torchaudio.info(out_filename)
     duration = metadata.num_frames / 16000
     rtf = (end - start) / duration

 from pathlib import Path
 import platform
 import time
+import tempfile
 from project_settings import project_path, log_directory
 import log
     main_logger.info("num_active_paths: {}".format(num_active_paths))
     main_logger.info("in_filename: {}".format(in_filename))
+    # audio convert
+    in_filename = Path(in_filename)
+    out_filename = Path(tempfile.gettempdir()) / "asr" / in_filename.name
+    audio_convert(in_filename=in_filename.as_posix(),
+                  out_filename=out_filename.as_posix(),
+                  )
+    # model settings
     m_list = models.model_map.get(language)
     if m_list is None:
         raise AssertionError("language invalid: {}".format(language))
     if m_dict is None:
         raise AssertionError("repo_id invalid: {}".format(repo_id))
+    # load recognizer
     local_model_dir = pretrained_model_dir / "huggingface" / repo_id
     nn_model_file = local_model_dir / m_dict["nn_model_file"]
     tokens_file = local_model_dir / m_dict["tokens_file"]
         num_active_paths=num_active_paths,
     )
+    # transcribe
     now = datetime.now()
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     logging.info(f"Started at {date_time}")
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     end = time.time()
+    # statistics
     metadata = torchaudio.info(out_filename)
     duration = metadata.num_frames / 16000
     rtf = (end - start) / duration