IT2091024v2

Paused

App Files Files Community

Pijush2023 committed on Jul 7, 2024

Commit

a7747d2

verified ·

1 Parent(s): 7b4aa5f

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -92

app.py CHANGED Viewed

@@ -545,92 +545,9 @@ def generate_audio_parler_tts(text):
     logging.debug(f"Audio saved to {combined_audio_path}")
     return combined_audio_path
-# # Load the MARS5 model
-# mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
-# def generate_audio_mars5(text):
-#     description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
-#     kwargs_dict = {
-#         'temperature': 0.8,
-#         'top_k': -1,
-#         'top_p': 0.2,
-#         'typical_p': 1.0,
-#         'freq_penalty': 2.6,
-#         'presence_penalty': 0.4,
-#         'rep_penalty_window': 100,
-#         'max_prompt_phones': 360,
-#         'deep_clone': True,
-#         'nar_guidance_w': 3
-#     }
-#     chunks = chunk_text(preprocess(text))
-#     audio_segments = []
-#     for chunk in chunks:
-#         wav = torch.zeros(1, mars5.sr)  # Use a placeholder silent audio for the reference
-#         cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
-#         ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
-#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
-#         torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
-#         audio_segments.append(AudioSegment.from_wav(temp_audio_path))
-#     combined_audio = sum(audio_segments)
-#     combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
-#     combined_audio.export(combined_audio_path, format="wav")
-#     logging.debug(f"Audio saved to {combined_audio_path}")
-#     return combined_audio_path
 # Load the MARS5 model
 mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
-# Setting device and precision
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-mars5.to(device)
-SAMPLE_RATE = 22050
-SEED = 42
-def preprocess(text):
-    number_normalizer = EnglishNumberNormalizer()
-    text = number_normalizer(text).strip()
-    if text[-1] not in punctuation:
-        text = f"{text}."
-    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'
-    def separate_abb(chunk):
-        chunk = chunk.replace(".", "")
-        return " ".join(chunk)
-    abbreviations = re.findall(abbreviations_pattern, text)
-    for abv in abbreviations:
-        if abv in text:
-            text = text.replace(abv, separate_abb(abv))
-    return text
-def chunk_text(text, max_length=250):
-    words = text.split()
-    chunks = []
-    current_chunk = []
-    current_length = 0
-    for word in words:
-        if current_length + len(word) + 1 <= max_length:
-            current_chunk.append(word)
-            current_length += len(word) + 1
-        else:
-            chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = len(word) + 1
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-    return chunks
 def generate_audio_mars5(text):
     description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
     kwargs_dict = {
@@ -645,24 +562,20 @@ def generate_audio_mars5(text):
         'deep_clone': True,
         'nar_guidance_w': 3
     }
     chunks = chunk_text(preprocess(text))
     audio_segments = []
-    def process_chunk(chunk):
         wav = torch.zeros(1, mars5.sr)  # Use a placeholder silent audio for the reference
         cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
         ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
         temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
         torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
-        return AudioSegment.from_wav(temp_audio_path)
-    # Use concurrent futures for parallel processing
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        results = list(executor.map(process_chunk, chunks))
-        audio_segments.extend(results)
     combined_audio = sum(audio_segments)
     combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
     combined_audio.export(combined_audio_path, format="wav")
@@ -671,6 +584,7 @@ def generate_audio_mars5(text):
     return combined_audio_path
 pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
 pipe.to(device)

     logging.debug(f"Audio saved to {combined_audio_path}")
     return combined_audio_path
 # Load the MARS5 model
 mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
 def generate_audio_mars5(text):
     description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
     kwargs_dict = {
         'deep_clone': True,
         'nar_guidance_w': 3
     }
     chunks = chunk_text(preprocess(text))
     audio_segments = []
+    for chunk in chunks:
         wav = torch.zeros(1, mars5.sr)  # Use a placeholder silent audio for the reference
         cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
         ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
         temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
         torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
+        audio_segments.append(AudioSegment.from_wav(temp_audio_path))
     combined_audio = sum(audio_segments)
     combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
     combined_audio.export(combined_audio_path, format="wav")
     return combined_audio_path
 pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
 pipe.to(device)