Raven-with-Voice-Cloning

Runtime error

App Files Files Community

Kevin676 committed on Apr 8, 2023

Commit

a69ae8e

1 Parent(s): 002b9bb

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -7

app.py CHANGED Viewed

@@ -17,6 +17,24 @@ model = RWKV(model=model_path, strategy='cuda fp16i8 *8 -> cuda fp16')
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 pipeline = PIPELINE(model, "20B_tokenizer.json")
 def generate_prompt(instruction, input=None):
     if input:
         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
@@ -39,7 +57,9 @@ def generate_prompt(instruction, input=None):
 """
 def evaluate(
-    instruction,
 #    input=None,
 #    token_count=200,
 #    temperature=1.0,
@@ -47,13 +67,30 @@ def evaluate(
 #    presencePenalty = 0.1,
 #    countPenalty = 0.1,
 ):
     args = PIPELINE_ARGS(temperature = max(0.2, float(1)), top_p = float(0.5),
                      alpha_frequency = 0.4,
                      alpha_presence = 0.4,
                      token_ban = [], # ban the generation of some tokens
                      token_stop = [0]) # stop generation whenever you see any token here
-    instruction = instruction.strip()
     input=None
 #    input = input.strip()
     ctx = generate_prompt(instruction, input)
@@ -87,12 +124,33 @@ def evaluate(
             out_last = i + 1
     gc.collect()
     torch.cuda.empty_cache()
-    yield out_str.strip()
 g = gr.Interface(
     fn=evaluate,
     inputs=[
-        gr.components.Textbox(lines=2, label="Instruction", value="Tell me about ravens."),
 #        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
 #        gr.components.Slider(minimum=10, maximum=200, step=10, value=150), # token_count
 #        gr.components.Slider(minimum=0.2, maximum=2.0, step=0.1, value=1.0), # temperature
@@ -101,9 +159,9 @@ g = gr.Interface(
 #        gr.components.Slider(0.0, 1.0, step=0.1, value=0.4),  # countPenalty
     ],
     outputs=[
-        gr.inputs.Textbox(
-            lines=5,
-            label="Output",
         )
     ],
     title="🥳💬💕 - TalktoAI，随时随地，谈天说地！",

 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 pipeline = PIPELINE(model, "20B_tokenizer.json")
+from TTS.api import TTS
+tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
+import whisper
+model = whisper.load_model("small")
+os.system('pip install voicefixer --upgrade')
+from voicefixer import VoiceFixer
+voicefixer = VoiceFixer()
+import torchaudio
+from speechbrain.pretrained import SpectralMaskEnhancement
+enhance_model = SpectralMaskEnhancement.from_hparams(
+source="speechbrain/metricgan-plus-voicebank",
+savedir="pretrained_models/metricgan-plus-voicebank",
+run_opts={"device":"cuda"},
+)
 def generate_prompt(instruction, input=None):
     if input:
         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 """
 def evaluate(
+    upload,
+    audio,
+#    instruction,
 #    input=None,
 #    token_count=200,
 #    temperature=1.0,
 #    presencePenalty = 0.1,
 #    countPenalty = 0.1,
 ):
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+    # make log-Mel spectrogram and move to the same device as the model
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    print(f"Detected language: {max(probs, key=probs.get)}")
+    # decode the audio
+    options = whisper.DecodingOptions()
+    result = whisper.decode(model, mel, options)
+    res = []
     args = PIPELINE_ARGS(temperature = max(0.2, float(1)), top_p = float(0.5),
                      alpha_frequency = 0.4,
                      alpha_presence = 0.4,
                      token_ban = [], # ban the generation of some tokens
                      token_stop = [0]) # stop generation whenever you see any token here
+    instruction = result.text.strip()
     input=None
 #    input = input.strip()
     ctx = generate_prompt(instruction, input)
             out_last = i + 1
     gc.collect()
     torch.cuda.empty_cache()
+    res.append(out_str.strip())
+    tts.tts_to_file(res, speaker_wav = upload, language="en", file_path="output.wav")
+    voicefixer.restore(input="output.wav", # input wav file path
+                    output="audio1.wav", # output wav file path
+                    cuda=True, # whether to use gpu acceleration
+                    mode = 0) # You can try out mode 0, 1, or 2 to find out the best result
+    noisy = enhance_model.load_audio(
+    "audio1.wav"
+    ).unsqueeze(0)
+    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
+    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
+    return [result.text, res, "enhanced.wav"]
+#    yield out_str.strip()
 g = gr.Interface(
     fn=evaluate,
     inputs=[
+        gr.Audio(source="upload", label = "请上传您喜欢的声音(wav文件)", type="filepath"),
+        gr.Audio(source="microphone", label = "和您的专属AI聊天吧！", type="filepath"),
+#        gr.components.Textbox(lines=2, label="Instruction", value="Tell me about ravens."),
 #        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
 #        gr.components.Slider(minimum=10, maximum=200, step=10, value=150), # token_count
 #        gr.components.Slider(minimum=0.2, maximum=2.0, step=0.1, value=1.0), # temperature
 #        gr.components.Slider(0.0, 1.0, step=0.1, value=0.4),  # countPenalty
     ],
     outputs=[
+        gr.Textbox(label="Speech to Text"),
+        gr.Textbox(label="Raven Output"),
+        gr.Audio(label="Audio with Custom Voice"),
         )
     ],
     title="🥳💬💕 - TalktoAI，随时随地，谈天说地！",