Lyte committed (verified)
Commit 4c8266a · Parent(s): 26e5605

Create app.py

Files changed (1)
app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
+ import torch
+ import gradio as gr
+ import whisper
+ import outetts
+ import numpy as np
+ from huggingface_hub import hf_hub_download
+ from outetts.wav_tokenizer.audio_codec import AudioCodec
+ from outetts.version.v2.prompt_processor import PromptProcessor
+ from outetts.version.playback import ModelOutput
+
+ model_path = hf_hub_download(
+     repo_id="Lyte/CiSiMi",
+     filename="unsloth.Q8_0.gguf",
+ )
+
+ model_config = outetts.GGUFModelConfig_v2(
+     model_path=model_path,
+     tokenizer_path="Lyte/CiSiMi",
+ )
+
+ interface = outetts.InterfaceGGUF(model_version="0.3", cfg=model_config)
+ audio_codec = AudioCodec()
+ prompt_processor = PromptProcessor("Lyte/Qwen-2.5-0.5B-S2S-test")
+ whisper_model = whisper.load_model("base.en")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ gguf_model = interface.get_model()
+
+ def get_audio(tokens):
+     outputs = prompt_processor.extract_audio_from_tokens(tokens)
+     if not outputs:
+         return None
+     audio_tensor = audio_codec.decode(torch.tensor([[outputs]], dtype=torch.int64).to(device))
+     return ModelOutput(audio_tensor, audio_codec.sr)
+
+ def extract_text_from_tts_output(tts_output):
+     text = ""
+     for line in tts_output.strip().split('\n'):
+         if '<|audio_end|>' in line or '<|im_end|>' in line:
+             continue
+         if '<|' in line:
+             word = line.split('<|')[0].strip()
+             if word:
+                 text += word + " "
+         else:
+             text += line.strip() + " "
+     return text.strip()
+
+ def process_input(audio_input, text_input):
+     if audio_input is None and (text_input is None or text_input.strip() == ""):
+         return "Please provide either audio or text input.", None
+
+     if audio_input is not None:
+         return process_audio(audio_input)
+     else:
+         return process_text(text_input)
+
+ def process_audio(audio):
+     result = whisper_model.transcribe(audio)
+     instruction = result["text"]
+     return generate_response(instruction)
+
+ def process_text(text):
+     instruction = text
+     return generate_response(instruction)
+
+ def generate_response(instruction):
+     prompt = f"<|im_start|>\nInstructions:\n{instruction}\n<|im_end|>\nAnswer:\n"
+     gen_cfg = outetts.GenerationConfig(
+         text=prompt,
+         temperature=0.6,
+         repetition_penalty=1.1,
+         max_length=4096,
+         speaker=None
+     )
+
+     input_ids = prompt_processor.tokenizer.encode(prompt)
+     tokens = gguf_model.generate(input_ids, gen_cfg)
+
+     output_text = prompt_processor.tokenizer.decode(tokens, skip_special_tokens=False)
+
+     if "<|audio_end|>" in output_text:
+         first_part, _, _ = output_text.partition("<|audio_end|>")
+
+         if "<|audio_end|>\n<|im_end|>\n" not in first_part:
+             first_part += "<|audio_end|>\n<|im_end|>\n"
+
+         extracted_text = extract_text_from_tts_output(first_part)
+
+         audio_start_pos = first_part.find("<|audio_start|>\n") + len("<|audio_start|>\n")
+         audio_end_pos = first_part.find("<|audio_end|>\n<|im_end|>\n") + len("<|audio_end|>\n<|im_end|>\n")
+
+         if audio_start_pos >= len("<|audio_start|>\n") and audio_end_pos > audio_start_pos:
+             audio_tokens_text = first_part[audio_start_pos:audio_end_pos]
+
+             audio_tokens = prompt_processor.tokenizer.encode(audio_tokens_text)
+
+             # print(f"Decoding audio with {len(audio_tokens)} tokens")
+             # print(f"audio_tokens: {audio_tokens_text}")
+
+             audio_output = get_audio(audio_tokens)
+
+             if audio_output is not None and hasattr(audio_output, 'audio') and audio_output.audio is not None:
+                 audio_numpy = audio_output.audio.cpu().numpy()
+                 if audio_numpy.ndim > 1:
+                     audio_numpy = audio_numpy.squeeze()
+
+                 # display(Audio(data=audio_numpy, rate=audio_output.sr, autoplay=True))
+                 return extracted_text, (audio_output.sr, audio_numpy)
+
+     return output_text, None
+
+ iface = gr.Interface(
+     fn=process_input,
+     inputs=[
+         gr.Audio(type="filepath", label="Audio Input (Optional)"),
+         gr.Textbox(label="Text Input (Optional)")
+     ],
+     outputs=[
+         gr.Textbox(label="Response Text"),
+         gr.Audio(type="numpy", label="Generated Speech")
+     ],
+     title="CiSiMi @ Home Demo",
+     description="Me: Mom can we have CSM locally! Mom: we have CSM locally. CSM locally:",
+     examples=[
+         [None, "Hello, what are you capable of?"],
+         [None, "Explain to me how gravity works!"]
+     ]
+ )
+
+ iface.launch(debug=True)