Spaces:
				
			
			
	
			
			
		Paused
		
	
	
	
			
			
	
	
	
	
		
		
		Paused
		
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -638,6 +638,14 @@ from langchain.chains.conversation.memory import ConversationBufferWindowMemory | |
| 638 | 
             
            from langchain.agents import Tool, initialize_agent
         | 
| 639 | 
             
            from huggingface_hub import login
         | 
| 640 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 641 | 
             
            # Check if the token is already set in the environment variables
         | 
| 642 | 
             
            hf_token = os.getenv("HF_TOKEN")
         | 
| 643 |  | 
| @@ -952,7 +960,7 @@ def fetch_local_news(): | |
| 952 | 
             
                api_key = os.environ['SERP_API']
         | 
| 953 | 
             
                url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
         | 
| 954 | 
             
                response = requests.get(url)
         | 
| 955 | 
            -
                if response.status_code == 200:
         | 
| 956 | 
             
                    results = response.json().get("news_results", [])
         | 
| 957 | 
             
                    news_html = """
         | 
| 958 | 
             
                    <h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
         | 
| @@ -1111,44 +1119,126 @@ def generate_audio_elevenlabs(text): | |
| 1111 | 
             
                    return None
         | 
| 1112 |  | 
| 1113 | 
             
            # Changes start here
         | 
| 1114 | 
            -
             | 
| 1115 | 
            -
            def  | 
| 1116 | 
            -
             | 
| 1117 | 
            -
             | 
| 1118 | 
            -
             | 
| 1119 | 
            -
             | 
| 1120 | 
            -
             | 
| 1121 | 
            -
             | 
| 1122 | 
            -
             | 
| 1123 | 
            -
             | 
| 1124 | 
            -
             | 
| 1125 | 
            -
             | 
| 1126 | 
            -
                     | 
| 1127 | 
            -
             | 
| 1128 | 
            -
             | 
| 1129 | 
            -
             | 
| 1130 | 
            -
             | 
| 1131 | 
            -
             | 
| 1132 | 
            -
             | 
| 1133 | 
            -
             | 
| 1134 | 
            -
             | 
| 1135 | 
            -
             | 
| 1136 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1137 |  | 
| 1138 | 
            -
             | 
| 1139 | 
            -
             | 
|  | |
| 1140 |  | 
| 1141 | 
            -
             | 
| 1142 | 
            -
                audio_arr = generation.cpu().numpy().squeeze()
         | 
| 1143 |  | 
| 1144 | 
            -
             | 
| 1145 | 
            -
             | 
| 1146 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1147 |  | 
| 1148 | 
            -
             | 
| 1149 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1150 |  | 
| 1151 | 
            -
            # Function to generate audio using Parler TTS
         | 
| 1152 | 
             
            def generate_audio_parler_tts(text):
         | 
| 1153 | 
             
                model_id = 'parler-tts/parler_tts_mini_v0.1'
         | 
| 1154 | 
             
                device = "cuda:0" if torch.cuda.is_available() else "cpu"
         | 
| @@ -1161,28 +1251,38 @@ def generate_audio_parler_tts(text): | |
| 1161 | 
             
                    model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
         | 
| 1162 |  | 
| 1163 | 
             
                tokenizer = AutoTokenizer.from_pretrained(model_id)
         | 
|  | |
|  | |
| 1164 |  | 
| 1165 | 
            -
                 | 
| 1166 | 
            -
                 | 
|  | |
| 1167 |  | 
| 1168 | 
            -
                 | 
| 1169 | 
            -
                    future_to_chunk = {executor.submit(process_chunk, chunk, model, tokenizer, device): chunk for chunk in text_chunks}
         | 
| 1170 |  | 
| 1171 | 
            -
             | 
| 1172 | 
            -
             | 
| 1173 | 
            -
             | 
| 1174 | 
            -
             | 
| 1175 | 
            -
             | 
| 1176 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1177 |  | 
| 1178 | 
             
                combined_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
         | 
| 1179 | 
             
                combined_audio = []
         | 
| 1180 |  | 
| 1181 | 
            -
                for  | 
| 1182 | 
            -
                     | 
| 1183 | 
            -
                    combined_audio.extend( | 
| 1184 |  | 
| 1185 | 
            -
                sf.write(combined_audio_path, combined_audio,  | 
| 1186 |  | 
| 1187 | 
             
                logging.debug(f"Combined audio saved to {combined_audio_path}")
         | 
| 1188 | 
             
                return combined_audio_path
         | 
| @@ -1256,3 +1356,4 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo: | |
| 1256 |  | 
| 1257 | 
             
            demo.queue()
         | 
| 1258 | 
             
            demo.launch(share=True)
         | 
|  | 
|  | |
| 638 | 
             
            from langchain.agents import Tool, initialize_agent
         | 
| 639 | 
             
            from huggingface_hub import login
         | 
| 640 |  | 
| 641 | 
            +
            from pydub import AudioSegment
         | 
| 642 | 
            +
            import io
         | 
| 643 | 
            +
            import math
         | 
| 644 | 
            +
            from threading import Thread
         | 
| 645 | 
            +
            from queue import Queue
         | 
| 646 | 
            +
            from transformers.generation.streamers import BaseStreamer
         | 
| 647 | 
            +
            import numpy as np
         | 
| 648 | 
            +
             | 
| 649 | 
             
            # Check if the token is already set in the environment variables
         | 
| 650 | 
             
            hf_token = os.getenv("HF_TOKEN")
         | 
| 651 |  | 
|  | |
| 960 | 
             
                api_key = os.environ['SERP_API']
         | 
| 961 | 
             
                url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
         | 
| 962 | 
             
                response = requests.get(url)
         | 
| 963 | 
            +
                if response.status_code == 200):
         | 
| 964 | 
             
                    results = response.json().get("news_results", [])
         | 
| 965 | 
             
                    news_html = """
         | 
| 966 | 
             
                    <h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
         | 
|  | |
| 1119 | 
             
                    return None
         | 
| 1120 |  | 
| 1121 | 
             
            # Changes start here
         | 
class ParlerTTSStreamer(BaseStreamer):
    """Streamer that converts generated Parler-TTS tokens into audio chunks.

    Tokens are handed in through `put`; once generation is over `end` is
    called. Whenever the number of cached token steps is a multiple of
    `play_steps`, the whole cache is decoded to a waveform and the samples
    that have not been emitted yet are pushed onto an internal queue.
    Iterating over the streamer pops those chunks from the queue until the
    stop signal is read, so the consumer can run in a different thread than
    the producer.
    """

    def __init__(
        self,
        model: ParlerTTSForConditionalGeneration,
        device: Optional[str] = None,
        play_steps: Optional[int] = 10,
        stride: Optional[int] = None,
        timeout: Optional[float] = None,
    ):
        """Capture the model parts needed for decoding and set up the queue.

        Args:
            model: Model whose `decoder`, `audio_encoder` and
                `generation_config` are used to decode the token cache.
            device: Stored on the instance; falls back to `model.device`.
            play_steps: Number of cached token steps between two decodes.
            stride: Number of trailing audio samples held back from each
                intermediate chunk. When omitted it is derived from the
                audio encoder's sampling/frame rate, `play_steps` and the
                decoder's number of codebooks.
            timeout: Timeout (seconds) passed to every queue `put`/`get`;
                `None` blocks indefinitely.
        """
        self.decoder = model.decoder
        self.audio_encoder = model.audio_encoder
        self.generation_config = model.generation_config
        self.device = device if device is not None else model.device

        # State used while streaming.
        self.play_steps = play_steps
        if stride is not None:
            self.stride = stride
        else:
            # Audio samples produced per token frame.
            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
            self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
        self.token_cache = None  # all tokens received so far
        self.to_yield = 0  # number of audio samples already emitted

        # State shared with the consuming side.
        self.audio_queue = Queue()
        self.stop_signal = None  # sentinel queued after the final chunk
        self.timeout = timeout

    def apply_delay_pattern_mask(self, input_ids):
        """Undo the decoder's delay pattern on `input_ids` and decode to audio.

        Despite its name this method also runs the audio decoder.

        Args:
            input_ids: Cached token tensor; assumed to be laid out as
                (num_codebooks, steps) - TODO confirm against the caller.

        Returns:
            A float NumPy array holding the decoded waveform.
        """
        # Build the delay-pattern mask from the first column only, sized to
        # the full length of the cache.
        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
            input_ids[:, :1],
            bos_token_id=self.generation_config.bos_token_id,
            pad_token_id=self.generation_config.decoder_start_token_id,
            max_length=input_ids.shape[-1],
        )
        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)

        # Keep only positions whose mask entry is neither BOS nor PAD, then
        # regroup them per codebook.
        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
        # Add a leading axis in front of the (1, num_codebooks, steps) codes.
        input_ids = input_ids[None, ...]

        input_ids = input_ids.to(self.audio_encoder.device)

        # If any special token is still present, the codes cannot be decoded
        # as they are and invalid steps must be filtered out first.
        decode_sequentially = (
            self.generation_config.bos_token_id in input_ids
            or self.generation_config.pad_token_id in input_ids
            or self.generation_config.eos_token_id in input_ids
        )
        if not decode_sequentially:
            output_values = self.audio_encoder.decode(
                input_ids,
                audio_scales=[None],
            )
        else:
            sample = input_ids[:, 0]
            # Keep only the steps where every codebook entry is a valid code
            # (strictly below the codebook size).
            sample_mask = (sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0
            sample = sample[:, :, sample_mask]
            output_values = self.audio_encoder.decode(sample[None, ...], [None])

        audio_values = output_values.audio_values[0, 0]
        return audio_values.cpu().float().numpy()

    def put(self, value):
        """Cache newly generated tokens and emit audio every `play_steps` steps.

        Args:
            value: Token tensor whose first dimension is
                `batch_size * num_codebooks`.

        Raises:
            ValueError: If the implied batch size is larger than 1.
        """
        batch_size = value.shape[0] // self.decoder.num_codebooks
        if batch_size > 1:
            raise ValueError("ParlerTTSStreamer only supports batch size 1")

        if self.token_cache is None:
            self.token_cache = value
        else:
            # Later calls get a new trailing axis before being appended along
            # the last dimension - presumably they carry a single step.
            self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)

        if self.token_cache.shape[-1] % self.play_steps == 0:
            audio_values = self.apply_delay_pattern_mask(self.token_cache)
            # Hold back the last `stride` samples of this decode. An explicit
            # end index is used instead of `[:-stride]`: with a stride of 0
            # that slice is `[:0]` (empty) and the audio would be skipped,
            # and a negative stride would select an arbitrary prefix.
            total = len(audio_values)
            cutoff = max(min(total - self.stride, total), self.to_yield)
            self.on_finalized_audio(audio_values[self.to_yield:cutoff])
            self.to_yield = cutoff

    def end(self):
        """Flush the remaining audio and queue the stop signal."""
        if self.token_cache is not None:
            audio_values = self.apply_delay_pattern_mask(self.token_cache)
        else:
            # Nothing was generated: the slice below yields an empty chunk.
            audio_values = np.zeros(self.to_yield)

        self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)

    def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
        """Queue an audio chunk, followed by the stop signal when `stream_end`."""
        self.audio_queue.put(audio, timeout=self.timeout)
        if stream_end:
            self.audio_queue.put(self.stop_signal, timeout=self.timeout)

    def __iter__(self):
        """Return the streamer itself; chunks are produced by `__next__`."""
        return self

    def __next__(self):
        """Return the next queued audio chunk.

        Blocks for up to `timeout` seconds waiting on the queue.

        Raises:
            StopIteration: When the stop signal is read from the queue.
        """
        value = self.audio_queue.get(timeout=self.timeout)
        # Identity check: the sentinel is `None`, and `==` on an ndarray
        # would compare element-wise.
        if value is self.stop_signal:
            raise StopIteration()
        return value
| 1220 | 
            +
             | 
def numpy_to_mp3(audio_array, sampling_rate):
    """Encode a mono NumPy audio buffer as MP3 and return the encoded bytes.

    Floating-point input is peak-normalised to the int16 range before being
    wrapped in an `AudioSegment`; integer input is passed through with its
    own item size as the sample width.

    Args:
        audio_array: NumPy array of audio samples (treated as one channel).
        sampling_rate: Frame rate, in Hz, handed to `AudioSegment`.

    Returns:
        The MP3 data as `bytes` (exported at a 320k bitrate).
    """
    if np.issubdtype(audio_array.dtype, np.floating):
        # Guard the normalisation: an empty chunk has no peak, and a silent
        # (all-zero) chunk would otherwise divide 0 by 0 and feed NaNs into
        # the int16 cast.
        peak = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        if peak > 0:
            audio_array = (audio_array / peak) * 32767
        audio_array = audio_array.astype(np.int16)

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )

    # The context manager closes the buffer even if the export raises.
    with io.BytesIO() as mp3_io:
        audio_segment.export(mp3_io, format="mp3", bitrate="320k")
        return mp3_io.getvalue()
| 1241 |  | 
|  | |
| 1242 | 
             
            def generate_audio_parler_tts(text):
         | 
| 1243 | 
             
                model_id = 'parler-tts/parler_tts_mini_v0.1'
         | 
| 1244 | 
             
                device = "cuda:0" if torch.cuda.is_available() else "cpu"
         | 
|  | |
| 1251 | 
             
                    model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
         | 
| 1252 |  | 
| 1253 | 
             
                tokenizer = AutoTokenizer.from_pretrained(model_id)
         | 
| 1254 | 
            +
                sampling_rate = model.audio_encoder.config.sampling_rate
         | 
| 1255 | 
            +
                frame_rate = model.audio_encoder.config.frame_rate
         | 
| 1256 |  | 
| 1257 | 
            +
                description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
         | 
| 1258 | 
            +
                play_steps_in_s = 2.0
         | 
| 1259 | 
            +
                play_steps = int(frame_rate * play_steps_in_s)
         | 
| 1260 |  | 
| 1261 | 
            +
                streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
         | 
|  | |
| 1262 |  | 
| 1263 | 
            +
                inputs = tokenizer(description, return_tensors="pt").to(device)
         | 
| 1264 | 
            +
                prompt = tokenizer(text, return_tensors="pt").to(device)
         | 
| 1265 | 
            +
             | 
| 1266 | 
            +
                generation_kwargs = dict(
         | 
| 1267 | 
            +
                    input_ids=inputs.input_ids,
         | 
| 1268 | 
            +
                    prompt_input_ids=prompt.input_ids,
         | 
| 1269 | 
            +
                    streamer=streamer,
         | 
| 1270 | 
            +
                    do_sample=True,
         | 
| 1271 | 
            +
                    temperature=1.0,
         | 
| 1272 | 
            +
                    min_new_tokens=10,
         | 
| 1273 | 
            +
                )
         | 
| 1274 | 
            +
             | 
| 1275 | 
            +
                thread = Thread(target=model.generate, kwargs=generation_kwargs)
         | 
| 1276 | 
            +
                thread.start()
         | 
| 1277 |  | 
| 1278 | 
             
                combined_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
         | 
| 1279 | 
             
                combined_audio = []
         | 
| 1280 |  | 
| 1281 | 
            +
                for new_audio in streamer:
         | 
| 1282 | 
            +
                    print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
         | 
| 1283 | 
            +
                    combined_audio.extend(new_audio)
         | 
| 1284 |  | 
| 1285 | 
            +
                sf.write(combined_audio_path, combined_audio, sampling_rate)
         | 
| 1286 |  | 
| 1287 | 
             
                logging.debug(f"Combined audio saved to {combined_audio_path}")
         | 
| 1288 | 
             
                return combined_audio_path
         | 
|  | |
| 1356 |  | 
| 1357 | 
             
            demo.queue()
         | 
| 1358 | 
             
            demo.launch(share=True)
         | 
| 1359 | 
            +
             | 
 
			
