Pijush2023 committed on
Commit
9cbe52f
·
verified ·
1 Parent(s): c91c0bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +149 -48
app.py CHANGED
@@ -638,6 +638,14 @@ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
638
  from langchain.agents import Tool, initialize_agent
639
  from huggingface_hub import login
640
 
 
 
 
 
 
 
 
 
641
  # Check if the token is already set in the environment variables
642
  hf_token = os.getenv("HF_TOKEN")
643
 
@@ -952,7 +960,7 @@ def fetch_local_news():
952
  api_key = os.environ['SERP_API']
953
  url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
954
  response = requests.get(url)
955
- if response.status_code == 200:
956
  results = response.json().get("news_results", [])
957
  news_html = """
958
  <h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
@@ -1111,44 +1119,126 @@ def generate_audio_elevenlabs(text):
1111
  return None
1112
 
1113
  # Changes start here
1114
- # Function to chunk the text
1115
def chunk_text(text, max_length=200):
    """Split *text* into whitespace-delimited chunks of bounded length.

    Words are accumulated greedily; a chunk is closed as soon as adding the
    next word (plus one separator character) would push the running length
    above ``max_length``. Words are never split, so a single word longer than
    ``max_length`` ends up in a chunk of its own.

    Args:
        text: The input string. Any run of whitespace separates words.
        max_length: Upper bound for the running length of a chunk, where each
            word counts as ``len(word) + 1``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        cost = len(word) + 1  # the word plus one separating space
        # Only flush when something has been collected; otherwise an oversized
        # word at the start of a chunk would emit an empty "" chunk.
        if current_chunk and current_length + cost > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += cost

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
1133
-
1134
- # Function to process each chunk
1135
def process_chunk(chunk, model, tokenizer, device):
    """Generate speech for one text chunk and save it to a temporary WAV file.

    Args:
        chunk: The text to be spoken.
        model: Object whose ``generate`` accepts ``input_ids`` and
            ``prompt_input_ids`` and whose ``config.sampling_rate`` is used
            when writing the file.
        tokenizer: Callable returning an object with ``input_ids`` when
            invoked with ``return_tensors="pt"``.
        device: Device the token ids are moved to before generation.

    Returns:
        The path of the ``.wav`` file that was written. The file is created
        with ``delete=False``, so it is left on disk for the caller.
    """
    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."

    def encode(sentence):
        # Tokenise to tensors and move the ids to the target device.
        return tokenizer(sentence, return_tensors="pt").input_ids.to(device)

    voice_ids = encode(description)
    text_ids = encode(chunk)

    generated = model.generate(input_ids=voice_ids, prompt_input_ids=text_ids)
    waveform = generated.cpu().numpy().squeeze()

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
        temp_audio_path = wav_file.name
        sf.write(temp_audio_path, waveform, model.config.sampling_rate)

    logging.debug(f"Audio saved to {temp_audio_path}")
    return temp_audio_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1150
 
1151
- # Function to generate audio using Parler TTS
1152
  def generate_audio_parler_tts(text):
1153
  model_id = 'parler-tts/parler_tts_mini_v0.1'
1154
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -1161,28 +1251,38 @@ def generate_audio_parler_tts(text):
1161
  model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
1162
 
1163
  tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
1164
 
1165
- text_chunks = chunk_text(text)
1166
- audio_paths = []
 
1167
 
1168
- with concurrent.futures.ThreadPoolExecutor() as executor:
1169
- future_to_chunk = {executor.submit(process_chunk, chunk, model, tokenizer, device): chunk for chunk in text_chunks}
1170
 
1171
- for future in concurrent.futures.as_completed(future_to_chunk):
1172
- try:
1173
- audio_path = future.result()
1174
- audio_paths.append(audio_path)
1175
- except Exception as e:
1176
- logging.error(f"Error processing chunk: {e}")
 
 
 
 
 
 
 
 
1177
 
1178
  combined_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
1179
  combined_audio = []
1180
 
1181
- for audio_path in audio_paths:
1182
- data, samplerate = sf.read(audio_path)
1183
- combined_audio.extend(data)
1184
 
1185
- sf.write(combined_audio_path, combined_audio, samplerate)
1186
 
1187
  logging.debug(f"Combined audio saved to {combined_audio_path}")
1188
  return combined_audio_path
@@ -1256,3 +1356,4 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
1256
 
1257
  demo.queue()
1258
  demo.launch(share=True)
 
 
638
  from langchain.agents import Tool, initialize_agent
639
  from huggingface_hub import login
640
 
641
+ from pydub import AudioSegment
642
+ import io
643
+ import math
644
+ from threading import Thread
645
+ from queue import Queue
646
+ from transformers.generation.streamers import BaseStreamer
647
+ import numpy as np
648
+
649
  # Check if the token is already set in the environment variables
650
  hf_token = os.getenv("HF_TOKEN")
651
 
 
960
  api_key = os.environ['SERP_API']
961
  url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
962
  response = requests.get(url)
963
+ if response.status_code == 200:
964
  results = response.json().get("news_results", [])
965
  news_html = """
966
  <h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
 
1119
  return None
1120
 
1121
  # Changes start here
1122
class ParlerTTSStreamer(BaseStreamer):
    """Streamer that decodes generated tokens to audio and queues the chunks.

    Token ids passed to :meth:`put` are accumulated in ``token_cache``. Every
    time the cached length is a multiple of ``play_steps`` the whole cache is
    decoded with the model's audio encoder and the not-yet-emitted part of
    the waveform (minus ``stride`` trailing samples) is placed on an internal
    queue. Iterating over the streamer yields those NumPy chunks until
    :meth:`end` has queued the stop signal.

    Args:
        model: Source of ``decoder``, ``audio_encoder`` and
            ``generation_config``.
        device: Stored on ``self.device``; defaults to ``model.device``.
        play_steps: A chunk is emitted whenever the number of cached token
            steps is a multiple of this value.
        stride: Number of trailing audio samples withheld from each
            intermediate chunk. When ``None`` it is derived from the audio
            encoder's sampling rate and frame rate.
        timeout: Timeout forwarded to the queue's ``put``/``get`` calls;
            ``None`` blocks indefinitely.
    """

    def __init__(
        self,
        model: ParlerTTSForConditionalGeneration,
        device: Optional[str] = None,
        play_steps: Optional[int] = 10,
        stride: Optional[int] = None,
        timeout: Optional[float] = None,
    ):
        self.decoder = model.decoder
        self.audio_encoder = model.audio_encoder
        self.generation_config = model.generation_config
        self.device = device if device is not None else model.device

        self.play_steps = play_steps
        if stride is not None:
            self.stride = stride
        else:
            # Audio samples produced per token frame.
            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
            self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
        self.token_cache = None
        # Index of the first audio sample that has not been emitted yet.
        self.to_yield = 0

        self.audio_queue = Queue()
        # Sentinel placed on the queue after the last chunk.
        self.stop_signal = None
        self.timeout = timeout

    def apply_delay_pattern_mask(self, input_ids):
        """Decode the cached token ids into a NumPy array of audio values.

        Builds and applies the decoder's delay pattern mask, drops the
        positions the mask marks as BOS/pad, and decodes the remaining codes
        with the audio encoder.
        """
        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
            input_ids[:, :1],
            bos_token_id=self.generation_config.bos_token_id,
            pad_token_id=self.generation_config.decoder_start_token_id,
            max_length=input_ids.shape[-1],
        )
        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)

        # Keep only positions that the mask does not mark as BOS or pad.
        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
        # Add a leading dimension (presumably the frame axis expected by
        # the audio encoder - TODO confirm).
        input_ids = input_ids[None, ...]

        input_ids = input_ids.to(self.audio_encoder.device)

        # Special tokens still present in the codes force the filtered path.
        decode_sequentially = (
            self.generation_config.bos_token_id in input_ids
            or self.generation_config.pad_token_id in input_ids
            or self.generation_config.eos_token_id in input_ids
        )
        if not decode_sequentially:
            output_values = self.audio_encoder.decode(
                input_ids,
                audio_scales=[None],
            )
        else:
            sample = input_ids[:, 0]
            # Drop every step containing an id outside the codebook range.
            sample_mask = (sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0
            sample = sample[:, :, sample_mask]
            output_values = self.audio_encoder.decode(sample[None, ...], [None])

        audio_values = output_values.audio_values[0, 0]
        return audio_values.cpu().float().numpy()

    def put(self, value):
        """Cache newly generated token ids and emit audio when a chunk is due.

        Intended to be called by ``model.generate`` through the streamer
        protocol (see the Transformers streamer docs).

        Raises:
            ValueError: If ``value`` implies a batch size greater than 1.
        """
        batch_size = value.shape[0] // self.decoder.num_codebooks
        if batch_size > 1:
            raise ValueError("ParlerTTSStreamer only supports batch size 1")

        if self.token_cache is None:
            self.token_cache = value
        else:
            self.token_cache = torch.cat([self.token_cache, value[:, None]], dim=-1)

        if self.token_cache.shape[-1] % self.play_steps == 0:
            audio_values = self.apply_delay_pattern_mask(self.token_cache)
            # Use an explicit end index instead of slicing with ``-stride``:
            # with a stride of 0 that slice is empty, yet the cursor would
            # still advance and the audio would be lost. The clamp keeps the
            # cursor from moving backwards when fewer than ``stride`` new
            # samples exist.
            end = max(len(audio_values) - self.stride, self.to_yield)
            self.on_finalized_audio(audio_values[self.to_yield:end])
            self.to_yield = end

    def end(self):
        """Flush the remaining audio and signal the end of the stream."""
        if self.token_cache is not None:
            audio_values = self.apply_delay_pattern_mask(self.token_cache)
        else:
            # Nothing was generated: slicing below yields an empty chunk.
            audio_values = np.zeros(self.to_yield)

        self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)

    def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
        """Queue an audio chunk, followed by the stop signal if ``stream_end``."""
        self.audio_queue.put(audio, timeout=self.timeout)
        if stream_end:
            self.audio_queue.put(self.stop_signal, timeout=self.timeout)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next queued chunk; stop once the stop signal is read."""
        value = self.audio_queue.get(timeout=self.timeout)
        # Identity test: the sentinel is ``None`` and audio chunks are arrays,
        # so this never triggers an element-wise comparison.
        if value is self.stop_signal:
            raise StopIteration()
        return value
1220
+
1221
def numpy_to_mp3(audio_array, sampling_rate):
    """Encode a mono audio array as MP3 and return the encoded bytes.

    Floating-point input is peak-normalised to the int16 range and cast to
    ``int16`` first; other dtypes are passed through unchanged, with the
    sample width taken from the dtype's item size.

    Args:
        audio_array: NumPy array of audio samples (treated as one channel).
        sampling_rate: Frame rate handed to ``AudioSegment``.

    Returns:
        The MP3 data as ``bytes`` (exported at a 320k bitrate).
    """
    if np.issubdtype(audio_array.dtype, np.floating):
        # Guard the normalisation: an empty array makes ``np.max`` raise, and
        # an all-zero (silent) array would divide by zero and turn into NaN.
        max_val = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        if max_val > 0:
            audio_array = (audio_array / max_val) * 32767
        audio_array = audio_array.astype(np.int16)

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )

    # The context manager closes the buffer even if the export fails.
    with io.BytesIO() as mp3_io:
        audio_segment.export(mp3_io, format="mp3", bitrate="320k")
        return mp3_io.getvalue()
1241
 
 
1242
  def generate_audio_parler_tts(text):
1243
  model_id = 'parler-tts/parler_tts_mini_v0.1'
1244
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
1251
  model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
1252
 
1253
  tokenizer = AutoTokenizer.from_pretrained(model_id)
1254
+ sampling_rate = model.audio_encoder.config.sampling_rate
1255
+ frame_rate = model.audio_encoder.config.frame_rate
1256
 
1257
+ description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
1258
+ play_steps_in_s = 2.0
1259
+ play_steps = int(frame_rate * play_steps_in_s)
1260
 
1261
+ streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
 
1262
 
1263
+ inputs = tokenizer(description, return_tensors="pt").to(device)
1264
+ prompt = tokenizer(text, return_tensors="pt").to(device)
1265
+
1266
+ generation_kwargs = dict(
1267
+ input_ids=inputs.input_ids,
1268
+ prompt_input_ids=prompt.input_ids,
1269
+ streamer=streamer,
1270
+ do_sample=True,
1271
+ temperature=1.0,
1272
+ min_new_tokens=10,
1273
+ )
1274
+
1275
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
1276
+ thread.start()
1277
 
1278
  combined_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
1279
  combined_audio = []
1280
 
1281
+ for new_audio in streamer:
1282
+ print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
1283
+ combined_audio.extend(new_audio)
1284
 
1285
+ sf.write(combined_audio_path, combined_audio, sampling_rate)
1286
 
1287
  logging.debug(f"Combined audio saved to {combined_audio_path}")
1288
  return combined_audio_path
 
1356
 
1357
  demo.queue()
1358
  demo.launch(share=True)
1359
+