Update app.py

app.py CHANGED
@@ -638,6 +638,14 @@ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
 from langchain.agents import Tool, initialize_agent
 from huggingface_hub import login
 
+from pydub import AudioSegment
+import io
+import math
+from threading import Thread
+from queue import Queue
+from transformers.generation.streamers import BaseStreamer
+import numpy as np
+
 # Check if the token is already set in the environment variables
 hf_token = os.getenv("HF_TOKEN")
 
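Review note: the new `from transformers.generation.streamers import BaseStreamer` import pulls in the minimal interface that `model.generate` drives: generate calls `put(token_ids)` after each decoding step and `end()` once when decoding finishes. The `ParlerTTSStreamer` added later in this diff turns those callbacks into a blocking queue of decoded audio chunks. Paraphrased sketch of the contract (for orientation only, not part of this commit):

    class BaseStreamer:
        def put(self, value):
            # called by .generate() with each newly decoded batch of token ids
            raise NotImplementedError()

        def end(self):
            # called once when generation is finished
            raise NotImplementedError()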
@@ -952,7 +960,7 @@ def fetch_local_news():
     api_key = os.environ['SERP_API']
     url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
     response = requests.get(url)
-    if response.status_code == 200:
+    if response.status_code == 200:
         results = response.json().get("news_results", [])
         news_html = """
        <h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
@@ -1111,44 +1119,126 @@ def generate_audio_elevenlabs(text):
     return None
 
 # Changes start here
-    audio_arr = generation.cpu().numpy().squeeze()
-# Function to generate audio using Parler TTS
+class ParlerTTSStreamer(BaseStreamer):
+    def __init__(
+        self,
+        model: ParlerTTSForConditionalGeneration,
+        device: Optional[str] = None,
+        play_steps: Optional[int] = 10,
+        stride: Optional[int] = None,
+        timeout: Optional[float] = None,
+    ):
+        self.decoder = model.decoder
+        self.audio_encoder = model.audio_encoder
+        self.generation_config = model.generation_config
+        self.device = device if device is not None else model.device
+
+        self.play_steps = play_steps
+        if stride is not None:
+            self.stride = stride
+        else:
+            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
+            self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
+        self.token_cache = None
+        self.to_yield = 0
+
+        self.audio_queue = Queue()
+        self.stop_signal = None
+        self.timeout = timeout
+
+    def apply_delay_pattern_mask(self, input_ids):
+        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
+            input_ids[:, :1],
+            bos_token_id=self.generation_config.bos_token_id,
+            pad_token_id=self.generation_config.decoder_start_token_id,
+            max_length=input_ids.shape[-1],
+        )
+        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
+
+        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
+        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
+        input_ids = input_ids[None, ...]
+
+        input_ids = input_ids.to(self.audio_encoder.device)
+
+        decode_sequentially = (
+            self.generation_config.bos_token_id in input_ids
+            or self.generation_config.pad_token_id in input_ids
+            or self.generation_config.eos_token_id in input_ids
+        )
+        if not decode_sequentially:
+            output_values = self.audio_encoder.decode(
+                input_ids,
+                audio_scales=[None],
+            )
+        else:
+            sample = input_ids[:, 0]
+            sample_mask = (sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0
+            sample = sample[:, :, sample_mask]
+            output_values = self.audio_encoder.decode(sample[None, ...], [None])
+
+        audio_values = output_values.audio_values[0, 0]
+        return audio_values.cpu().float().numpy()
+
+    def put(self, value):
+        batch_size = value.shape[0] // self.decoder.num_codebooks
+        if batch_size > 1:
+            raise ValueError("ParlerTTSStreamer only supports batch size 1")
+
+        if self.token_cache is None:
+            self.token_cache = value
+        else:
+            self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
+
+        if self.token_cache.shape[-1] % self.play_steps == 0:
+            audio_values = self.apply_delay_pattern_mask(self.token_cache)
+            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
+            self.to_yield += len(audio_values) - self.to_yield - self.stride
+
+    def end(self):
+        if self.token_cache is not None:
+            audio_values = self.apply_delay_pattern_mask(self.token_cache)
+        else:
+            audio_values = np.zeros(self.to_yield)
+
+        self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
+
+    def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
+        self.audio_queue.put(audio, timeout=self.timeout)
+        if stream_end:
+            self.audio_queue.put(self.stop_signal, timeout=self.timeout)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        value = self.audio_queue.get(timeout=self.timeout)
+        if not isinstance(value, np.ndarray) and value == self.stop_signal:
+            raise StopIteration()
+        else:
+            return value
+
+def numpy_to_mp3(audio_array, sampling_rate):
+    if np.issubdtype(audio_array.dtype, np.floating):
+        max_val = np.max(np.abs(audio_array))
+        audio_array = (audio_array / max_val) * 32767
+        audio_array = audio_array.astype(np.int16)
+
+    audio_segment = AudioSegment(
+        audio_array.tobytes(),
+        frame_rate=sampling_rate,
+        sample_width=audio_array.dtype.itemsize,
+        channels=1
+    )
+
+    mp3_io = io.BytesIO()
+    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
+
+    mp3_bytes = mp3_io.getvalue()
+    mp3_io.close()
+
+    return mp3_bytes
 
 def generate_audio_parler_tts(text):
     model_id = 'parler-tts/parler_tts_mini_v0.1'
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
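Review note: `numpy_to_mp3` is defined in this hunk but nothing in the diff calls it yet; it looks like groundwork for streaming compressed chunks to the client instead of writing one WAV file (pydub's MP3 export also assumes ffmpeg is available in the Space). A minimal sketch of how it could pair with the streamer — the `stream_audio_parler_tts` helper is an illustration, not part of this commit:

    # Hypothetical generator (not in this commit): yields MP3-encoded chunks
    # as model.generate produces them, reusing the pieces added above.
    def stream_audio_parler_tts(text, description, model, tokenizer, device, play_steps):
        sampling_rate = model.audio_encoder.config.sampling_rate
        streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
        inputs = tokenizer(description, return_tensors="pt").to(device)
        prompt = tokenizer(text, return_tensors="pt").to(device)
        thread = Thread(target=model.generate, kwargs=dict(
            input_ids=inputs.input_ids,
            prompt_input_ids=prompt.input_ids,
            streamer=streamer,
            do_sample=True,
            temperature=1.0,
            min_new_tokens=10,
        ))
        thread.start()
        for new_audio in streamer:  # blocks on the queue until put()/end() deliver a chunk
            yield numpy_to_mp3(new_audio, sampling_rate=sampling_rate)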
@@ -1161,28 +1251,38 @@ def generate_audio_parler_tts(text):
     model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
-    future_to_chunk = {executor.submit(process_chunk, chunk, model, tokenizer, device): chunk for chunk in text_chunks}
+    sampling_rate = model.audio_encoder.config.sampling_rate
+    frame_rate = model.audio_encoder.config.frame_rate
 
+    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+    play_steps_in_s = 2.0
+    play_steps = int(frame_rate * play_steps_in_s)
 
+    streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
 
+    inputs = tokenizer(description, return_tensors="pt").to(device)
+    prompt = tokenizer(text, return_tensors="pt").to(device)
+
+    generation_kwargs = dict(
+        input_ids=inputs.input_ids,
+        prompt_input_ids=prompt.input_ids,
+        streamer=streamer,
+        do_sample=True,
+        temperature=1.0,
+        min_new_tokens=10,
+    )
+
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
 
     combined_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
     combined_audio = []
 
+    for new_audio in streamer:
+        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
+        combined_audio.extend(new_audio)
 
+    sf.write(combined_audio_path, combined_audio, sampling_rate)
 
     logging.debug(f"Combined audio saved to {combined_audio_path}")
     return combined_audio_path
@@ -1256,3 +1356,4 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
 
 demo.queue()
 demo.launch(share=True)
+
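Review note: `generate_audio_parler_tts` now blocks until the generation thread finishes, collects every streamed chunk into a plain Python list, and writes a single temp WAV (soundfile converts the list to an array internally). A quick smoke test, assuming the Space's dependencies (torch, soundfile, parler_tts) are installed:

    # Hypothetical check, not part of this commit.
    if __name__ == "__main__":
        path = generate_audio_parler_tts("Hello from Birmingham!")
        print("Combined audio written to", path)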