Update app.py
app.py CHANGED

@@ -638,6 +638,14 @@ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
 from langchain.agents import Tool, initialize_agent
 from huggingface_hub import login
 
+from pydub import AudioSegment
+import io
+import math
+from threading import Thread
+from queue import Queue
+from transformers.generation.streamers import BaseStreamer
+import numpy as np
+
 # Check if the token is already set in the environment variables
 hf_token = os.getenv("HF_TOKEN")
 
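The added imports are the scaffolding for token-by-token audio streaming: model.generate will run on a background Thread and hand decoded audio to the consumer through a Queue, with a BaseStreamer subclass as the bridge between the two. A minimal sketch of that producer/consumer pattern in isolation (illustrative names only, not code from this commit):

    # Producer/consumer sketch of the streaming pattern these imports enable.
    # In app.py the producer is model.generate and the bridge is ParlerTTSStreamer.
    from queue import Queue
    from threading import Thread

    audio_queue = Queue()
    STOP = object()  # sentinel marking end of stream

    def producer():
        # stands in for model.generate(...) pushing chunks via the streamer
        for chunk in (b"chunk-1", b"chunk-2", b"chunk-3"):
            audio_queue.put(chunk)
        audio_queue.put(STOP)

    Thread(target=producer).start()

    # consumer: iterate as chunks arrive
    while (item := audio_queue.get()) is not STOP:
        print(f"received {item!r}")
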
@@ -952,7 +960,7 @@ def fetch_local_news():
     api_key = os.environ['SERP_API']
     url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
     response = requests.get(url)
-    if response.status_code == 200:
+    if response.status_code == 200:
         results = response.json().get("news_results", [])
         news_html = """
         <h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
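This hunk only branches on status_code == 200; anything else, including a network error, falls through silently. A slightly more defensive variant (my sketch, not part of the commit) would be:

    # Hypothetical hardening of fetch_local_news(); the commit itself only
    # checks for a 200 response.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        results = response.json().get("news_results", [])
    except requests.RequestException as e:
        logging.error(f"SerpAPI request failed: {e}")
        results = []
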
@@ -1111,44 +1119,126 @@ def generate_audio_elevenlabs(text):
     return None
 
 # Changes start here
-        audio_arr = generation.cpu().numpy().squeeze()
-# Function to generate audio using Parler TTS
+class ParlerTTSStreamer(BaseStreamer):
+    def __init__(
+        self,
+        model: ParlerTTSForConditionalGeneration,
+        device: Optional[str] = None,
+        play_steps: Optional[int] = 10,
+        stride: Optional[int] = None,
+        timeout: Optional[float] = None,
+    ):
+        self.decoder = model.decoder
+        self.audio_encoder = model.audio_encoder
+        self.generation_config = model.generation_config
+        self.device = device if device is not None else model.device
+
+        self.play_steps = play_steps
+        if stride is not None:
+            self.stride = stride
+        else:
+            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
+            self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
+        self.token_cache = None
+        self.to_yield = 0
+
+        self.audio_queue = Queue()
+        self.stop_signal = None
+        self.timeout = timeout
+
+    def apply_delay_pattern_mask(self, input_ids):
+        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
+            input_ids[:, :1],
+            bos_token_id=self.generation_config.bos_token_id,
+            pad_token_id=self.generation_config.decoder_start_token_id,
+            max_length=input_ids.shape[-1],
+        )
+        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
+
+        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
+        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
+        input_ids = input_ids[None, ...]
+
+        input_ids = input_ids.to(self.audio_encoder.device)
+
+        decode_sequentially = (
+            self.generation_config.bos_token_id in input_ids
+            or self.generation_config.pad_token_id in input_ids
+            or self.generation_config.eos_token_id in input_ids
+        )
+        if not decode_sequentially:
+            output_values = self.audio_encoder.decode(
+                input_ids,
+                audio_scales=[None],
+            )
+        else:
+            sample = input_ids[:, 0]
+            sample_mask = (sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0
+            sample = sample[:, :, sample_mask]
+            output_values = self.audio_encoder.decode(sample[None, ...], [None])
+
+        audio_values = output_values.audio_values[0, 0]
+        return audio_values.cpu().float().numpy()
+
+    def put(self, value):
+        batch_size = value.shape[0] // self.decoder.num_codebooks
+        if batch_size > 1:
+            raise ValueError("ParlerTTSStreamer only supports batch size 1")
+
+        if self.token_cache is None:
+            self.token_cache = value
+        else:
+            self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
+
+        if self.token_cache.shape[-1] % self.play_steps == 0:
+            audio_values = self.apply_delay_pattern_mask(self.token_cache)
+            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
+            self.to_yield += len(audio_values) - self.to_yield - self.stride
+
+    def end(self):
+        if self.token_cache is not None:
+            audio_values = self.apply_delay_pattern_mask(self.token_cache)
+        else:
+            audio_values = np.zeros(self.to_yield)
+        self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
+
+    def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
+        self.audio_queue.put(audio, timeout=self.timeout)
+        if stream_end:
+            self.audio_queue.put(self.stop_signal, timeout=self.timeout)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        value = self.audio_queue.get(timeout=self.timeout)
+        if not isinstance(value, np.ndarray) and value == self.stop_signal:
+            raise StopIteration()
+        else:
+            return value
+
+def numpy_to_mp3(audio_array, sampling_rate):
+    if np.issubdtype(audio_array.dtype, np.floating):
+        max_val = np.max(np.abs(audio_array))
+        audio_array = (audio_array / max_val) * 32767
+        audio_array = audio_array.astype(np.int16)
+
+    audio_segment = AudioSegment(
+        audio_array.tobytes(),
+        frame_rate=sampling_rate,
+        sample_width=audio_array.dtype.itemsize,
+        channels=1
+    )
+
+    mp3_io = io.BytesIO()
+    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
+
+    mp3_bytes = mp3_io.getvalue()
+    mp3_io.close()
+
+    return mp3_bytes
 
 def generate_audio_parler_tts(text):
     model_id = 'parler-tts/parler_tts_mini_v0.1'
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
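Note that numpy_to_mp3 is added here but never called anywhere in this diff; it presumably anticipates pushing compressed chunks to the browser rather than writing a single WAV file. A hypothetical pairing with the streamer (a sketch assuming model, tokenizer, device, play_steps, and sampling_rate are set up as in generate_audio_parler_tts below; stream_mp3_chunks is my name, not the app's):

    # Hypothetical generator: yield one MP3-encoded chunk per streamer
    # iteration, e.g. to feed a streaming audio output instead of a temp WAV.
    def stream_mp3_chunks(text, model, tokenizer, device, play_steps, sampling_rate):
        streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
        description = "A female speaker with clear audio quality."
        inputs = tokenizer(description, return_tensors="pt").to(device)
        prompt = tokenizer(text, return_tensors="pt").to(device)
        thread = Thread(
            target=model.generate,
            kwargs=dict(
                input_ids=inputs.input_ids,
                prompt_input_ids=prompt.input_ids,
                streamer=streamer,
                do_sample=True,
            ),
        )
        thread.start()
        for new_audio in streamer:  # float32 numpy chunks from the queue
            yield numpy_to_mp3(new_audio, sampling_rate)
        thread.join()
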
@@ -1161,28 +1251,38 @@ def generate_audio_parler_tts(text):
     model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
+    sampling_rate = model.audio_encoder.config.sampling_rate
+    frame_rate = model.audio_encoder.config.frame_rate
 
+    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+    play_steps_in_s = 2.0
+    play_steps = int(frame_rate * play_steps_in_s)
 
+    streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
 
-    future_to_chunk = {executor.submit(process_chunk, chunk, model, tokenizer, device): chunk for chunk in text_chunks}
+    inputs = tokenizer(description, return_tensors="pt").to(device)
+    prompt = tokenizer(text, return_tensors="pt").to(device)
+
+    generation_kwargs = dict(
+        input_ids=inputs.input_ids,
+        prompt_input_ids=prompt.input_ids,
+        streamer=streamer,
+        do_sample=True,
+        temperature=1.0,
+        min_new_tokens=10,
+    )
+
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
 
     combined_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
     combined_audio = []
 
+    for new_audio in streamer:
+        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
+        combined_audio.extend(new_audio)
 
+    sf.write(combined_audio_path, combined_audio, sampling_rate)
 
     logging.debug(f"Combined audio saved to {combined_audio_path}")
     return combined_audio_path
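With play_steps_in_s = 2.0, the streamer flushes roughly every two seconds of generated audio (for example, at a codec frame rate of 86 Hz that is play_steps = int(86 * 2.0) = 172 decoder steps per chunk). A quick way to exercise the rewritten function (sketch only; the path is whatever NamedTemporaryFile chose):

    # Sketch: call the new streaming TTS function and inspect the result.
    wav_path = generate_audio_parler_tts("Hello from Birmingham!")
    print(f"Combined audio written to {wav_path}")
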
@@ -1256,3 +1356,4 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
 
 demo.queue()
 demo.launch(share=True)
+