pm6six committed on
Commit
f5bee16
·
verified ·
1 Parent(s): 86ec6d2

Update audio.py

Browse files
Files changed (1) hide show
  1. audio.py +23 -36
audio.py CHANGED
@@ -1,51 +1,38 @@
 
1
  from io import BytesIO
2
  from urllib.request import urlopen
3
  import librosa
4
- from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor, pipeline
5
  import pyttsx3 # For text-to-speech
6
 
7
  # Load Qwen2Audio model and processor
8
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
9
  model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")
10
-
11
- # Initialize TTS engine
12
  tts_engine = pyttsx3.init()
13
 
14
- # Sample conversation with audio input
15
- conversation = [
16
- {"role": "user", "content": [
17
- {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
18
- ]},
19
- {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
20
- {"role": "user", "content": [
21
- {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
22
- ]},
23
- ]
24
 
25
- # Preprocess conversation
26
- text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
27
- audios = []
28
- for message in conversation:
29
- if isinstance(message["content"], list):
30
- for ele in message["content"]:
31
- if ele["type"] == "audio":
32
- audios.append(librosa.load(
33
- BytesIO(urlopen(ele['audio_url']).read()),
34
- sr=processor.feature_extractor.sampling_rate)[0]
35
- )
36
 
37
- # Prepare model inputs
38
- inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
39
- inputs.input_ids = inputs.input_ids.to("cuda")
 
40
 
41
- # Generate response
42
- generate_ids = model.generate(**inputs, max_length=256)
43
- generate_ids = generate_ids[:, inputs.input_ids.size(1):]
44
 
45
- # Decode response
46
- response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
47
- print("Model Response:", response)
48
 
49
- # Convert response to speech
50
- tts_engine.say(response)
51
- tts_engine.runAndWait()
 
 
1
+ import streamlit as st
2
  from io import BytesIO
3
  from urllib.request import urlopen
4
  import librosa
5
+ from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
6
  import pyttsx3 # For text-to-speech
7
 
8
  # Load Qwen2Audio model and processor
9
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
10
  model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")
 
 
11
  tts_engine = pyttsx3.init()
12
 
13
+ # Streamlit app UI
14
+ st.title("Text-to-Audio App")
15
+ st.text("This app generates audio from text input using Hugging Face models.")
 
 
 
 
 
 
 
16
 
17
+ # User input
18
+ text_input = st.text_area("Enter some text for the model:")
19
+ if st.button("Generate Audio"):
20
+ conversation = [{"role": "user", "content": text_input}]
 
 
 
 
 
 
 
21
 
22
+ # Preprocess conversation
23
+ text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
24
+ inputs = processor(text=text, return_tensors="pt", padding=True)
25
+ inputs.input_ids = inputs.input_ids.to("cuda")
26
 
27
+ # Generate response
28
+ generate_ids = model.generate(**inputs, max_length=256)
29
+ generate_ids = generate_ids[:, inputs.input_ids.size(1):]
30
 
31
+ # Decode response
32
+ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
33
+ st.text(f"Model Response: {response}")
34
 
35
+ # Convert response to speech
36
+ tts_engine.say(response)
37
+ tts_engine.runAndWait()
38
+ st.success("Audio generated and played!")