Nepjune committed
Commit 829b25c · verified · 1 Parent(s): b81529b

Update app.py

Files changed (1)
  1. app.py +28 -22
app.py CHANGED
@@ -1,32 +1,38 @@
  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
- import torchaudio
- from torchaudio.transforms import Resample
- import torch

- # Create the TTS model
- model_name = "facebook/tts-crdnn-baker-softmax"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

- # Function to convert text to speech
- def text_to_speech(text, output_path="generated_audio.wav"):
-     input_ids = tokenizer.encode(text, return_tensors="pt", max_length=150, truncation=True)
-     with torch.no_grad():
-         audio = model.generate(input_ids)
-     waveform = torchaudio.transforms.Resample(48_000, 24_000)(audio.squeeze().numpy())
-     torchaudio.save(output_path, waveform, 24_000)

- def play_audio(audio_path):
-     gr.audio(audio_path, type="player")

- # Create a Gradio interface with an image input, a textbox output, a button, and an audio player
  demo = gr.Interface(
-     fn=text_to_speech,
-     inputs=gr.Textbox(label="Enter Text"),
      outputs=[
-         gr.Audio("audio", type="player"),
-         gr.Button("Convert to Audio", play_audio),
      ],
      live=True  # make Gradio run in non-blocking mode
  )
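As committed, the removed version could not have produced audio: no "facebook/tts-crdnn-baker-softmax" checkpoint appears to exist on the Hub, AutoModelForSeq2SeqLM.generate() returns token IDs rather than a waveform, Resample is then applied to a NumPy array of those IDs, and gr.audio is not a Gradio API. A minimal working sketch of text_to_speech, assuming the VITS checkpoint facebook/mms-tts-eng as a stand-in, might look like this:

import torch
import torchaudio
from transformers import VitsModel, AutoTokenizer

# Assumed stand-in checkpoint; the one named in the commit does not resolve.
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

def text_to_speech(text, output_path="generated_audio.wav"):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        # VITS produces the waveform directly, shape (1, num_samples).
        waveform = model(**inputs).waveform
    # torchaudio.save expects a (channels, time) tensor and the model's real sample rate.
    torchaudio.save(output_path, waveform, model.config.sampling_rate)
    return output_path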
 
  import gradio as gr
+ from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

+ # Initialize Blip model for image captioning
+ model_id = "dblasko/blip-dalle3-img2prompt"
+ blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
+ blip_processor = BlipProcessor.from_pretrained(model_id)

+ # Initialize TTS model from Hugging Face
+ tts_model_name = "tts-mozilla/tts-ljspeech-multilingual"
+ tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_name)
+ tts_model = AutoModelForSeq2SeqLM.from_pretrained(tts_model_name)
+ tts = pipeline(task="text2speech", model=tts_model, tokenizer=tts_tokenizer)

+ def generate_caption(image):
+     # Generate caption from image using Blip model
+     inputs = blip_processor(images=image, return_tensors="pt")
+     pixel_values = inputs.pixel_values
+     generated_ids = blip_model.generate(pixel_values=pixel_values, max_length=50)
+     generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]
+
+     # Use TTS model to convert generated caption to audio
+     audio_output = tts(generated_caption)
+     audio_output.save_to_path("generated_audio.mp3")
+
+     return generated_caption, "generated_audio.mp3"
+
+ # Create a Gradio interface with an image input, a textbox output, a button, and an audio player
  demo = gr.Interface(
+     fn=generate_caption,
+     inputs=gr.Image(),
      outputs=[
+         gr.Textbox(label="Generated caption"),
+         gr.Button("Convert to Audio"),
+         gr.Audio(type="player", label="Generated Audio")
      ],
      live=True  # make Gradio run in non-blocking mode
  )
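The added version is closer but still not runnable: the "tts-mozilla/tts-ljspeech-multilingual" checkpoint and the "text2speech" pipeline task do not appear to exist, pipeline outputs have no save_to_path() method, the sampling arguments passed to batch_decode belong on generate(), gr.Button does nothing as an output component (and generate_caption returns two values for three outputs), and type="player" is not a valid gr.Audio argument. A sketch of the intended caption-then-speak app, keeping the BLIP checkpoint from the commit and reusing the assumed facebook/mms-tts-eng stand-in from above:

import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, VitsModel, AutoTokenizer

# Captioning model as named in the commit.
model_id = "dblasko/blip-dalle3-img2prompt"
blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
blip_processor = BlipProcessor.from_pretrained(model_id)

# Assumed TTS stand-in; the checkpoint named in the commit does not resolve.
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

def generate_caption(image):
    # Caption the image with BLIP.
    inputs = blip_processor(images=image, return_tensors="pt")
    generated_ids = blip_model.generate(pixel_values=inputs.pixel_values, max_length=50)
    caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Synthesize speech for the caption; VITS returns the waveform directly.
    tts_inputs = tts_tokenizer(caption, return_tensors="pt")
    with torch.no_grad():
        waveform = tts_model(**tts_inputs).waveform

    # gr.Audio accepts a (sample_rate, numpy_array) tuple, so no file is written.
    return caption, (tts_model.config.sampling_rate, waveform.squeeze().numpy())

demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated caption"),
        gr.Audio(label="Generated audio"),
    ],
)

if __name__ == "__main__":
    demo.launch()

Returning the (sample_rate, waveform) tuple lets gr.Audio play the result without an intermediate MP3, and dropping live=True runs the two model passes once per Submit click rather than on every input change.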