import io

import torch
import torchaudio
import gradio as gr
from flask import Flask, jsonify, render_template_string, request
from PIL import Image
from transformers import pipeline, BlipForConditionalGeneration, BlipProcessor
# Initialize the text-to-speech pipeline. The original file reused the BLIP
# captioning checkpoint here, which is not a TTS model; any Hugging Face
# text-to-speech checkpoint works — "facebook/mms-tts-eng" is assumed below.
tts_model_name = "facebook/mms-tts-eng"
tts = pipeline(task="text-to-speech", model=tts_model_name)
# Initialize Blip model for image captioning
model_id = "Kamonwan/blip-image-captioning-new"
blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
blip_processor = BlipProcessor.from_pretrained(model_id)
app = Flask(__name__)
def generate_caption(image):
    # Generate a caption for the image with the BLIP model
    inputs = blip_processor(images=image, return_tensors="pt")
    generated_ids = blip_model.generate(
        pixel_values=inputs.pixel_values,
        max_length=50,
        do_sample=True,
        temperature=0.8,
        top_k=40,
        top_p=0.9,
    )
    generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Convert the generated caption to audio with the TTS pipeline
    audio_output = tts(generated_caption)
    waveform = torch.tensor(audio_output["audio"])
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)  # torchaudio.save expects (channels, frames)
    audio_path = "generated_audio_resampled.wav"
    torchaudio.save(audio_path, waveform, audio_output["sampling_rate"])

    return generated_caption, audio_path
@app.route('/generate_caption', methods=['POST'])
def generate_caption_api():
    # The uploaded file arrives as raw bytes; decode it into a PIL image first
    image = Image.open(io.BytesIO(request.files['image'].read())).convert("RGB")
    generated_caption, audio_path = generate_caption(image)
    return jsonify({'generated_caption': generated_caption, 'audio_path': audio_path})
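# Example request against the endpoint above (assuming the Flask server is
# running locally on port 5000 and "photo.jpg" is an image file on disk):
#   curl -X POST -F "image=@photo.jpg" http://localhost:5000/generate_caption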
@app.route('/')
def index():
    # gr.Interface has no get_interface() method, so instead of templating the
    # interface into the page, embed the locally running Gradio app in an
    # iframe (http://127.0.0.1:7860 is Gradio's default local address).
    return render_template_string("""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Gradio Interface</title>
    </head>
    <body>
        <h1>Gradio Interface</h1>
        <iframe src="{{ gradio_url }}" width="100%" height="600" frameborder="0"></iframe>
    </body>
    </html>
    """, gradio_url="http://127.0.0.1:7860")
if __name__ == '__main__':
    # generate_caption returns two values (caption, audio path), so the outputs
    # are a Textbox and an Audio component; the original also listed a Button,
    # which has no matching return value.
    demo = gr.Interface(
        fn=generate_caption,
        inputs=gr.Image(type="pil"),
        outputs=[
            gr.Textbox(label="Generated caption"),
            gr.Audio(type="filepath", label="Generated Audio"),
        ],
        live=True,
    )
    # Start the Gradio interface without blocking so the Flask app can run too
    demo.launch(share=True, prevent_thread_lock=True)
    # Start Flask app
    app.run(host='0.0.0.0', port=5000)
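
# To run this Space locally (assumed environment):
#   pip install transformers torch torchaudio gradio flask pillow
#   python app.py
# The Gradio UI is served on http://127.0.0.1:7860 (plus a public share link,
# since share=True) and the Flask JSON API on http://0.0.0.0:5000.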