"""Streamlit app that turns an uploaded image into a narrated story.

Stages: image -> caption -> short story -> synthesized speech.
"""
import os
import tempfile

import soundfile as sf
import streamlit as st
from transformers import pipeline


# Cache models to avoid reloading on every interaction
@st.cache_resource
def load_models():
    """Build the three Hugging Face pipelines used by the app.

    Returns:
        A dict with the keys ``"image_to_text"``, ``"story_gen"`` and
        ``"text_to_speech"``, each mapping to a ready-to-call pipeline.
    """
    return {
        "image_to_text": pipeline(
            "image-to-text", model="Salesforce/blip-image-captioning-base"
        ),
        "story_gen": pipeline(
            "text-generation", model="Qwen/Qwen2.5-1.5B-Instruct"
        ),
        "text_to_speech": pipeline(
            "text-to-speech", model="facebook/mms-tts-eng"
        ),
    }


# function part
def img2text(url, processor):
    """Return the caption produced by ``processor`` for the image at ``url``.

    Args:
        url: Image location handed straight to the pipeline (``main`` passes
            a local file path).
        processor: Callable image-to-text pipeline; its first result must
            carry a ``"generated_text"`` entry.
    """
    return processor(url)[0]["generated_text"]


def text2story(text, generator, max_length=150):
    """Generate a short story seeded by ``text``.

    Args:
        text: Scenario description (here, the image caption).
        generator: Callable text-generation pipeline.
        max_length: Forwarded to the pipeline as ``max_length``.

    Returns:
        The generated story, truncated to at most 1000 characters.
    """
    prompt = f"Create a story based on: {text}"
    story = generator(
        prompt,
        max_length=max_length,
        temperature=0.7,
        do_sample=True,
        # By default the pipeline echoes the prompt in front of the
        # completion; without this the instruction sentence would be shown
        # and read aloud as part of the story.
        return_full_text=False,
    )[0]["generated_text"]
    return story[:1000]  # Safety truncation


def text2audio(text, synthesizer):
    """Synthesize speech for ``text`` and return the pipeline's raw output.

    ``main`` reads the ``"audio"`` and ``"sampling_rate"`` entries of the
    returned mapping.
    """
    return synthesizer(text)


def _write_temp_file(data, suffix):
    """Write ``data`` to a fresh, uniquely named temp file; return its path."""
    fd, path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, "wb") as handle:
            handle.write(data)
    except Exception:
        # Do not leave a half-written file behind if the write fails.
        os.remove(path)
        raise
    return path


def _remove_quietly(path):
    """Delete ``path`` if it is set and still exists."""
    if path and os.path.exists(path):
        os.remove(path)


# main part
def main():
    """Render the page and run caption -> story -> audio for an upload."""
    st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
    st.header("Turn Your Image to Audio Story")

    # Load models once
    models = load_models()

    uploaded_file = st.file_uploader(
        "Select an Image...", type=["jpg", "jpeg", "png"]
    )
    if uploaded_file is None:
        return

    # NOTE(review): recent Streamlit releases deprecate `use_column_width`
    # in favour of `use_container_width` - confirm against the pinned version.
    st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)

    # Use unique temp files instead of names derived from the client-supplied
    # filename: avoids path tricks and collisions between concurrent sessions.
    suffix = os.path.splitext(uploaded_file.name)[1]
    image_path = _write_temp_file(uploaded_file.getvalue(), suffix)
    audio_path = None
    try:
        # Stage 1: Image to Text
        with st.spinner('Generating caption...'):
            scenario = img2text(image_path, models["image_to_text"])
        st.subheader("Image Caption")
        st.write(scenario)

        # Stage 2: Text to Story
        with st.spinner('Creating story...'):
            story = text2story(scenario, models["story_gen"])
        st.subheader("Generated Story")
        st.write(story)

        # Stage 3: Story to Audio
        with st.spinner('Generating audio...'):
            audio = text2audio(story, models["text_to_speech"])
            samples = audio["audio"]
            # The TTS pipeline may return a (1, n_samples) array; soundfile
            # would read that as one frame with n channels, so drop
            # singleton dimensions first.
            if getattr(samples, "ndim", 1) > 1:
                samples = samples.squeeze()
            fd, audio_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            sf.write(audio_path, samples, samplerate=audio["sampling_rate"])

        st.subheader("Audio Story")
        # Hand Streamlit the bytes rather than the path, since the file is
        # removed as soon as this run finishes.
        with open(audio_path, "rb") as wav_file:
            st.audio(wav_file.read(), format="audio/wav")
    finally:
        # Clean up temp files even when a stage raised.
        _remove_quietly(image_path)
        _remove_quietly(audio_path)


if __name__ == "__main__":
    main()