Spaces:
Sleeping
Sleeping
File size: 3,246 Bytes
d4ff40b 348f72a d4ff40b 348f72a d4ff40b 2b97f42 d4ff40b 348f72a d4ff40b 348f72a d4ff40b 348f72a 8d03f81 348f72a 8d03f81 d4ff40b 7d9cbb3 d4ff40b 7d9cbb3 348f72a d4ff40b 348f72a d4ff40b cd9cb87 d4ff40b cd9cb87 d4ff40b 8d03f81 d4ff40b cd9cb87 d4ff40b cd9cb87 d4ff40b cd9cb87 d4ff40b cd9cb87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# import part
import os
import re
import string
import tempfile
import textwrap

import numpy as np
import soundfile as sf
import streamlit as st
from PIL import Image
from transformers import pipeline
# Build the three Hugging Face pipelines behind Streamlit's resource cache.
@st.cache_resource
def load_pipelines():
    """Create the captioning, story-generation and text-to-speech pipelines.

    Returns:
        A ``(captioner, storyer, tts)`` tuple, in that order. The function is
        wrapped in ``st.cache_resource``, so the result is cached by Streamlit.
    """
    return (
        pipeline("image-to-text", model="Salesforce/blip-image-captioning-large"),
        pipeline("text-generation", model="gpt2"),
        pipeline("text-to-speech", model="facebook/mms-tts-eng"),
    )
# Bind the pipelines at module level; generate_content() reads these three
# names as globals.
captioner, storyer, tts = load_pipelines()
# Function part
# Any character outside this set is stripped from each generated token.
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9.,!?\"'-]")


def clean_generated_story(raw, max_words=100):
    """Return *raw* reduced to readable words and capped at *max_words*.

    Each whitespace-separated token has every character other than ASCII
    letters, digits and ``. , ! ? " ' -`` removed. Tokens left with fewer
    than two letters/digits (single-letter words, stray punctuation) are
    dropped. Punctuation attached to a kept word, and apostrophes inside
    contractions, are preserved.

    Args:
        raw: Text produced by the story generator.
        max_words: Maximum number of words to keep.

    Returns:
        The cleaned words joined by single spaces; ``""`` if nothing survives.
    """
    kept = []
    for token in raw.split():
        token = _DISALLOWED_CHARS.sub("", token)
        # Counting alphanumerics (not len) also rejects tokens such as "a," or "--".
        if sum(ch.isalnum() for ch in token) > 1:
            kept.append(token)
    return " ".join(kept[:max_words])


# Function to generate content from an image
def generate_content(image):
    """Caption *image*, write a short story about it, and narrate the story.

    The caption and the story are also written to the Streamlit page.

    Args:
        image: Anything ``PIL.Image.open`` accepts; the UI passes the
            uploaded file object.

    Returns:
        A ``(caption, story, audio_path)`` tuple. ``audio_path`` names a
        temporary WAV file created with ``delete=False``; the caller is
        responsible for removing it.

    Raises:
        ValueError: If neither the story nor the caption contains any text
            to narrate.
    """
    pil_image = Image.open(image)

    # Generate caption
    caption = captioner(pil_image)[0]["generated_text"]
    st.write("**π What's in the picture: π**")
    st.write(caption)

    # Create prompt for story
    prompt = (
        f"Write a funny, interesting children's story centered on this scene: {caption}\n"
        f"Story in third-person narrative, describing this scene exactly: {caption} "
        f"Mention the exact place, location, or venue within {caption}. "
        f"Avoid numbers, random letter combinations, and single-letter words.")

    # Generate raw story; return_full_text=False keeps the prompt out of it.
    raw = storyer(
        prompt,
        max_new_tokens=100,
        temperature=0.6,
        top_p=0.85,
        no_repeat_ngram_size=0,
        return_full_text=False
    )[0]["generated_text"].strip()

    # If cleaning leaves nothing, fall back to the caption so there is still
    # something to show and narrate.
    story = clean_generated_story(raw) or caption
    st.write("**π Your funny story: π**")
    st.write(story)

    # Generate audio from cleaned story, one chunk of at most 200 characters
    # at a time.
    chunks = textwrap.wrap(story, width=200)
    if not chunks:
        # np.concatenate([]) would fail with a much less helpful message.
        raise ValueError("No text available to narrate for this image.")
    audio = np.concatenate([tts(chunk)["audio"].squeeze() for chunk in chunks])

    # Reserve a temp file name and close the handle before soundfile reopens
    # the path (an open NamedTemporaryFile cannot be reopened on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        temp_file_path = temp_file.name
    sf.write(temp_file_path, audio, tts.model.config.sampling_rate)

    return caption, story, temp_file_path
# Streamlit UI
st.title("πStory Maker")
st.markdown("Upload a picture, I will generate a story for you")
uploaded_image = st.file_uploader("Choose your picture", type=["jpg", "jpeg", "png"])

# Show the uploaded picture, or a placeholder until one is chosen.
# NOTE(review): the example.com URL looks like a stand-in — confirm it
# resolves to a real image.
if uploaded_image is None:
    st.image("https://example.com/placeholder_image.jpg", caption="Upload your picture here!", use_container_width=True)
else:
    st.image(uploaded_image, caption="Your Picture ", use_container_width=True)

if st.button("Generate a story"):
    if uploaded_image is not None:
        with st.spinner("Processing"):
            caption, story, audio_path = generate_content(uploaded_image)
            try:
                st.success(" Your story is ready!π")
                st.audio(audio_path, format="audio/wav")
            finally:
                # generate_content() leaves the WAV on disk; remove it even if
                # rendering fails so temp files do not accumulate.
                os.remove(audio_path)
    else:
        st.warning("Please upload a picture first! ")