|
import os |
|
import streamlit as st |
|
import requests |
|
from transformers import pipeline |
|
from together import Together |
|
from typing import Dict |
|
|
|
|
|
def img2txt(url: str) -> str:
    """Return a short caption for an image using the BLIP base captioning model.

    Args:
        url: Image reference handed unchanged to the ``image-to-text``
            pipeline (the caller in this file passes a local file path).

    Returns:
        The ``generated_text`` of the first prediction, generated with at
        most 20 new tokens.
    """
    # NOTE(review): the pipeline is constructed on every call, so the model
    # is re-initialised each time a caption is requested.
    print("Initializing captioning model...")
    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

    print("Generating text from the image...")
    predictions = captioner(url, max_new_tokens=20)
    caption = predictions[0]["generated_text"]

    print(caption)
    return caption
|
|
|
|
|
def txt2story(prompt: str, top_k: int, top_p: float, temperature: float) -> str:
    """Generate a short story for *prompt* through the Together chat API.

    The request is streamed from ``meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo``
    and the streamed text fragments are concatenated into one string.

    Args:
        prompt: User message describing the story to write.
        top_k: Top-K sampling value forwarded to the API.
        top_p: Top-P sampling value forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The full story text; an empty string if the stream carried no text.
    """
    # NOTE(review): the system prompt asks for "within 250 words" and then
    # "within 100 words"; the two limits conflict. Text kept verbatim until
    # the intended limit is confirmed.
    system_prompt = (
        "As an experienced short story writer, write a story title and then create a meaningful story influenced by the provided prompt.\n"
        "Ensure the story is full of positive inspiration & enthusiasm and concludes with a happy ending within 250 words. Remember the story must end within 100 words"
    )

    # The API key is read from the environment; ``None`` is passed through
    # when TOGETHER_API_KEY is unset.
    client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
    stream = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        stream=True,
    )

    fragments = []
    for chunk in stream:
        # Streamed chunks may carry no choices or an empty/None delta (for
        # example a terminating chunk); concatenating None would raise
        # TypeError, so such chunks are skipped.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        piece = delta.content if delta is not None else None
        if piece:
            fragments.append(piece)

    return "".join(fragments)
|
|
|
|
|
def txt2speech(text: str) -> None:
    """Convert *text* to speech via the Hugging Face Inference API and save it.

    The response body of the ``espnet/kan-bayashi_ljspeech_vits`` endpoint is
    written to ``audio_story.mp3`` in the current working directory.

    Args:
        text: The text to synthesise.

    Raises:
        KeyError: If ``HUGGINGFACEHUB_API_TOKEN`` is not set in the environment.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    print("Initializing text-to-speech conversion...")
    api_url = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
    payload = {'inputs': text}

    # Without a timeout ``requests`` may wait indefinitely on a stalled server.
    response = requests.post(api_url, headers=headers, json=payload, timeout=120)

    # Fail loudly on an error status instead of saving the error reply's body
    # under the audio file name.
    response.raise_for_status()

    with open('audio_story.mp3', 'wb') as file:
        file.write(response.content)
|
|
|
def get_user_preferences() -> Dict[str, str]:
    """Render one select box per story attribute and collect the choices.

    Returns:
        A dict mapping each preference key (``genre``, ``setting``, ``plot``,
        ``tone``, ``theme``, ``conflict``, ``magic_tech``, ``twist``,
        ``ending``) to the option selected in its select box.
    """
    # (result key, widget label, options) in the order the widgets are drawn.
    fields = (
        ('genre', "Genre", ["Science Fiction", "Fantasy", "Mystery", "Romance"]),
        ('setting', "Setting", ["Future", "Medieval times", "Modern day", "Alternate reality"]),
        ('plot', "Plot", ["Hero's journey", "Solving a mystery", "Love story", "Survival"]),
        ('tone', "Tone", ["Serious", "Light-hearted", "Humorous", "Dark"]),
        ('theme', "Theme", ["Self-discovery", "Redemption", "Love", "Justice"]),
        ('conflict', "Conflict Type", ["Person vs. Society", "Internal struggle", "Person vs. Nature", "Person vs. Person"]),
        ('magic_tech', "Magic/Technology", ["Advanced technology", "Magic system", "Supernatural abilities", "Alien technology"]),
        ('twist', "Mystery/Twist", ["Plot twist", "Hidden identity", "Unexpected ally/enemy", "Time paradox"]),
        ('ending', "Ending", ["Bittersweet", "Happy", "Open-ended", "Tragic"]),
    )

    return {key: st.selectbox(label, options) for key, label, options in fields}
|
|
|
def _build_story_prompt(scenario: str, preferences: Dict[str, str]) -> str:
    """Compose the story prompt from the image caption and every preference."""
    return (
        f"Based on the image description: '{scenario}', create a {preferences['genre']} story set in {preferences['setting']}. "
        # The selected plot was previously collected but never sent to the model.
        f"The plot should center on {preferences['plot']}. "
        f"The story should have a {preferences['tone']} tone and explore the theme of {preferences['theme']}. "
        f"The main conflict should be {preferences['conflict']}. "
        f"Include {preferences['magic_tech']} as a key element. "
        f"The story should have a {preferences['twist']} and end with a {preferences['ending']} ending."
    )


def main() -> None:
    """Render the Streamlit page: upload an image, caption it, write and narrate a story.

    Draws the uploader, the sampling controls in the sidebar and the story
    preference select boxes. Once an image is uploaded it is saved to
    ``uploaded_image.jpg``, captioned, turned into a story and converted to
    speech; caption, story and audio are then displayed.
    """
    # NOTE(review): the emoji in the UI strings below look mis-encoded;
    # confirm the file's encoding before changing them.
    st.set_page_config(page_title="π¨ Image-to-Audio Story π§", page_icon="πΌοΈ")
    st.title("Turn the Image into Audio Story")

    uploaded_file = st.file_uploader("# π· Upload an image...", type=["jpg", "jpeg", "png"])

    # Sampling parameters forwarded to the story model.
    st.sidebar.markdown("# LLM Inference Configuration Parameters")
    top_k = st.sidebar.number_input("Top-K", min_value=1, max_value=100, value=5)
    top_p = st.sidebar.number_input("Top-P", min_value=0.0, max_value=1.0, value=0.8)
    temperature = st.sidebar.number_input("Temperature", min_value=0.1, max_value=2.0, value=1.5)

    st.markdown("## Story Preferences")
    preferences = get_user_preferences()

    # Nothing to process until an image has been uploaded.
    if uploaded_file is None:
        return

    # Persist the upload so the captioning step can read it from disk.
    bytes_data = uploaded_file.read()
    with open("uploaded_image.jpg", "wb") as file:
        file.write(bytes_data)

    st.image(uploaded_file, caption='πΌοΈ Uploaded Image', use_column_width=True)

    with st.spinner("## π€ AI is at Work! "):
        scenario = img2txt("uploaded_image.jpg")

    prompt = _build_story_prompt(scenario, preferences)
    story = txt2story(prompt, top_k, top_p, temperature)

    # txt2speech does an HTTP request and a file write. ``requests``
    # exceptions derive from IOError/OSError, as do file errors, so an audio
    # failure is caught here and must not hide the caption and story.
    audio_error = None
    try:
        txt2speech(story)
    except OSError as exc:
        audio_error = exc

    st.markdown("---")
    st.markdown("## π Image Caption")
    st.write(scenario)

    st.markdown("---")
    st.markdown("## π Story")
    st.write(story)

    st.markdown("---")
    st.markdown("## π§ Audio Story")
    if audio_error is None:
        st.audio("audio_story.mp3")
    else:
        st.error(f"Audio generation failed: {audio_error}")
|
|
|
# Build the page when the file is executed as a script.
if __name__ == '__main__':
    main()

# Credits footer, rendered after the main UI.
# NOTE(review): the original indentation is lost, so whether these lines sit
# inside the ``__main__`` guard could not be confirmed; the emoji in the
# caption also look mis-encoded — verify the file's encoding.
st.markdown("### Credits")
st.caption('''
Made with β€οΈ by @Aditya-Neural-Net-Ninja\n
Utilizes Image-to-Text, Text Generation, Text-to-Speech Transformer Models\n
Gratitude to Streamlit, π€ Spaces for Deployment & Hosting
''')