Update app.py
Browse files
app.py
CHANGED
@@ -1,17 +1,12 @@
|
|
1 |
-
# Imports
|
2 |
-
import spaces
|
3 |
import os
|
4 |
import streamlit as st
|
5 |
import requests
|
6 |
from transformers import pipeline
|
7 |
-
import
|
8 |
-
|
9 |
-
# Suppressing all warnings
|
10 |
-
import warnings
|
11 |
-
warnings.filterwarnings("ignore")
|
12 |
|
13 |
# Image-to-text
|
14 |
-
def img2txt(url):
|
15 |
print("Initializing captioning model...")
|
16 |
captioning_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
|
17 |
|
@@ -22,42 +17,54 @@ def img2txt(url):
|
|
22 |
return text
|
23 |
|
24 |
# Text-to-story
|
25 |
-
def txt2story(
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
{"role": "
|
33 |
-
Ensure stories conclude positively within 100 words. Remember the story must end within 100 words''', "temperature": temperature},
|
34 |
-
{"role": "user", "content": f"Here is input set of words: {img_text}", "temperature": temperature}
|
35 |
],
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
40 |
|
41 |
-
|
|
|
|
|
42 |
|
43 |
-
story = response.json()["output"]["choices"][0]["text"]
|
44 |
return story
|
45 |
|
46 |
-
|
47 |
# Text-to-speech
|
48 |
-
def txt2speech(text):
|
49 |
print("Initializing text-to-speech conversion...")
|
50 |
API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
|
51 |
headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
|
52 |
payloads = {'inputs': text}
|
53 |
-
|
54 |
response = requests.post(API_URL, headers=headers, json=payloads)
|
55 |
|
56 |
with open('audio_story.mp3', 'wb') as file:
|
57 |
file.write(response.content)
|
58 |
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
def main():
|
62 |
st.set_page_config(page_title="🎨 Image-to-Audio Story 🎧", page_icon="🖼️")
|
63 |
st.title("Turn the Image into Audio Story")
|
@@ -71,6 +78,10 @@ def main():
|
|
71 |
top_p = st.sidebar.number_input("Top-P", min_value=0.0, max_value=1.0, value=0.8)
|
72 |
temperature = st.sidebar.number_input("Temperature", min_value=0.1, max_value=2.0, value=1.5)
|
73 |
|
|
|
|
|
|
|
|
|
74 |
if uploaded_file is not None:
|
75 |
# Reads and saves uploaded image file
|
76 |
bytes_data = uploaded_file.read()
|
@@ -82,7 +93,15 @@ def main():
|
|
82 |
# Initiates AI processing and story generation
|
83 |
with st.spinner("## 🤖 AI is at Work! "):
|
84 |
scenario = img2txt("uploaded_image.jpg") # Extracts text from the image
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
txt2speech(story) # Converts the story to audio
|
87 |
|
88 |
st.markdown("---")
|
|
|
|
|
|
|
1 |
import os
|
2 |
import streamlit as st
|
3 |
import requests
|
4 |
from transformers import pipeline
|
5 |
+
from together import Together
|
6 |
+
from typing import Dict
|
|
|
|
|
|
|
7 |
|
8 |
# Image-to-text
|
9 |
+
def img2txt(url: str) -> str:
|
10 |
print("Initializing captioning model...")
|
11 |
captioning_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
|
12 |
|
|
|
17 |
return text
|
18 |
|
19 |
# Text-to-story
|
20 |
+
def txt2story(prompt: str, top_k: int, top_p: float, temperature: float) -> str:
    """Generate a short story from *prompt* with the Together chat API.

    Args:
        prompt: Story brief (image caption combined with user preferences).
        top_k: Top-k sampling cutoff forwarded to the model.
        top_p: Nucleus-sampling probability mass forwarded to the model.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The complete generated story text, assembled from streamed chunks.
    """
    # TOGETHER_API_KEY must be set in the environment; .get() defers the
    # failure to the client rather than raising KeyError here.
    client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
    stream = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[
            {"role": "system", "content": '''As an experienced short story writer, write a story title and then create a meaningful story influenced by the provided prompt.
            Ensure the story is full of positive inspiration & enthusiasm and concludes with a happy ending within 250 words. Remember the story must end within 100 words'''},
            {"role": "user", "content": prompt}
        ],
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        stream=True
    )

    story = ''
    for chunk in stream:
        # Streamed deltas may carry content=None (e.g. role-only or final
        # chunks); guard with `or ''` so concatenation never raises TypeError.
        story += chunk.choices[0].delta.content or ''

    return story
|
40 |
|
|
|
41 |
# Text-to-speech
|
42 |
+
def txt2speech(text: str) -> None:
    """Convert *text* to speech via the HF Inference API and save the audio.

    Posts the text to the ESPnet LJSpeech VITS model endpoint and writes the
    returned audio bytes to 'audio_story.mp3' in the working directory.

    Raises:
        KeyError: if the HUGGINGFACEHUB_API_TOKEN environment variable is unset.
        requests.HTTPError: if the inference API responds with an error status.
    """
    print("Initializing text-to-speech conversion...")
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
    payloads = {'inputs': text}

    response = requests.post(API_URL, headers=headers, json=payloads)
    # Fail loudly on API errors; previously an error JSON body was written
    # straight into the .mp3 file, yielding a silently corrupt audio file.
    response.raise_for_status()

    with open('audio_story.mp3', 'wb') as file:
        file.write(response.content)
|
52 |
|
53 |
+
def get_user_preferences() -> Dict[str, str]:
    """Render the story-preference select boxes and collect the choices.

    Returns:
        Mapping from preference key (e.g. 'genre', 'tone') to the option the
        user selected in the corresponding Streamlit select box.
    """
    # (key, widget label, options) — rendered in this exact order.
    fields = [
        ('genre', "Genre", ["Science Fiction", "Fantasy", "Mystery", "Romance"]),
        ('setting', "Setting", ["Future", "Medieval times", "Modern day", "Alternate reality"]),
        ('plot', "Plot", ["Hero's journey", "Solving a mystery", "Love story", "Survival"]),
        ('tone', "Tone", ["Serious", "Light-hearted", "Humorous", "Dark"]),
        ('theme', "Theme", ["Self-discovery", "Redemption", "Love", "Justice"]),
        ('conflict', "Conflict Type", ["Person vs. Society", "Internal struggle", "Person vs. Nature", "Person vs. Person"]),
        ('magic_tech', "Magic/Technology", ["Advanced technology", "Magic system", "Supernatural abilities", "Alien technology"]),
        ('twist', "Mystery/Twist", ["Plot twist", "Hidden identity", "Unexpected ally/enemy", "Time paradox"]),
        ('ending', "Ending", ["Bittersweet", "Happy", "Open-ended", "Tragic"]),
    ]
    # Dict comprehension evaluates left-to-right, so the widgets appear in
    # the same order as the original sequential assignments.
    return {key: st.selectbox(label, options) for key, label, options in fields}
|
67 |
+
|
68 |
def main():
|
69 |
st.set_page_config(page_title="🎨 Image-to-Audio Story 🎧", page_icon="🖼️")
|
70 |
st.title("Turn the Image into Audio Story")
|
|
|
78 |
top_p = st.sidebar.number_input("Top-P", min_value=0.0, max_value=1.0, value=0.8)
|
79 |
temperature = st.sidebar.number_input("Temperature", min_value=0.1, max_value=2.0, value=1.5)
|
80 |
|
81 |
+
# Get user preferences for the story
|
82 |
+
st.markdown("## Story Preferences")
|
83 |
+
preferences = get_user_preferences()
|
84 |
+
|
85 |
if uploaded_file is not None:
|
86 |
# Reads and saves uploaded image file
|
87 |
bytes_data = uploaded_file.read()
|
|
|
93 |
# Initiates AI processing and story generation
|
94 |
with st.spinner("## 🤖 AI is at Work! "):
|
95 |
scenario = img2txt("uploaded_image.jpg") # Extracts text from the image
|
96 |
+
|
97 |
+
# Modify the prompt to include user preferences
|
98 |
+
prompt = f"Based on the image description: '{scenario}', create a {preferences['genre']} story set in {preferences['setting']}. " \
|
99 |
+
f"The story should have a {preferences['tone']} tone and explore the theme of {preferences['theme']}. " \
|
100 |
+
f"The main conflict should be {preferences['conflict']}. " \
|
101 |
+
f"Include {preferences['magic_tech']} as a key element. " \
|
102 |
+
f"The story should have a {preferences['twist']} and end with a {preferences['ending']} ending."
|
103 |
+
|
104 |
+
story = txt2story(prompt, top_k, top_p, temperature) # Generates a story based on the image text, LLM params, and user preferences
|
105 |
txt2speech(story) # Converts the story to audio
|
106 |
|
107 |
st.markdown("---")
|