adi-123 committed on
Commit
ad83944
·
verified ·
1 Parent(s): f0dbb23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -30
app.py CHANGED
@@ -1,17 +1,12 @@
1
- # Imports
2
- import spaces
3
  import os
4
  import streamlit as st
5
  import requests
6
  from transformers import pipeline
7
- import openai
8
-
9
- # Suppressing all warnings
10
- import warnings
11
- warnings.filterwarnings("ignore")
12
 
13
  # Image-to-text
14
- def img2txt(url):
15
  print("Initializing captioning model...")
16
  captioning_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
17
 
@@ -22,42 +17,54 @@ def img2txt(url):
22
  return text
23
 
24
  # Text-to-story
25
- def txt2story(img_text, top_k, top_p, temperature):
26
-
27
- headers = {"Authorization": f"Bearer {os.environ['TOGETHER_API_KEY']}"}
28
-
29
- data = {
30
- "model": "togethercomputer/llama-2-70b-chat",
31
- "messages": [
32
- {"role": "system", "content": '''As an experienced short story writer, write story title and then create a meaningful story influenced by provided words.
33
- Ensure stories conclude positively within 100 words. Remember the story must end within 100 words''', "temperature": temperature},
34
- {"role": "user", "content": f"Here is input set of words: {img_text}", "temperature": temperature}
35
  ],
36
- "top_k": top_k,
37
- "top_p": top_p,
38
- "temperature": temperature
39
- }
 
40
 
41
- response = requests.post("https://api.together.xyz/inference", headers=headers, json=data)
 
 
42
 
43
- story = response.json()["output"]["choices"][0]["text"]
44
  return story
45
 
46
-
47
  # Text-to-speech
48
- def txt2speech(text):
49
  print("Initializing text-to-speech conversion...")
50
  API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
51
  headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
52
  payloads = {'inputs': text}
53
-
54
  response = requests.post(API_URL, headers=headers, json=payloads)
55
 
56
  with open('audio_story.mp3', 'wb') as file:
57
  file.write(response.content)
58
 
59
-
60
- # Streamlit web app main function
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def main():
62
  st.set_page_config(page_title="🎨 Image-to-Audio Story 🎧", page_icon="🖼️")
63
  st.title("Turn the Image into Audio Story")
@@ -71,6 +78,10 @@ def main():
71
  top_p = st.sidebar.number_input("Top-P", min_value=0.0, max_value=1.0, value=0.8)
72
  temperature = st.sidebar.number_input("Temperature", min_value=0.1, max_value=2.0, value=1.5)
73
 
 
 
 
 
74
  if uploaded_file is not None:
75
  # Reads and saves uploaded image file
76
  bytes_data = uploaded_file.read()
@@ -82,7 +93,15 @@ def main():
82
  # Initiates AI processing and story generation
83
  with st.spinner("## 🤖 AI is at Work! "):
84
  scenario = img2txt("uploaded_image.jpg") # Extracts text from the image
85
- story = txt2story(scenario, top_k, top_p, temperature) # Generates a story based on the image text, LLM params
 
 
 
 
 
 
 
 
86
  txt2speech(story) # Converts the story to audio
87
 
88
  st.markdown("---")
 
 
 
1
  import os
2
  import streamlit as st
3
  import requests
4
  from transformers import pipeline
5
+ from together import Together
6
+ from typing import Dict
 
 
 
7
 
8
  # Image-to-text
9
+ def img2txt(url: str) -> str:
10
  print("Initializing captioning model...")
11
  captioning_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
12
 
 
17
  return text
18
 
19
# Text-to-story
def txt2story(prompt: str, top_k: int, top_p: float, temperature: float) -> str:
    """Generate a short story from *prompt* via the Together chat-completions API.

    Args:
        prompt: Text describing the desired story (image caption plus user
            preferences, assembled by the caller).
        top_k: Top-k sampling cutoff forwarded to the model.
        top_p: Nucleus-sampling probability mass forwarded to the model.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The generated story text, assembled from the streamed response chunks.

    Requires TOGETHER_API_KEY in the environment (read via os.environ.get,
    so a missing key surfaces as an auth error from the client, not a KeyError).
    """
    client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
    stream = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[
            # Word limits made consistent: the prompt previously said both
            # "within 250 words" and "must end within 100 words".
            {"role": "system", "content": '''As an experienced short story writer, write a story title and then create a meaningful story influenced by the provided prompt.
            Ensure the story is full of positive inspiration & enthusiasm and concludes with a happy ending within 250 words. Remember the story must end within 250 words'''},
            {"role": "user", "content": prompt}
        ],
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        stream=True
    )

    story = ''
    for chunk in stream:
        # Streamed deltas can arrive with empty choices or content=None
        # (e.g. role-only or terminal chunks); guard so the concatenation
        # never raises TypeError mid-stream.
        if chunk.choices and chunk.choices[0].delta.content:
            story += chunk.choices[0].delta.content

    return story
40
 
 
41
# Text-to-speech
def txt2speech(text: str) -> None:
    """Convert *text* to speech with the HF Inference API and save the audio.

    Posts the text to the hosted espnet/kan-bayashi_ljspeech_vits model and
    writes the returned audio bytes to 'audio_story.mp3' in the working
    directory. Requires HUGGINGFACEHUB_API_TOKEN in the environment.

    Raises:
        requests.HTTPError: If the inference API responds with an error status.
        KeyError: If HUGGINGFACEHUB_API_TOKEN is not set.
    """
    print("Initializing text-to-speech conversion...")
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
    payloads = {'inputs': text}

    response = requests.post(API_URL, headers=headers, json=payloads)
    # Fail loudly on API errors (bad token, model loading, rate limit) instead
    # of silently writing the JSON error body to the .mp3 file.
    response.raise_for_status()

    with open('audio_story.mp3', 'wb') as file:
        file.write(response.content)
52
 
53
def get_user_preferences() -> Dict[str, str]:
    """Collect the user's story preferences from Streamlit select boxes.

    Renders one selectbox per preference (in a fixed order) and returns a
    mapping of preference name to the chosen option. The first option of
    each list is Streamlit's default selection.
    """
    # Dict-literal values are evaluated top to bottom, so the widgets are
    # rendered in exactly this order.
    return {
        'genre': st.selectbox("Genre", ["Science Fiction", "Fantasy", "Mystery", "Romance"]),
        'setting': st.selectbox("Setting", ["Future", "Medieval times", "Modern day", "Alternate reality"]),
        'plot': st.selectbox("Plot", ["Hero's journey", "Solving a mystery", "Love story", "Survival"]),
        'tone': st.selectbox("Tone", ["Serious", "Light-hearted", "Humorous", "Dark"]),
        'theme': st.selectbox("Theme", ["Self-discovery", "Redemption", "Love", "Justice"]),
        'conflict': st.selectbox("Conflict Type", ["Person vs. Society", "Internal struggle", "Person vs. Nature", "Person vs. Person"]),
        'magic_tech': st.selectbox("Magic/Technology", ["Advanced technology", "Magic system", "Supernatural abilities", "Alien technology"]),
        'twist': st.selectbox("Mystery/Twist", ["Plot twist", "Hidden identity", "Unexpected ally/enemy", "Time paradox"]),
        'ending': st.selectbox("Ending", ["Bittersweet", "Happy", "Open-ended", "Tragic"]),
    }
67
+
68
  def main():
69
  st.set_page_config(page_title="🎨 Image-to-Audio Story 🎧", page_icon="🖼️")
70
  st.title("Turn the Image into Audio Story")
 
78
  top_p = st.sidebar.number_input("Top-P", min_value=0.0, max_value=1.0, value=0.8)
79
  temperature = st.sidebar.number_input("Temperature", min_value=0.1, max_value=2.0, value=1.5)
80
 
81
+ # Get user preferences for the story
82
+ st.markdown("## Story Preferences")
83
+ preferences = get_user_preferences()
84
+
85
  if uploaded_file is not None:
86
  # Reads and saves uploaded image file
87
  bytes_data = uploaded_file.read()
 
93
  # Initiates AI processing and story generation
94
  with st.spinner("## 🤖 AI is at Work! "):
95
  scenario = img2txt("uploaded_image.jpg") # Extracts text from the image
96
+
97
+ # Modify the prompt to include user preferences
98
+ prompt = f"Based on the image description: '{scenario}', create a {preferences['genre']} story set in {preferences['setting']}. " \
99
+ f"The story should have a {preferences['tone']} tone and explore the theme of {preferences['theme']}. " \
100
+ f"The main conflict should be {preferences['conflict']}. " \
101
+ f"Include {preferences['magic_tech']} as a key element. " \
102
+ f"The story should have a {preferences['twist']} and end with a {preferences['ending']} ending."
103
+
104
+ story = txt2story(prompt, top_k, top_p, temperature) # Generates a story based on the image text, LLM params, and user preferences
105
  txt2speech(story) # Converts the story to audio
106
 
107
  st.markdown("---")