adi-123 committed on
Commit
f6c162d
·
verified ·
1 Parent(s): f81f2e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -16
app.py CHANGED
@@ -7,13 +7,12 @@ from typing import Dict
7
 
8
  # Image-to-text
9
  def img2txt(url: str) -> str:
10
- print("Initializing captioning model...")
11
  captioning_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
12
 
13
- print("Generating text from the image...")
14
  text = captioning_model(url, max_new_tokens=20)[0]["generated_text"]
15
 
16
- print(text)
17
  return text
18
 
19
  # Text-to-story
@@ -22,8 +21,7 @@ def txt2story(prompt: str, top_k: int, top_p: float, temperature: float) -> str:
22
  stream = client.chat.completions.create(
23
  model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
24
  messages=[
25
- {"role": "system", "content": '''As an experienced short story writer, write a story title and then create a meaningful story influenced by the provided prompt.
26
- Ensure the story is full of positive inspiration & enthusiasm and concludes with a happy ending within 250 words. Remember the story must end within 100 words'''},
27
  {"role": "user", "content": prompt}
28
  ],
29
  top_k=top_k,
@@ -35,12 +33,26 @@ def txt2story(prompt: str, top_k: int, top_p: float, temperature: float) -> str:
35
  story = ''
36
  for chunk in stream:
37
  story += chunk.choices[0].delta.content
 
 
 
 
 
38
 
39
  return story
40
 
 
 
 
 
 
 
 
 
 
41
  # Text-to-speech
42
  def txt2speech(text: str) -> None:
43
- print("Initializing text-to-speech conversion...")
44
  API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
45
  headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
46
  payloads = {'inputs': text}
@@ -50,6 +62,7 @@ def txt2speech(text: str) -> None:
50
  with open('audio_story.mp3', 'wb') as file:
51
  file.write(response.content)
52
 
 
53
  def get_user_preferences() -> Dict[str, str]:
54
  preferences = {}
55
 
@@ -67,15 +80,16 @@ def get_user_preferences() -> Dict[str, str]:
67
 
68
  return preferences
69
 
 
70
  def main():
71
  st.set_page_config(page_title="🎨 Image-to-Audio Story 🎧", page_icon="🖼️")
72
  st.title("Turn the Image into Audio Story")
73
 
74
  # Allows users to upload an image file
75
- uploaded_file = st.file_uploader("# 📷 Upload an image...", type=["jpg", "jpeg", "png"])
76
 
77
  # Parameters for LLM model (in the sidebar)
78
- st.sidebar.markdown("# LLM Inference Configuration Parameters")
79
  top_k = st.sidebar.number_input("Top-K", min_value=1, max_value=100, value=5)
80
  top_p = st.sidebar.number_input("Top-P", min_value=0.0, max_value=1.0, value=0.8)
81
  temperature = st.sidebar.number_input("Temperature", min_value=0.1, max_value=2.0, value=1.5)
@@ -93,18 +107,21 @@ def main():
93
  st.image(uploaded_file, caption='🖼️ Uploaded Image', use_column_width=True)
94
 
95
  # Initiates AI processing and story generation
96
- with st.spinner("## 🤖 AI is at Work! "):
97
  scenario = img2txt("uploaded_image.jpg") # Extracts text from the image
98
 
99
  # Modify the prompt to include user preferences
100
- prompt = f"Based on the image description: '{scenario}', create a {preferences['genre']} story set in {preferences['setting']}. " \
101
- f"The story should have a {preferences['tone']} tone and explore the theme of {preferences['theme']}. " \
102
- f"The main conflict should be {preferences['conflict']}. " \
103
- f"Include {preferences['magic_tech']} as a key element. " \
104
- f"The story should have a {preferences['twist']} and end with a {preferences['ending']} ending."
105
 
106
  story = txt2story(prompt, top_k, top_p, temperature) # Generates a story based on the image text, LLM params, and user preferences
107
- txt2speech(story) # Converts the story to audio
 
 
 
108
 
109
  st.markdown("---")
110
  st.markdown("## 📜 Image Caption")
@@ -112,7 +129,7 @@ def main():
112
 
113
  st.markdown("---")
114
  st.markdown("## 📖 Story")
115
- st.write(story)
116
 
117
  st.markdown("---")
118
  st.markdown("## 🎧 Audio Story")
 
7
 
8
  # Image-to-text
9
  def img2txt(url: str) -> str:
10
+ st.info("Initializing captioning model...")
11
  captioning_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
12
 
13
+ st.info("Generating text from the image...")
14
  text = captioning_model(url, max_new_tokens=20)[0]["generated_text"]
15
 
 
16
  return text
17
 
18
  # Text-to-story
 
21
  stream = client.chat.completions.create(
22
  model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
23
  messages=[
24
+ {"role": "system", "content": '''You are a creative story writer. Create a meaningful, positive, and inspirational story based on the provided prompt. Ensure it stays under 250 words and ends on a happy note.'''},
 
25
  {"role": "user", "content": prompt}
26
  ],
27
  top_k=top_k,
 
33
  story = ''
34
  for chunk in stream:
35
  story += chunk.choices[0].delta.content
36
+
37
+ # Enforce 250-word limit
38
+ story_words = story.split()
39
+ if len(story_words) > 250:
40
+ story = ' '.join(story_words[:250]) + '...'
41
 
42
  return story
43
 
44
+ # Translate story
45
+ def translate_story(story: str, target_language: str) -> str:
46
+ if target_language != "English":
47
+ st.info(f"Translating story to {target_language}...")
48
+ translator = pipeline("text2text-generation", model="SnypzZz/Llama2-13b-Language-translate")
49
+ translated_story = translator(story, forced_bos_token_id=target_language)
50
+ return translated_story[0]["generated_text"]
51
+ return story
52
+
53
  # Text-to-speech
54
  def txt2speech(text: str) -> None:
55
+ st.info("Initializing text-to-speech conversion...")
56
  API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
57
  headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
58
  payloads = {'inputs': text}
 
62
  with open('audio_story.mp3', 'wb') as file:
63
  file.write(response.content)
64
 
65
+ # User preferences
66
  def get_user_preferences() -> Dict[str, str]:
67
  preferences = {}
68
 
 
80
 
81
  return preferences
82
 
83
+ # Main function
84
  def main():
85
  st.set_page_config(page_title="🎨 Image-to-Audio Story 🎧", page_icon="🖼️")
86
  st.title("Turn the Image into Audio Story")
87
 
88
  # Allows users to upload an image file
89
+ uploaded_file = st.file_uploader("📷 Upload an image...", type=["jpg", "jpeg", "png"])
90
 
91
  # Parameters for LLM model (in the sidebar)
92
+ st.sidebar.markdown("## LLM Inference Configuration Parameters")
93
  top_k = st.sidebar.number_input("Top-K", min_value=1, max_value=100, value=5)
94
  top_p = st.sidebar.number_input("Top-P", min_value=0.0, max_value=1.0, value=0.8)
95
  temperature = st.sidebar.number_input("Temperature", min_value=0.1, max_value=2.0, value=1.5)
 
107
  st.image(uploaded_file, caption='🖼️ Uploaded Image', use_column_width=True)
108
 
109
  # Initiates AI processing and story generation
110
+ with st.spinner("🤖 AI is at Work! "):
111
  scenario = img2txt("uploaded_image.jpg") # Extracts text from the image
112
 
113
  # Modify the prompt to include user preferences
114
+ prompt = (f"Based on the image description: '{scenario}', create a {preferences['genre']} story set in {preferences['setting']}. "
115
+ f"The story should have a {preferences['tone']} tone and explore the theme of {preferences['theme']}. "
116
+ f"The main conflict should be {preferences['conflict']}. "
117
+ f"Include {preferences['magic_tech']} as a key element. "
118
+ f"The story should have a {preferences['twist']} and end with a {preferences['ending']} ending.")
119
 
120
  story = txt2story(prompt, top_k, top_p, temperature) # Generates a story based on the image text, LLM params, and user preferences
121
+
122
+ # Translate story based on user-selected language
123
+ translated_story = translate_story(story, preferences['language'])
124
+ txt2speech(translated_story) # Converts the translated story to audio
125
 
126
  st.markdown("---")
127
  st.markdown("## 📜 Image Caption")
 
129
 
130
  st.markdown("---")
131
  st.markdown("## 📖 Story")
132
+ st.write(translated_story)
133
 
134
  st.markdown("---")
135
  st.markdown("## 🎧 Audio Story")