CamiloVega commited on
Commit
ca791f1
·
verified ·
1 Parent(s): 0bc3244

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +336 -1
README.md CHANGED
@@ -4,8 +4,343 @@ emoji: 📚
4
  colorFrom: red
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.31.5
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  colorFrom: red
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.23.2
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
+
13
+ import os
14
+ import openai
15
+ import whisper
16
+ import tempfile
17
+ import gradio as gr
18
+ from pydub import AudioSegment
19
+ import fitz # PyMuPDF for handling PDFs
20
+ import docx # For handling .docx files
21
+ import pandas as pd # For handling .xlsx and .csv files
22
+ import requests
23
+ from bs4 import BeautifulSoup
24
+ from moviepy.editor import VideoFileClip
25
+ import yt_dlp
26
+ import logging
27
+
28
# --- Module-level setup -------------------------------------------------
# Logging: INFO level for the whole app, one named logger for this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# OpenAI credentials are taken from the environment (never hard-coded).
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load Whisper's largest (highest-quality) model exactly once at startup;
# every transcription call reuses this shared instance.
model = whisper.load_model("large")
37
+
38
def download_social_media_video(url):
    """Download the audio track of a social media video as an MP3 file.

    The file is written to the working directory as "<video id>.mp3".
    Errors are logged and re-raised to the caller.
    """
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': '%(id)s.%(ext)s',
    }
    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            info = downloader.extract_info(url, download=True)
            # The FFmpeg post-processor converts the download to MP3,
            # so the final artifact is always "<id>.mp3".
            audio_path = f"{info['id']}.mp3"
            logger.info(f"Video successfully downloaded: {audio_path}")
            return audio_path
    except Exception as e:
        logger.error(f"Error downloading video: {str(e)}")
        raise
58
+
59
def convert_video_to_audio(video_file):
    """Extract the audio track of a video file into a temporary MP3.

    Args:
        video_file: Path to the source video file.

    Returns:
        Path to a temporary ``.mp3`` file (``delete=False`` — the caller
        is responsible for removing it when done).

    Raises:
        Exception: re-raised after logging if loading or extraction fails.
    """
    try:
        video = VideoFileClip(video_file)
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
                video.audio.write_audiofile(temp_file.name)
        finally:
            # Release the file handles moviepy keeps open on the source
            # video; without this every conversion leaks a descriptor.
            video.close()
        logger.info(f"Video converted to audio: {temp_file.name}")
        return temp_file.name
    except Exception as e:
        logger.error(f"Error converting video to audio: {str(e)}")
        raise
70
+
71
def preprocess_audio(audio_file):
    """Normalize an audio file's loudness and re-export it as a temp MP3.

    The gain is shifted so the average level lands at -20 dBFS, which
    gives Whisper more consistent input. Returns the path of the new
    temporary MP3 file; errors are logged and re-raised.
    """
    try:
        segment = AudioSegment.from_file(audio_file)
        # Target level: -20 dBFS (shift by the difference from current level).
        segment = segment.apply_gain(-segment.dBFS + (-20))
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as out:
            segment.export(out.name, format="mp3")
            logger.info(f"Audio preprocessed: {out.name}")
            return out.name
    except Exception as e:
        logger.error(f"Error preprocessing audio file: {str(e)}")
        raise
83
+
84
def transcribe_audio(file):
    """Transcribe an audio file, a local video file, or a social-media URL.

    Routing: an ``http``-prefixed string is downloaded with yt-dlp; a path
    with a known video extension is converted to audio; anything else is
    treated as audio and loudness-normalized. The resulting MP3 is fed to
    the module-level Whisper ``model``.

    Args:
        file: Path or URL of the media to transcribe.

    Returns:
        The transcription text, or an ``"Error processing file: ..."``
        message string on failure (this function never raises).
    """
    file_path = None
    try:
        if isinstance(file, str) and file.startswith('http'):
            logger.info(f"Downloading social media video: {file}")
            file_path = download_social_media_video(file)
        elif isinstance(file, str) and file.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
            logger.info(f"Converting local video to audio: {file}")
            file_path = convert_video_to_audio(file)
        else:
            logger.info(f"Preprocessing audio file: {file}")
            file_path = preprocess_audio(file)

        logger.info(f"Transcribing audio: {file_path}")
        result = model.transcribe(file_path)
        transcription = result.get("text", "Error in transcription")
        logger.info(f"Transcription completed: {transcription[:50]}...")
        return transcription
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        return f"Error processing file: {str(e)}"
    finally:
        # file_path is always an intermediate copy (downloaded, converted,
        # or re-exported), never the caller's original file — remove it so
        # repeated calls don't leak temp MP3s.
        if file_path is not None and os.path.exists(file_path):
            try:
                os.remove(file_path)
            except OSError:
                pass
105
+
106
def read_document(document_path):
    """Extract text from a PDF, DOCX, XLSX or CSV document.

    Extension matching is case-insensitive, so ``report.PDF`` and
    ``data.CSV`` are accepted as well.

    Args:
        document_path: Path to the document on disk.

    Returns:
        The extracted text, an "Unsupported file type" notice for unknown
        extensions, or an ``"Error reading document: ..."`` message string
        on failure (this function never raises).
    """
    try:
        suffix = document_path.lower()
        if suffix.endswith(".pdf"):
            # Context manager closes the PDF handle even if a page fails.
            with fitz.open(document_path) as doc:
                return "\n".join(page.get_text() for page in doc)
        elif suffix.endswith(".docx"):
            doc = docx.Document(document_path)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)
        elif suffix.endswith(".xlsx"):
            return pd.read_excel(document_path).to_string()
        elif suffix.endswith(".csv"):
            return pd.read_csv(document_path).to_string()
        else:
            return "Unsupported file type. Please upload a PDF, DOCX, XLSX or CSV document."
    except Exception as e:
        return f"Error reading document: {str(e)}"
123
+
124
def read_url(url):
    """Fetch a URL and return its visible text content.

    Args:
        url: The page to fetch.

    Returns:
        The page's text (tags stripped by BeautifulSoup), or an
        ``"Error reading URL: ..."`` message string on failure
        (this function never raises).
    """
    try:
        # A timeout keeps a dead host from hanging the whole generation
        # request; the failure is converted into the error-string return.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except Exception as e:
        return f"Error reading URL: {str(e)}"
133
+
134
def process_social_content(url):
    """Collect both the text and any video transcription for a social URL.

    The page text is always attempted; the video transcription is
    best-effort and falls back to ``None``.

    Returns:
        ``{"text": ..., "video": ...}`` on success, ``None`` on an
        unexpected failure.
    """
    try:
        # Text is fetched first; read_url reports its own errors as strings.
        page_text = read_url(url)

        # Video transcription is optional — swallow any failure.
        try:
            transcription = transcribe_audio(url)
        except Exception:
            transcription = None

        return {
            "text": page_text,
            "video": transcription
        }
    except Exception as e:
        logger.error(f"Error processing social content: {str(e)}")
        return None
153
+
154
def generate_news(instructions, facts, size, tone, *args):
    """Generate a news-article draft from all configured sources.

    Args:
        instructions: Editorial instructions for the article.
        facts: Description of the news facts.
        size: Desired body length in words.
        tone: Desired tone of the article.
        *args: The flattened Gradio inputs. The layout MUST match the UI's
            ``inputs_list`` order: 15 audio/video fields (5 x [file, name,
            position]), then 9 social fields (3 x [url, name, context]),
            then 5 general URLs, then the document files.

    Returns:
        Tuple ``(article_text, raw_transcriptions)``; on failure the first
        element is an error message and the second is "".
    """
    knowledge_base = {
        "instructions": instructions,
        "facts": facts,
        "document_content": [],
        "audio_data": [],
        "url_content": [],
        "social_content": []
    }

    # Slice the flat *args tuple back into its logical groups.
    num_audios = 5 * 3       # 5 audios/videos x 3 fields (file, name, position)
    num_social_urls = 3 * 3  # 3 social media URLs x 3 fields (URL, name, context)
    num_urls = 5             # 5 general URLs
    audios = args[:num_audios]
    social_urls = args[num_audios:num_audios + num_social_urls]
    urls = args[num_audios + num_social_urls:num_audios + num_social_urls + num_urls]
    documents = args[num_audios + num_social_urls + num_urls:]

    for url in urls:
        if url:
            knowledge_base["url_content"].append(read_url(url))

    for document in documents:
        if document is not None:
            knowledge_base["document_content"].append(read_document(document.name))

    # Regroup the flat audio fields into (file, name, position) triples.
    for i in range(0, len(audios), 3):
        audio_file, name, position = audios[i:i + 3]
        if audio_file is not None:
            knowledge_base["audio_data"].append({"audio": audio_file, "name": name, "position": position})

    # Regroup the flat social fields into (url, name, context) triples.
    for i in range(0, len(social_urls), 3):
        social_url, social_name, social_context = social_urls[i:i + 3]
        if social_url:
            social_content = process_social_content(social_url)
            if social_content:
                knowledge_base["social_content"].append({
                    "url": social_url,
                    "name": social_name,
                    "context": social_context,
                    "text": social_content["text"],
                    "video": social_content["video"]
                })
                logger.info(f"Social media content processed: {social_url}")

    # Accumulate lines in lists and join once at the end (avoids the
    # quadratic cost of repeated string concatenation).
    transcription_lines = []
    raw_lines = []

    for idx, data in enumerate(knowledge_base["audio_data"]):
        if data["audio"] is not None:
            transcription = transcribe_audio(data["audio"])
            quoted = f'"{transcription}" - {data["name"]}, {data["position"]}'
            transcription_lines.append(quoted)
            raw_lines.append(f'[Audio/Video {idx + 1}]: {quoted}')

    for data in knowledge_base["social_content"]:
        if data["text"]:
            text_line = f'[Social media text]: "{data["text"][:200]}..." - {data["name"]}, {data["context"]}'
            transcription_lines.append(text_line)
            raw_lines.append(text_line)
        if data["video"]:
            video_line = f'[Social media video]: "{data["video"]}" - {data["name"]}, {data["context"]}'
            transcription_lines.append(video_line)
            raw_lines.append(video_line)

    transcriptions_text = "".join(line + "\n" for line in transcription_lines)
    raw_transcriptions = "".join(line + "\n\n" for line in raw_lines)

    document_content = "\n\n".join(knowledge_base["document_content"])
    url_content = "\n\n".join(knowledge_base["url_content"])

    internal_prompt = """
    Instructions for the model:
    - Follow news article principles: answer the 5 Ws in the first paragraph (Who?, What?, When?, Where?, Why?).
    - Ensure at least 80% of quotes are direct and in quotation marks.
    - The remaining 20% can be indirect quotes.
    - Don't invent new information.
    - Be rigorous with provided facts.
    - When processing uploaded documents, extract and highlight important quotes and testimonials from sources.
    - When processing uploaded documents, extract and highlight key figures.
    - Avoid using the date at the beginning of the news body. Start directly with the 5Ws.
    - Include social media content relevantly, citing the source and providing proper context.
    - Make sure to relate the provided context for social media content with its corresponding transcription or text.
    """

    prompt = f"""
    {internal_prompt}
    Write a news article with the following information, including a title, a 15-word hook (additional information that complements the title), and the content body with {size} words. The tone should be {tone}.
    Instructions: {knowledge_base["instructions"]}
    Facts: {knowledge_base["facts"]}
    Additional content from documents: {document_content}
    Additional content from URLs: {url_content}
    Use the following transcriptions as direct and indirect quotes (without changing or inventing content):
    {transcriptions_text}
    """

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1  # low temperature: stick closely to the provided facts
        )
        news = response['choices'][0]['message']['content']
        return news, raw_transcriptions
    except Exception as e:
        logger.error(f"Error generating news article: {str(e)}")
        return f"Error generating news article: {str(e)}", ""
258
+
259
with gr.Blocks() as demo:
    gr.Markdown("## All-in-One News Generator")

    # Tool description and author attribution.
    gr.Markdown("""
    ### About this tool

    This AI-powered news generator helps journalists and content creators produce news articles by processing multiple types of input:
    - Audio and video files with automatic transcription
    - Social media content
    - Documents (PDF, DOCX, XLSX, CSV)
    - Web URLs

    The tool uses advanced AI to generate well-structured news articles following journalistic principles and maintaining the integrity of source quotes.

    Created by [Camilo Vega](https://www.linkedin.com/in/camilo-vega-169084b1/), AI Consultant
    """)

    with gr.Row():
        with gr.Column(scale=2):
            instructions = gr.Textbox(label="News article instructions", lines=2)
            facts = gr.Textbox(label="Describe the news facts", lines=4)
            size = gr.Number(label="Content body size (in words)", value=100)
            tone = gr.Dropdown(label="News tone", choices=["serious", "neutral", "lighthearted"], value="neutral")
        with gr.Column(scale=3):
            # The order of inputs_list is the contract generate_news relies
            # on when it slices *args — keep the groups in this sequence.
            inputs_list = [instructions, facts, size, tone]
            with gr.Tabs():
                for idx in range(1, 6):
                    with gr.TabItem(f"Audio/Video {idx}"):
                        media_file = gr.File(label=f"Audio/Video {idx}", type="filepath", file_types=["audio", "video"])
                        speaker_name = gr.Textbox(label="Name", scale=1)
                        speaker_position = gr.Textbox(label="Position", scale=1)
                        inputs_list.extend([media_file, speaker_name, speaker_position])
                for idx in range(1, 4):
                    with gr.TabItem(f"Social Media {idx}"):
                        social_url = gr.Textbox(label=f"Social media URL {idx}", lines=1)
                        social_name = gr.Textbox(label=f"Person/account name {idx}", scale=1)
                        social_context = gr.Textbox(label=f"Content context {idx}", lines=2)
                        inputs_list.extend([social_url, social_name, social_context])
                for idx in range(1, 6):
                    with gr.TabItem(f"URL {idx}"):
                        inputs_list.append(gr.Textbox(label=f"URL {idx}", lines=1))
                for idx in range(1, 6):
                    with gr.TabItem(f"Document {idx}"):
                        inputs_list.append(gr.File(label=f"Document {idx}", type="filepath", file_count="single"))

    gr.Markdown("---")  # Visual separator

    with gr.Row():
        transcriptions_output = gr.Textbox(label="Transcriptions", lines=10)

    gr.Markdown("---")  # Visual separator

    with gr.Row():
        generate = gr.Button("Generate Draft")
    with gr.Row():
        news_output = gr.Textbox(label="Generated Draft", lines=20)

    generate.click(fn=generate_news, inputs=inputs_list, outputs=[news_output, transcriptions_output])

    # Usage guide shown below the controls.
    gr.Markdown("""
    ### How to Use This App

    1. **Input your requirements:**
       - Enter your news article instructions
       - Describe the key facts of your news story
       - Set the desired word count and tone

    2. **Add your sources:**
       - Upload audio/video files for automatic transcription
       - Add social media URLs to extract content
       - Include web URLs for additional information
       - Upload documents (PDF, DOCX, XLSX, CSV) to extract relevant data

    3. **Generate your draft:**
       - Click "Generate Draft" to create your news article
       - Review the transcriptions to verify source accuracy
       - Use the generated draft as a starting point for your news story

    This tool helps streamline the news writing process by automatically gathering, organizing, and synthesizing information from multiple sources into a cohesive article that follows journalistic best practices.

    Created by [Camilo Vega](https://www.linkedin.com/in/camilo-vega-169084b1/), AI Consultant
    """)

demo.launch(share=True)