Spaces:

NoticIA-Col
/

Generador-Noticias

Running

App Files Community

CamiloVega committed on Aug 12, 2024

Commit

654a56c

verified ·

1 Parent(s): a62e63a

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -98

app.py CHANGED Viewed

@@ -4,24 +4,24 @@ import whisper
 import tempfile
 import gradio as gr
 from pydub import AudioSegment
-import fitz  # PyMuPDF para manejar PDFs
-import docx  # Para manejar archivos .docx
-import pandas as pd  # Para manejar archivos .xlsx y .csv
-# from google.colab import userdata  # Importa userdata de google.colab
 import requests
 from bs4 import BeautifulSoup
-# Configura tu clave API de OpenAI usando Google Colab userdata
 # openai.api_key = userdata.get('OPENAI_API_KEY')
-# Cargar las variables de entorno desde el entorno de Hugging Face
 openai.api_key = os.getenv("OPENAI_API_KEY")
-# Cargar el modelo Whisper de mayor calidad una vez
 model = whisper.load_model("large")
 def preprocess_audio(audio_file):
-    """Preprocesa el archivo de audio para mejorar la calidad."""
     try:
         audio = AudioSegment.from_file(audio_file)
         audio = audio.apply_gain(-audio.dBFS + (-20))
@@ -29,150 +29,150 @@ def preprocess_audio(audio_file):
             audio.export(temp_file.name, format="mp3")
             return temp_file.name
     except Exception as e:
-        return f"Error al preprocesar el archivo de audio: {str(e)}"
-def transcribir_audio(audio_file):
-    """Transcribe un archivo de audio."""
     try:
-        archivo_path = preprocess_audio(audio_file) if isinstance(audio_file, str) else preprocess_audio(tempfile.NamedTemporaryFile(delete=False, suffix=".mp3", mode='w+b').name)
-        resultado = model.transcribe(archivo_path)
-        return resultado.get("text", "Error en la transcripción")
     except Exception as e:
-        return f"Error al procesar el archivo de audio: {str(e)}"
-def leer_documento(documento_path):
-    """Lee el contenido de un documento PDF, DOCX, XLSX o CSV."""
     try:
-        if documento_path.endswith(".pdf"):
-            doc = fitz.open(documento_path)
-            return "\n".join([pagina.get_text() for pagina in doc])
-        elif documento_path.endswith(".docx"):
-            doc = docx.Document(documento_path)
-            return "\n".join([parrafo.text for parrafo in doc.paragraphs])
-        elif documento_path.endswith(".xlsx"):
-            return pd.read_excel(documento_path).to_string()
-        elif documento_path.endswith(".csv"):
-            return pd.read_csv(documento_path).to_string()
         else:
-            return "Tipo de archivo no soportado. Por favor suba un documento PDF, DOCX, XLSX o CSV."
     except Exception as e:
-        return f"Error al leer el documento: {str(e)}"
-def leer_url(url):
-    """Lee el contenido de una URL."""
     try:
         response = requests.get(url)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
         return soup.get_text()
     except Exception as e:
-        return f"Error al leer la URL: {str(e)}"
-def generar_noticia(instrucciones, hechos, tamaño, tono, urls, *args):
-    """Genera una noticia a partir de instrucciones, hechos, URLs, documentos y transcripciones."""
-    base_de_conocimiento = {"instrucciones": instrucciones, "hechos": hechos, "contenido_documentos": [], "audio_data": [], "contenido_urls": []}
-    num_audios = 5 * 3  # 5 audios * 3 campos (audio, nombre, cargo)
     audios = args[:num_audios]
-    documentos = args[num_audios:]
     for url in urls.split():
         if url:
-            base_de_conocimiento["contenido_urls"].append(leer_url(url))
-    for documento in documentos:
-        if documento is not None:
-            base_de_conocimiento["contenido_documentos"].append(leer_documento(documento.name))
     for i in range(0, len(audios), 3):
-        audio_file, nombre, cargo = audios[i:i+3]
         if audio_file is not None:
-            base_de_conocimiento["audio_data"].append({"audio": audio_file, "nombre": nombre, "cargo": cargo})
-    transcripciones_texto, transcripciones_brutas, total_citas_directas = "", "", 0
-    for idx, data in enumerate(base_de_conocimiento["audio_data"]):
         if data["audio"] is not None:
-            transcripcion = transcribir_audio(data["audio"])
-            transcripcion_texto = f'"{transcripcion}" - {data["nombre"]}, {data["cargo"]}'
-            transcripcion_bruta = f'[Audio {idx + 1}]: "{transcripcion}" - {data["nombre"]}, {data["cargo"]}'
-            if total_citas_directas < len(base_de_conocimiento["audio_data"]) * 0.8:
-                transcripciones_texto += transcripcion_texto + "\n"
-                total_citas_directas += 1
             else:
-                transcripciones_texto += f'{data["nombre"]} mencionó que {transcripcion}' + "\n"
-            transcripciones_brutas += transcripcion_bruta + "\n\n"
-    contenido_documentos = "\n\n".join(base_de_conocimiento["contenido_documentos"])
-    contenido_urls = "\n\n".join(base_de_conocimiento["contenido_urls"])
-    prompt_interno = """
-    Instrucciones para el modelo:
-    - Debes seguir los principios de una noticia: es decir, procura siempre responder las 5 W de una noticia en el primer párrafo (Who?, What?, When?, Where?, Why?).
-    - Asegúrate de que al menos el 80% de las citas sean directas y estén entrecomilladas.
-    - El 20% restante puede ser citas indirectas.
-    - No inventes información nueva.
-    - Sé riguroso con los hechos proporcionados.
-    - Al procesar los documentos cargados, extrae y resalta citas importantes y testimonios textuales de las fuentes.
-    - Al procesar los documentos cargados, extrae y resalta cifras clave.
     """
     prompt = f"""
-    {prompt_interno}
-    Escribe una noticia con la siguiente información, incluyendo un título, un gancho de 15 palabras (el gancho es lo que se conoce en inglés como hook, información adicional que complementa el título), y el cuerpo del contenido cuyo tamaño es {tamaño} palabras. El tono debe ser {tono}.
-    Instrucciones: {base_de_conocimiento["instrucciones"]}
-    Hechos: {base_de_conocimiento["hechos"]}
-    Contenido adicional de los documentos: {contenido_documentos}
-    Contenido adicional de las URLs: {contenido_urls}
-    Utiliza las siguientes transcripciones como citas directas e indirectas (sin cambiar ni inventar contenido):
-    {transcripciones_texto}
     """
     try:
-        respuesta = openai.ChatCompletion.create(
             model="gpt-3.5-turbo",
             messages=[{"role": "user", "content": prompt}],
             temperature=0.1
         )
-        noticia = respuesta['choices'][0]['message']['content']
-        return noticia, transcripciones_brutas
     except Exception as e:
-        return f"Error al generar la noticia: {str(e)}", ""
 with gr.Blocks() as demo:
-    gr.Markdown("## Generador noticias todo en uno")
     with gr.Row():
         with gr.Column(scale=2):
-            instrucciones = gr.Textbox(label="Instrucciones para la noticia", lines=2)
-            hechos = gr.Textbox(label="Describe los hechos de la noticia", lines=4)
-            tamaño = gr.Number(label="Tamaño del cuerpo de la noticia (en palabras)", value=100)
-            tono = gr.Dropdown(label="Tono de la noticia", choices=["serio", "neutral", "divertido"], value="neutral")
-            urls = gr.Textbox(label="URLs (separadas por espacio)", lines=2)
         with gr.Column(scale=3):
-            inputs_list = [instrucciones, hechos, tamaño, tono, urls]
             with gr.Tabs():
                 for i in range(1, 6):
                     with gr.TabItem(f"Audio {i}"):
                         audio = gr.Audio(type="filepath", label=f"Audio {i}")
-                        nombre = gr.Textbox(label="Nombre", scale=1)
-                        cargo = gr.Textbox(label="Cargo", scale=1)
-                        inputs_list.extend([audio, nombre, cargo])
                 for i in range(1, 6):
-                    with gr.TabItem(f"Documento {i}"):
-                        documento = gr.File(label=f"Documento {i}", type="filepath", file_count="single")
-                        inputs_list.append(documento)
-    gr.Markdown("---") # Separador visual
     with gr.Row():
-        transcripciones_output = gr.Textbox(label="Transcripciones", lines=10)
-    gr.Markdown("---") # Separador visual
     with gr.Row():
-        generar = gr.Button("Generar borrador")
     with gr.Row():
-        noticia_output = gr.Textbox(label="Borrador generado", lines=20)
-    generar.click(fn=generar_noticia, inputs=inputs_list, outputs=[noticia_output, transcripciones_output])
-demo.launch(share=True)

 import tempfile
 import gradio as gr
 from pydub import AudioSegment
+import fitz  # PyMuPDF for handling PDFs
+import docx  # For handling .docx files
+import pandas as pd  # For handling .xlsx and .csv files
+# from google.colab import userdata  # Import userdata from google.colab
 import requests
 from bs4 import BeautifulSoup
+# Configure your OpenAI API key using Google Colab userdata
 # openai.api_key = userdata.get('OPENAI_API_KEY')
+# Load environment variables from the Hugging Face environment
 openai.api_key = os.getenv("OPENAI_API_KEY")
+# Load the highest quality Whisper model once
 model = whisper.load_model("large")
 def preprocess_audio(audio_file):
+    """Preprocess the audio file to improve quality."""
     try:
         audio = AudioSegment.from_file(audio_file)
         audio = audio.apply_gain(-audio.dBFS + (-20))
             audio.export(temp_file.name, format="mp3")
             return temp_file.name
     except Exception as e:
+        return f"Error preprocessing the audio file: {str(e)}"
def transcribe_audio(audio_file):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        audio_file: Path to the audio file (``str`` or ``os.PathLike``).

    Returns:
        The transcribed text on success. On any failure (input that is not a
        path, preprocessing failure, transcription error) a message starting
        with ``"Error processing the audio file:"`` is returned instead of
        raising.
    """
    # Anything that is not a path (e.g. None from an empty upload slot) cannot
    # be transcribed. The previous fallback created an empty temporary file,
    # never removed it, and tried to transcribe that instead of the input.
    if not isinstance(audio_file, (str, os.PathLike)):
        return (
            "Error processing the audio file: expected a file path, got "
            f"{type(audio_file).__name__}"
        )
    try:
        file_path = preprocess_audio(audio_file)
        # preprocess_audio reports failure by returning a message rather than
        # a path; surface that message instead of handing it to Whisper as if
        # it were a file name.
        if not os.path.isfile(file_path):
            return f"Error processing the audio file: {file_path}"
        result = model.transcribe(file_path)
        return result.get("text", "Error in transcription")
    except Exception as e:
        return f"Error processing the audio file: {str(e)}"
def read_document(document_path):
    """Read the text content of a PDF, DOCX, XLSX or CSV document.

    The file type is chosen from the file extension, compared
    case-insensitively so that names such as ``REPORT.PDF`` are accepted.

    Args:
        document_path: Path to the document (``str`` or ``os.PathLike``).

    Returns:
        The extracted text. PDF pages and DOCX paragraphs are joined with
        newlines; spreadsheets are rendered with ``DataFrame.to_string()``.
        For an unsupported extension, or when reading fails, a human-readable
        message is returned instead of raising.
    """
    try:
        # os.fspath raises TypeError for None and other non-path values, which
        # the handler below turns into the usual error message.
        path = os.fspath(document_path)
        lowered = path.lower()
        if lowered.endswith(".pdf"):
            # The context manager closes the PDF once the text is extracted.
            with fitz.open(path) as doc:
                return "\n".join(page.get_text() for page in doc)
        if lowered.endswith(".docx"):
            doc = docx.Document(path)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)
        if lowered.endswith(".xlsx"):
            return pd.read_excel(path).to_string()
        if lowered.endswith(".csv"):
            return pd.read_csv(path).to_string()
        return "Unsupported file type. Please upload a PDF, DOCX, XLSX or CSV document."
    except Exception as e:
        return f"Error reading the document: {str(e)}"
def read_url(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The text extracted from the HTML by BeautifulSoup, or a message
        starting with ``"Error reading the URL:"`` when the request or the
        parsing fails (including HTTP error statuses).
    """
    # NOTE(review): the URL comes from user input and is fetched server-side;
    # consider restricting schemes/hosts if this runs on a private network.
    try:
        # Without a timeout requests waits indefinitely on an unresponsive
        # host, which would stall the whole generation request.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except Exception as e:
        return f"Error reading the URL: {str(e)}"
def generate_news(instructions, facts, size, tone, urls, *args):
    """Generate a news article draft from instructions, facts, URLs, documents and audio.

    Args:
        instructions: Editorial instructions inserted into the prompt.
        facts: Description of the facts inserted into the prompt.
        size: Requested length of the article body, in words.
        tone: Requested tone of the article.
        urls: Whitespace-separated URLs whose text is added to the prompt.
            ``None`` or an empty string means no URLs.
        *args: First 15 values are five ``(audio, name, position)`` triples,
            flattened; every value after that is an uploaded document, given
            either as a path string or as an object exposing the path in
            ``.name``. ``None`` marks an empty slot.

    Returns:
        A ``(article, raw_transcriptions)`` tuple. When the model call fails
        the first element is an error message and the second is ``""``.
    """
    knowledge_base = {"instructions": instructions, "facts": facts, "document_content": [], "audio_data": [], "url_content": []}
    num_audios = 5 * 3  # 5 audios * 3 fields (audio, name, position)
    audios = args[:num_audios]
    documents = args[num_audios:]

    # str.split() never yields empty items; `or ""` tolerates a missing value.
    for url in (urls or "").split():
        knowledge_base["url_content"].append(read_url(url))

    for document in documents:
        if document is None:
            continue
        # The file input is declared with type="filepath", so the value is
        # normally a plain path string; fall back to `.name` for file-like
        # wrappers. Calling `.name` on a string raised AttributeError.
        document_path = getattr(document, "name", document)
        knowledge_base["document_content"].append(read_document(document_path))

    # Walk the flattened triples; zip() ignores an incomplete trailing group
    # instead of failing on unpacking.
    for audio_file, name, position in zip(audios[0::3], audios[1::3], audios[2::3]):
        if audio_file is not None:
            knowledge_base["audio_data"].append({"audio": audio_file, "name": name, "position": position})

    quote_lines = []
    raw_blocks = []
    total_direct_quotes = 0
    # Only the first 80% of the recordings are presented as direct quotes;
    # the remainder are phrased as indirect speech.
    direct_quote_budget = len(knowledge_base["audio_data"]) * 0.8
    for idx, data in enumerate(knowledge_base["audio_data"]):
        transcription = transcribe_audio(data["audio"])
        attributed_quote = f'"{transcription}" - {data["name"]}, {data["position"]}'
        if total_direct_quotes < direct_quote_budget:
            quote_lines.append(attributed_quote)
            total_direct_quotes += 1
        else:
            quote_lines.append(f'{data["name"]} mentioned that {transcription}')
        raw_blocks.append(f'[Audio {idx + 1}]: {attributed_quote}')
    transcriptions_text = "".join(line + "\n" for line in quote_lines)
    raw_transcriptions = "".join(block + "\n\n" for block in raw_blocks)

    document_content = "\n\n".join(knowledge_base["document_content"])
    url_content = "\n\n".join(knowledge_base["url_content"])
    internal_prompt = """
    Instructions for the model:
    - Follow the principles of news writing: always try to answer the 5 Ws of a news story in the first paragraph (Who?, What?, When?, Where?, Why?).
    - Ensure that at least 80% of the quotes are direct and in quotation marks.
    - The remaining 20% can be indirect quotes.
    - Do not invent new information.
    - Be rigorous with the provided facts.
    - When processing uploaded documents, extract and highlight important quotes and verbatim testimonies from sources.
    - When processing uploaded documents, extract and highlight key figures.
    """
    prompt = f"""
    {internal_prompt}
    Write a news article with the following information, including a title, a 15-word hook (additional information that complements the title), and the body content with a size of {size} words. The tone should be {tone}.
    Instructions: {knowledge_base["instructions"]}
    Facts: {knowledge_base["facts"]}
    Additional content from documents: {document_content}
    Additional content from URLs: {url_content}
    Use the following transcriptions as direct and indirect quotes (without changing or inventing content):
    {transcriptions_text}
    """
    try:
        # NOTE(review): openai.ChatCompletion belongs to the pre-1.0 openai
        # SDK — confirm the installed version before upgrading.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )
        news_article = response['choices'][0]['message']['content']
        return news_article, raw_transcriptions
    except Exception as e:
        return f"Error generating the news article: {str(e)}", ""
 with gr.Blocks() as demo:
+    gr.Markdown("## All-in-One News Generator")
     with gr.Row():
         with gr.Column(scale=2):
+            instructions = gr.Textbox(label="Instructions for the news article", lines=2)
+            facts = gr.Textbox(label="Describe the facts of the news", lines=4)
+            size = gr.Number(label="Size of the news body (in words)", value=100)
+            tone = gr.Dropdown(label="Tone of the news", choices=["serious", "neutral", "lighthearted"], value="neutral")
+            urls = gr.Textbox(label="URLs (separated by space)", lines=2)
         with gr.Column(scale=3):
+            inputs_list = [instructions, facts, size, tone, urls]
             with gr.Tabs():
                 for i in range(1, 6):
                     with gr.TabItem(f"Audio {i}"):
                         audio = gr.Audio(type="filepath", label=f"Audio {i}")
+                        name = gr.Textbox(label="Name", scale=1)
+                        position = gr.Textbox(label="Position", scale=1)
+                        inputs_list.extend([audio, name, position])
                 for i in range(1, 6):
+                    with gr.TabItem(f"Document {i}"):
+                        document = gr.File(label=f"Document {i}", type="filepath", file_count="single")
+                        inputs_list.append(document)
+    gr.Markdown("---") # Visual separator
     with gr.Row():
+        transcriptions_output = gr.Textbox(label="Transcriptions", lines=10)
+    gr.Markdown("---") # Visual separator
     with gr.Row():
+        generate = gr.Button("Generate draft")
     with gr.Row():
+        news_output = gr.Textbox(label="Generated draft", lines=20)
+    generate.click(fn=generate_news, inputs=inputs_list, outputs=[news_output, transcriptions_output])
+demo.launch(share=True)