Spaces:

manuelcozar55
/

LexAIcon_Mistral7B

Paused

File size: 5,216 Bytes

23a1957
 
 
7413e10
23a1957
 
 
dcd74ef
23a1957
 
 
 
e988252
 
cdd50f7
cc439f4
a80aa7b
23a1957
 
 
 
 
 
7413e10
23a1957
 
 
 
 
 
 
 
 
 
 
 
 
 
cc439f4
b169e9b
23a1957
 
 
 
 
 
 
b169e9b
 
a80aa7b
23a1957
b169e9b
23a1957
b169e9b
a80aa7b
23a1957
7413e10
23a1957
 
 
7413e10
23a1957
 
 
 
a80aa7b
cc439f4
23a1957
 
 
 
 
 
 
 
6a25acd
cc439f4
23a1957
 
 
 
 
 
 
 
7413e10
23a1957
a80aa7b
23a1957
 
 
a80aa7b
23a1957
 
 
 
 
 
 
 
 
 
a80aa7b
23a1957
b169e9b
23a1957
b169e9b
23a1957
 
 
 
 
 
 
6a25acd
23a1957
 
 
 
 
 
ba072b0
42d7a05
23a1957
 
 
 
 
 
 
 
 
 
b169e9b
23a1957
b169e9b
23a1957
b169e9b
ba072b0
b169e9b
23a1957
 
b169e9b
23a1957
 
5cfd5eb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import PyPDF2
import gradio as gr
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from huggingface_hub import login
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os

# Authenticate against the Hugging Face Hub with the token read from the
# HUGGINGFACE_TOKEN environment variable (os.getenv yields None when unset).
login(token=os.getenv('HUGGINGFACE_TOKEN'))

# LLM configuration: a Mistral-7B-Instruct endpoint used for text generation.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=4096,
    # NOTE(review): temperature is set while do_sample=False; a sampling
    # temperature presumably has no effect without sampling -- confirm intent.
    temperature=0.5,
    do_sample=False,
)
# Chat wrapper around the endpoint; summarize() and translate() invoke it.
llm_engine_hf = ChatHuggingFace(llm=llm)

# Classification model configuration: the tokenizer and the
# sequence-classification model are loaded from the same checkpoint.
# NOTE(review): no num_labels/id2label is passed to from_pretrained although
# id2label below has five entries, and the checkpoint name suggests a base
# (not fine-tuned) model -- verify the classification head matches id2label.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")

# Maps a predicted class index to its document-category label.
id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}

def read_pdf(file_path):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        file_path: Location of the PDF, handed directly to
            ``PyPDF2.PdfReader``.

    Returns:
        The extracted text of all pages joined in page order, with no
        separator inserted between pages.
    """
    reader = PyPDF2.PdfReader(file_path)
    return "".join(page.extract_text() for page in reader.pages)

def summarize(file):
    """Summarize an uploaded document as a concise bulleted list.

    The document text is embedded in a Spanish prompt that asks for at most
    10 key points, and the prompt is sent to ``llm_engine_hf``.

    Args:
        file: Uploaded-file object whose ``name`` attribute is the path of
            the file on disk.

    Returns:
        A string containing the full formatted prompt followed by the
        model's summary.
    """
    # Read the contents of the uploaded file.
    file_path = file.name
    # Compare the extension case-insensitively so that e.g. "REPORT.PDF" is
    # parsed as a PDF instead of being opened as UTF-8 text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    template = '''
    Por favor, lea detenidamente el siguiente documento:
<document>
{TEXT}
</document>
Después de leer el documento, identifique los puntos clave y las ideas principales cubiertas en el texto. Organice estos puntos clave en una lista con viñetas concisa que resuma la información esencial del documento. El resumen debe tener un máximo de 10 puntos.
Su objetivo es ser exhaustivo en la captura del contenido central del documento, mientras que también es conciso en la expresión de cada punto del resumen. Omita los detalles menores y concéntrese en los temas centrales y hechos importantes.
    '''

    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT'],
    )

    formatted_prompt = prompt.format(TEXT=text)
    output_summary = llm_engine_hf.invoke(formatted_prompt)

    # The prompt is echoed back ahead of the summary in the returned text.
    return f"Prompt:\n{formatted_prompt}\n\nResumen:\n{output_summary.content}"

def classify_text(text):
    """Predict the document category for a piece of text.

    The text is tokenized (truncated and padded to 4096 tokens), run through
    the module-level ``model`` with gradient tracking disabled, and the
    highest-scoring class index is looked up in ``id2label``.

    Args:
        text: The document text to classify.

    Returns:
        The ``id2label`` entry for the class with the largest logit.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=4096,
        truncation=True,
        padding="max_length",
    )
    # Put the model in evaluation mode before running inference.
    model.eval()
    with torch.no_grad():
        result = model(**encoded)
    best_index = result.logits.argmax(dim=-1).item()
    return id2label[best_index]

def translate(file, target_language):
    """Translate an uploaded document into the requested language.

    The document text and the target language are embedded in a Spanish
    prompt, which is sent to ``llm_engine_hf``.

    Args:
        file: Uploaded-file object whose ``name`` attribute is the path of
            the file on disk.
        target_language: Language identifier inserted verbatim into the
            prompt (the UI offers "en", "fr" and "de").

    Returns:
        A string containing the full formatted prompt followed by the
        model's translation.
    """
    # Read the contents of the uploaded file.
    file_path = file.name
    # Compare the extension case-insensitively so that e.g. "REPORT.PDF" is
    # parsed as a PDF instead of being opened as UTF-8 text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    template = '''
    Por favor, traduzca el siguiente documento al {LANGUAGE}:
<document>
{TEXT}
</document>
Asegúrese de que la traducción sea precisa y conserve el significado original del documento.
    '''

    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'LANGUAGE'],
    )

    formatted_prompt = prompt.format(TEXT=text, LANGUAGE=target_language)
    translated_text = llm_engine_hf.invoke(formatted_prompt)

    # The prompt is echoed back ahead of the translation in the returned text.
    return f"Prompt:\n{formatted_prompt}\n\nTraducción:\n{translated_text.content}"

def _read_uploaded_text(file):
    """Return the text of an uploaded file, parsing PDFs via ``read_pdf``."""
    file_path = file.name
    # Case-insensitive extension check so "X.PDF" is not read as UTF-8 text.
    if file_path.lower().endswith('.pdf'):
        return read_pdf(file_path)
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()


def process_file(file, action, target_language=None):
    """Dispatch an uploaded file to the action chosen in the UI.

    Args:
        file: Uploaded-file object exposing the on-disk path as ``name``,
            or ``None`` when nothing has been uploaded.
        action: One of "Resumen", "Clasificar" or "Traducir".
        target_language: Language to translate into; only used by the
            "Traducir" action.

    Returns:
        The result of the selected action as text, or a short user-facing
        message when the action is unknown, no file was uploaded, or a
        translation was requested without a target language.
    """
    if action not in ("Resumen", "Clasificar", "Traducir"):
        return "Acción no válida"

    # Without this guard a missing upload fails on ``file.name`` with an
    # AttributeError instead of telling the user what is wrong.
    if file is None:
        return "Por favor, suba un archivo."

    if action == "Resumen":
        return summarize(file)

    if action == "Clasificar":
        return classify_text(_read_uploaded_text(file))

    # Remaining action is "Traducir". An empty/None language would otherwise
    # be formatted straight into the prompt (e.g. "al None").
    if not target_language:
        return "Por favor, seleccione un idioma de traducción."
    return translate(file, target_language)

# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## Procesador de Documentos")

    with gr.Row():
        # Left column: file upload, action selector and language picker.
        with gr.Column():
            file = gr.File(label="Subir un archivo")
            action = gr.Radio(
                choices=["Resumen", "Clasificar", "Traducir"],
                label="Seleccione una acción",
            )
            # Starts hidden; shown only while the translate action is chosen.
            target_language = gr.Dropdown(
                choices=["en", "fr", "de"],
                label="Seleccionar idioma de traducción",
                visible=False,
            )

        # Right column: textual result of the selected action.
        with gr.Column():
            output_text = gr.Textbox(label="Resultado", lines=20)

    def update_language_dropdown(action):
        """Make the language dropdown visible only for the "Traducir" action."""
        return gr.update(visible=(action == "Traducir"))

    action.change(update_language_dropdown, inputs=action, outputs=target_language)

    submit_button = gr.Button("Procesar")
    submit_button.click(
        process_file,
        inputs=[file, action, target_language],
        outputs=output_text,
    )

# Launch the Gradio application.
demo.launch(share=True)