|
import warnings |
|
warnings.simplefilter(action='ignore', category=FutureWarning) |
|
|
|
import PyPDF2 |
|
import gradio as gr |
|
from langchain.prompts import PromptTemplate |
|
from langchain.chains.summarize import load_summarize_chain |
|
from huggingface_hub import login |
|
from pathlib import Path |
|
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint |
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification |
|
import torch |
|
import os |
|
|
|
# Authenticate against the Hugging Face Hub using the token found in the
# HUGGINGFACE_TOKEN environment variable (None is passed when it is unset).
_hf_token = os.getenv('HUGGINGFACE_TOKEN')
login(token=_hf_token)

# Generation settings for the hosted instruction-tuned model.
# NOTE(review): `temperature` is set while `do_sample` is False; presumably the
# temperature has no effect under greedy decoding - confirm which was intended.
_endpoint_options = {
    "repo_id": "mistralai/Mistral-7B-Instruct-v0.3",
    "task": "text-generation",
    "max_new_tokens": 4096,
    "temperature": 0.5,
    "do_sample": False,
}
llm = HuggingFaceEndpoint(**_endpoint_options)

# Chat wrapper used by the summarise / translate / question-answering helpers.
llm_engine_hf = ChatHuggingFace(llm=llm)
|
|
|
|
|
# Checkpoint shared by the tokenizer and the sequence-classification model.
_CLASSIFIER_CHECKPOINT = "mrm8488/legal-longformer-base-8192-spanish"

tokenizer = AutoTokenizer.from_pretrained(_CLASSIFIER_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_CLASSIFIER_CHECKPOINT)

# Maps the predicted class index to a document category.
# NOTE(review): no `num_labels` is passed when loading the model, so the size
# of the classification head comes from the checkpoint's own configuration -
# confirm that it really has five outputs matching this table.
id2label = dict(enumerate([
    "multas",
    "politicas_de_privacidad",
    "contratos",
    "denuncias",
    "otros",
]))
|
|
|
def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file, handed to ``PyPDF2.PdfReader``.

    Returns:
        A single string with the extracted text of all pages (no separator
        is inserted between pages).
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # Iterate the pages directly and join once instead of indexing by position
    # and growing a string with ``+=``.  ``or ""`` keeps a page that yields no
    # text from breaking the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
|
|
def summarize(file, summary_length):
    """Ask the chat model for a bulleted Spanish summary of an uploaded file.

    Args:
        file: Uploaded file object; only its ``name`` attribute (a path on
            disk) is used.  PDF files are read with ``read_pdf``; anything
            else is read as UTF-8 text.
        summary_length: ``'Corto'`` limits the summary to 100 words,
            ``'Medio'`` to 500; any other value (including ``None``) to 1000.

    Returns:
        A string containing the full prompt sent to the model followed by the
        model's summary.
    """
    file_path = file.name
    # Compare case-insensitively so that "REPORT.PDF" is not read as text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    word_limits = {'Corto': 100, 'Medio': 500}
    max_words = word_limits.get(summary_length, 1000)
    length_instruction = f"El resumen debe tener un máximo de {max_words} palabras."

    # Both the document and the length instruction are template variables, so
    # no f-string brace escaping is needed.
    template = (
        "\n"
        "Por favor, lea detenidamente el siguiente documento:\n"
        "<document>\n"
        "{TEXT}\n"
        "</document>\n"
        "Después de leer el documento, identifique los puntos clave y las "
        "ideas principales cubiertas en el texto. Organice estos puntos clave "
        "en una lista con viñetas concisa que resuma la información esencial "
        "del documento. {LENGTH_INSTRUCTION}\n"
        "Su objetivo es ser exhaustivo en la captura del contenido central "
        "del documento, mientras que también es conciso en la expresión de "
        "cada punto del resumen. Omita los detalles menores y concéntrese en "
        "los temas centrales y hechos importantes.\n"
    )

    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'LENGTH_INSTRUCTION'],
    )

    formatted_prompt = prompt.format(
        TEXT=text,
        LENGTH_INSTRUCTION=length_instruction,
    )
    output_summary = llm_engine_hf.invoke(formatted_prompt)

    return f"Prompt:\n{formatted_prompt}\n\nResumen:\n{output_summary.content}"
|
|
|
def classify_text(text):
    """Predict the category of a document with the sequence-classification model.

    Args:
        text: Document text.  It is tokenised with truncation and padding to
            4096 tokens before being passed to the model.

    Returns:
        A ``(label, text)`` tuple: the ``id2label`` entry for the
        highest-scoring class, and the input text unchanged.
    """
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=4096,
        return_tensors="pt",
    )

    # Inference only: evaluation mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        scores = model(**encoded).logits

    class_id = torch.argmax(scores, dim=-1).item()
    return id2label[class_id], text
|
|
|
def translate(file, target_language):
    """Ask the chat model to translate an uploaded file.

    Args:
        file: Uploaded file object; only its ``name`` attribute (a path on
            disk) is used.  PDF files are read with ``read_pdf``; anything
            else is read as UTF-8 text.
        target_language: Language to translate into.  The codes ``"en"``,
            ``"fr"`` and ``"de"`` are replaced by their Spanish names in the
            prompt; any other value is inserted as given.

    Returns:
        A string containing the full prompt sent to the model followed by the
        model's translation.
    """
    file_path = file.name
    # Compare case-insensitively so that "REPORT.PDF" is not read as text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    # The prompt is written in Spanish, where a bare code such as "en" reads
    # as the preposition "en"; spell the language out instead.
    language_names = {"en": "inglés", "fr": "francés", "de": "alemán"}
    language = language_names.get(target_language, target_language)

    template = (
        "\n"
        "Por favor, traduzca el siguiente documento al {LANGUAGE}:\n"
        "<document>\n"
        "{TEXT}\n"
        "</document>\n"
        "Asegúrese de que la traducción sea precisa y conserve el significado "
        "original del documento.\n"
    )

    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'LANGUAGE'],
    )

    formatted_prompt = prompt.format(TEXT=text, LANGUAGE=language)
    translated_text = llm_engine_hf.invoke(formatted_prompt)

    return f"Prompt:\n{formatted_prompt}\n\nTraducción:\n{translated_text.content}"
|
|
|
def process_file(file, action, target_language=None, summary_length=None):
    """Dispatch an uploaded file to the summarise, classify or translate path.

    Args:
        file: Uploaded file object (its ``name`` attribute is a path on disk),
            or ``None`` when nothing has been uploaded.
        action: ``"Resumen"``, ``"Clasificar"`` or ``"Traducir"``.
        target_language: Forwarded to ``translate`` for ``"Traducir"``.
        summary_length: Forwarded to ``summarize`` for ``"Resumen"``.

    Returns:
        The text to display: the result of the chosen action, or a short
        message when the action is unknown or no file was provided.
    """
    if action not in ("Resumen", "Clasificar", "Traducir"):
        return "Acción no válida"

    # Without this guard a missing upload fails with AttributeError on
    # ``file.name`` further down.
    if file is None:
        return "No se ha subido ningún archivo"

    if action == "Resumen":
        return summarize(file, summary_length)

    if action == "Traducir":
        return translate(file, target_language)

    # action == "Clasificar"
    file_path = file.name
    # Compare case-insensitively so that "REPORT.PDF" is not read as text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    label, document_text = classify_text(text)
    return f"Clasificación: {label}\n\nDocumento:\n{document_text}"
|
|
|
def answer_question(text, question):
    """Answer a question about a document using the chat model.

    Args:
        text: The document text the question refers to.
        question: The user's question.

    Returns:
        The content of the model's reply.
    """
    # Document and question travel in a single user turn: the Mistral instruct
    # chat template requires user/assistant roles to alternate, so two
    # consecutive user messages are rejected.
    messages = [
        {"role": "system", "content": "Eres un asistente útil."},
        {
            "role": "user",
            "content": f"El documento es el siguiente:\n{text}\n\n{question}",
        },
    ]
    response = llm_engine_hf.invoke(messages)
    return response.content
|
|
|
def download_text(output_text, filename='output.txt'):
    """Write text to a UTF-8 file and return where it was written.

    Args:
        output_text: Text to save.  Nothing is written when it is empty or
            ``None``.
        filename: Destination path; defaults to ``output.txt``.

    Returns:
        A ``pathlib.Path`` for the written file, or ``None`` when there was
        no text to write.
    """
    # Guard clause: nothing to save.
    if not output_text:
        return None

    destination = Path(filename)
    destination.write_text(output_text, encoding='utf-8')
    return destination
|
|
|
def create_download_file(output_text, filename='output.txt'):
    """Save text through ``download_text`` and return the path as a string.

    Args:
        output_text: Text to save.
        filename: Destination path; defaults to ``output.txt``.

    Returns:
        The written file's path as ``str``, or ``None`` when ``download_text``
        wrote nothing.
    """
    written = download_text(output_text, filename)
    if not written:
        return None
    return str(written)
|
|
|
|
|
# Gradio interface: upload a document, pick an action, view / download the
# result and (after classifying) ask questions about the document.
with gr.Blocks() as demo:
    gr.Markdown("## LexAIcon")

    with gr.Row():
        # Left column: inputs and per-action options (hidden until needed).
        with gr.Column():
            file = gr.File(label="Subir un archivo")
            action = gr.Radio(
                label="Seleccione una acción",
                choices=["Resumen", "Clasificar", "Traducir"],
            )
            target_language = gr.Dropdown(
                label="Seleccionar idioma de traducción",
                choices=["en", "fr", "de"],
                visible=False,
            )
            summary_length = gr.Radio(
                label="Seleccione la longitud del resumen",
                choices=["Corto", "Medio", "Largo"],
                visible=False,
            )
            question = gr.Textbox(
                label="Hacer una pregunta al documento",
                lines=1,
                visible=False,
            )
            question_button = gr.Button("Enviar Pregunta", visible=False)

        # Right column: outputs.
        with gr.Column():
            output_text = gr.Textbox(label="Resultado", lines=20)
            answer = gr.Textbox(
                label="Respuesta",
                lines=1,
                interactive=False,
                visible=False,
            )

    def update_visible_elements(action):
        """Show only the widgets that are relevant to the selected action.

        Returns one ``gr.update`` per component, in the order
        (target_language, summary_length, question, question_button,
        output_text, answer).
        """
        show_language = action == "Traducir"
        show_length = action == "Resumen"
        show_questions = action == "Clasificar"
        # process_file writes its result to output_text for every valid
        # action, so the result box must stay visible for all three of them.
        show_result = show_language or show_length or show_questions
        return (
            gr.update(visible=show_language),
            gr.update(visible=show_length),
            gr.update(visible=show_questions),
            gr.update(visible=show_questions),
            gr.update(visible=show_result),
            gr.update(visible=show_questions),
        )

    action.change(
        update_visible_elements,
        inputs=action,
        outputs=[target_language, summary_length, question, question_button, output_text, answer],
    )

    submit_button = gr.Button("Procesar")
    submit_button.click(
        process_file,
        inputs=[file, action, target_language, summary_length],
        outputs=output_text,
    )

    def generate_file(result_text=None, selected_action=None):
        """Save the current result to a text file and return its path.

        The live values arrive as arguments (wired through ``inputs`` below);
        a component's ``.value`` attribute only holds the value it was
        constructed with.  Returns ``None`` when there is no text to save.
        """
        filename = 'translation.txt' if selected_action == 'Traducir' else 'summary.txt'
        return create_download_file(result_text, filename)

    download_button = gr.Button("Descargar Resultado")
    download_output = gr.File()
    download_button.click(
        fn=generate_file,
        inputs=[output_text, action],
        outputs=download_output,
    )

    question_button.click(answer_question, inputs=[output_text, question], outputs=answer)

demo.launch(share=True)
|
|