import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

import os
from pathlib import Path

import PyPDF2
import gradio as gr
import torch
from huggingface_hub import login
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Authenticate against the Hugging Face Hub with a token read from the environment.
login(token=os.getenv('HUGGINGFACE_TOKEN'))


# Remote text-generation endpoint (Mistral-7B-Instruct) wrapped as a LangChain chat model.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=4096,
    temperature=0.5,
    do_sample=False,  # greedy decoding
)
llm_engine_hf = ChatHuggingFace(llm=llm)


# Longformer fine-tuned on Spanish legal text, used to classify the uploaded document.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")

# Mapping from the classifier's output indices to document categories.
id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}


def read_pdf(file_path):
    """Extract and concatenate the text of every page of a PDF."""
    pdf_reader = PyPDF2.PdfReader(file_path)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text


def summarize(file):
    """Summarize the uploaded document as a concise bullet list (at most 10 points)."""
    file_path = file.name
    if file_path.endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    template = '''
Por favor, lea detenidamente el siguiente documento:
<document>
{TEXT}
</document>
Después de leer el documento, identifique los puntos clave y las ideas principales cubiertas en el texto. Organice estos puntos clave en una lista con viñetas concisa que resuma la información esencial del documento. El resumen debe tener un máximo de 10 puntos.
Su objetivo es ser exhaustivo en la captura del contenido central del documento, mientras que también es conciso en la expresión de cada punto del resumen. Omita los detalles menores y concéntrese en los temas centrales y hechos importantes.
'''

    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT']
    )

    formatted_prompt = prompt.format(TEXT=text)
    output_summary = llm_engine_hf.invoke(formatted_prompt)

    return f"Prompt:\n{formatted_prompt}\n\nResumen:\n{output_summary.content}"

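
# Illustrative only (not part of the Gradio flow): summarize() and translate() just need
# an object exposing a .name attribute, which is what Gradio's file component passes in.
# A quick local test could therefore look like the sketch below; "contrato.pdf" is a
# placeholder path.
#
#   from types import SimpleNamespace
#   print(summarize(SimpleNamespace(name="contrato.pdf")))
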

def classify_text(text):
    """Classify a document into one of the categories defined in id2label."""
    inputs = tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax(dim=-1).item()
    predicted_label = id2label[predicted_class_id]
    return predicted_label

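
# Hedged sketch, not used by the app: if per-class confidences are ever useful, the same
# forward pass can be mapped to a probability per label with a softmax. The helper name
# classify_text_with_scores is illustrative only.
def classify_text_with_scores(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    # Turn the logits into probabilities and pair each one with its label.
    probs = torch.softmax(logits, dim=-1).squeeze(0)
    return {id2label[i]: round(p.item(), 4) for i, p in enumerate(probs)}
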

def translate(file, target_language):
    """Translate the uploaded document into the selected target language."""
    file_path = file.name
    if file_path.endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    template = '''
Por favor, traduzca el siguiente documento al {LANGUAGE}:
<document>
{TEXT}
</document>
Asegúrese de que la traducción sea precisa y conserve el significado original del documento.
'''

    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'LANGUAGE']
    )

    formatted_prompt = prompt.format(TEXT=text, LANGUAGE=target_language)
    translated_text = llm_engine_hf.invoke(formatted_prompt)

    return f"Prompt:\n{formatted_prompt}\n\nTraducción:\n{translated_text.content}"


def process_file(file, action, target_language=None):
    """Dispatch the uploaded file to the action selected in the UI."""
    if action == "Resumen":
        return summarize(file)
    elif action == "Clasificar":
        file_path = file.name
        if file_path.endswith('.pdf'):
            text = read_pdf(file_path)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        return classify_text(text)
    elif action == "Traducir":
        return translate(file, target_language)
    else:
        return "Acción no válida"


# Gradio interface: upload a document, choose an action, and display the result.
with gr.Blocks() as demo:
    gr.Markdown("## Procesador de Documentos")

    with gr.Row():
        with gr.Column():
            file = gr.File(label="Subir un archivo")
            action = gr.Radio(label="Seleccione una acción", choices=["Resumen", "Clasificar", "Traducir"])
            target_language = gr.Dropdown(label="Seleccionar idioma de traducción", choices=["en", "fr", "de"], visible=False)

        with gr.Column():
            output_text = gr.Textbox(label="Resultado", lines=20)

    def update_language_dropdown(action):
        # Only show the language selector when the user chooses to translate.
        if action == "Traducir":
            return gr.update(visible=True)
        else:
            return gr.update(visible=False)

    action.change(update_language_dropdown, inputs=action, outputs=target_language)

    submit_button = gr.Button("Procesar")
    submit_button.click(process_file, inputs=[file, action, target_language], outputs=output_text)


demo.launch(share=True)
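
# A minimal way to launch the app locally (the filename app.py and the token value are
# placeholders for whatever you actually use):
#
#   export HUGGINGFACE_TOKEN=hf_xxx
#   python app.py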