import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import PyPDF2
import gradio as gr
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from huggingface_hub import login
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
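# Log in to the Hugging Face Hub; expects the HUGGINGFACE_TOKEN environment variable to be set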
login(token=os.getenv('HUGGINGFACE_TOKEN'))
# LLM configuration: a hosted Mistral-7B-Instruct endpoint wrapped in a chat interface
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=4096,
    temperature=0.5,
    do_sample=False,  # greedy decoding; with sampling disabled, temperature has no effect
)
llm_engine_hf = ChatHuggingFace(llm=llm)
# Classification model configuration: a Spanish legal Longformer
tokenizer = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
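# Map the classifier's output indices to human-readable legal document categories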
id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}
def read_pdf(file_path):
    """Extract and concatenate the text of every page in a PDF."""
    pdf_reader = PyPDF2.PdfReader(file_path)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None on image-only pages
        text += page.extract_text() or ""
    return text

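# Each action reads the upload the same way: PDFs via read_pdf, anything else as UTF-8 text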
def summarize(file, summary_length):
    # Read the content of the uploaded file
    file_path = file.name
    if file_path.endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    if summary_length == 'Corto':
        length_instruction = "El resumen debe tener un máximo de 100 palabras."
    elif summary_length == 'Medio':
        length_instruction = "El resumen debe tener un máximo de 500 palabras."
    else:
        length_instruction = "El resumen debe tener un máximo de 1000 palabras."

    # Doubled braces keep {TEXT} as a PromptTemplate placeholder inside this f-string
    template = f'''
Por favor, lea detenidamente el siguiente documento:
<document>
{{TEXT}}
</document>
Después de leer el documento, identifique los puntos clave y las ideas principales cubiertas en el texto. Organice estos puntos clave en una lista con viñetas concisa que resuma la información esencial del documento. {length_instruction}
Su objetivo es ser exhaustivo en la captura del contenido central del documento, mientras que también es conciso en la expresión de cada punto del resumen. Omita los detalles menores y concéntrese en los temas centrales y hechos importantes.
'''
    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT']
    )
    formatted_prompt = prompt.format(TEXT=text)
    output_summary = llm_engine_hf.invoke(formatted_prompt)
    return f"Prompt:\n{formatted_prompt}\n\nResumen:\n{output_summary.content}"

def classify_text(text):
    """Classify a document into one of the legal categories defined in id2label."""
    # Truncate long documents; the Longformer checkpoint itself supports up to 8192 tokens
    inputs = tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax(dim=-1).item()
    predicted_label = id2label[predicted_class_id]
    return predicted_label, text

def translate(file, target_language):
    # Read the content of the uploaded file
    file_path = file.name
    if file_path.endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    template = '''
Por favor, traduzca el siguiente documento al {LANGUAGE}:
<document>
{TEXT}
</document>
Asegúrese de que la traducción sea precisa y conserve el significado original del documento.
'''
    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'LANGUAGE']
    )
    formatted_prompt = prompt.format(TEXT=text, LANGUAGE=target_language)
    translated_text = llm_engine_hf.invoke(formatted_prompt)
    return f"Prompt:\n{formatted_prompt}\n\nTraducción:\n{translated_text.content}"

def process_file(file, action, target_language=None, summary_length=None):
    if action == "Resumen":
        return summarize(file, summary_length)
    elif action == "Clasificar":
        file_path = file.name
        if file_path.endswith('.pdf'):
            text = read_pdf(file_path)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        label, document_text = classify_text(text)
        return f"Clasificación: {label}\n\nDocumento:\n{document_text}"
    elif action == "Traducir":
        return translate(file, target_language)
    else:
        return "Acción no válida"

def answer_question(text, question):
    messages = [
        {"role": "system", "content": "Eres un asistente útil."},
        {"role": "user", "content": f"El documento es el siguiente:\n{text}"},
        {"role": "user", "content": question}
    ]
    response = llm_engine_hf.invoke(messages)
    return response.content

def download_text(output_text, filename='output.txt'):
    if output_text:
        file_path = Path(filename)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(output_text)
        return file_path
    else:
        return None

def create_download_file(output_text, filename='output.txt'):
    file_path = download_text(output_text, filename)
    return str(file_path) if file_path else None

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## LexAIcon")
    with gr.Row():
        with gr.Column():
            file = gr.File(label="Subir un archivo")
            action = gr.Radio(label="Seleccione una acción", choices=["Resumen", "Clasificar", "Traducir"])
            target_language = gr.Dropdown(label="Seleccionar idioma de traducción", choices=["en", "fr", "de"], visible=False)
            summary_length = gr.Radio(label="Seleccione la longitud del resumen", choices=["Corto", "Medio", "Largo"], visible=False)
            question = gr.Textbox(label="Hacer una pregunta al documento", lines=1, visible=False)
            question_button = gr.Button("Enviar Pregunta", visible=False)
        with gr.Column():
            output_text = gr.Textbox(label="Resultado", lines=20)
            answer = gr.Textbox(label="Respuesta", lines=1, interactive=False, visible=False)

    # Toggle which controls are shown for each action; the result box stays visible throughout
    def update_visible_elements(action):
        if action == "Traducir":
            return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
        elif action == "Resumen":
            return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
        elif action == "Clasificar":
            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
        else:
            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)

    action.change(update_visible_elements, inputs=action, outputs=[target_language, summary_length, question, question_button, output_text, answer])

    submit_button = gr.Button("Procesar")
    submit_button.click(process_file, inputs=[file, action, target_language, summary_length], outputs=output_text)

    # A component's .value holds only its initial value, so the current result and action
    # must be passed in as event inputs rather than read from the components directly
    def generate_file(result_text, selected_action):
        filename = 'translation.txt' if selected_action == 'Traducir' else 'summary.txt'
        return create_download_file(result_text, filename)

    download_button = gr.Button("Descargar Resultado")
    download_file = gr.File(label="Archivo para descargar")
    download_button.click(
        fn=generate_file,
        inputs=[output_text, action],
        outputs=download_file
    )

    question_button.click(answer_question, inputs=[output_text, question], outputs=answer)

# Launch the Gradio app with a public share link
demo.launch(share=True)