File size: 5,951 Bytes
23a1957 7413e10 23a1957 4387fb1 23a1957 e988252 97ab717 cc439f4 4387fb1 23a1957 4387fb1 7413e10 23a1957 cc439f4 97ab717 9bd74d6 97ab717 4387fb1 23a1957 97ab717 23a1957 4387fb1 23a1957 7413e10 23a1957 7413e10 23a1957 4387fb1 cc439f4 23a1957 97ab717 cc439f4 23a1957 4387fb1 23a1957 4387fb1 23a1957 4387fb1 23a1957 97ab717 23a1957 97ab717 23a1957 97ab717 6a25acd 23a1957 97ab717 23a1957 97ab717 23a1957 ba072b0 4387fb1 23a1957 97ab717 23a1957 97ab717 23a1957 97ab717 23a1957 97ab717 ba072b0 97ab717 23a1957 97ab717 23a1957 4387fb1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import PyPDF2
import gradio as gr
from langchain.prompts import PromptTemplate
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
# Read the Hugging Face access token from the environment once; it is used
# for the hub login and for the hosted inference endpoint below.
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
login(token=HUGGINGFACE_TOKEN)

# Summarization model configuration (the same endpoint is reused by the
# translation function).
# NOTE(review): do_sample=False normally selects greedy decoding, in which
# case `temperature` would have no effect — confirm which one is intended.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=4096,
    temperature=0.5,
    do_sample=False,
    # Hand the token to the endpoint through its dedicated field rather than
    # through `model_kwargs`, which are forwarded as generation parameters.
    huggingfacehub_api_token=HUGGINGFACE_TOKEN,
)
# Chat-style wrapper around the raw endpoint; its `invoke` result exposes
# the generated text as `.content`.
llm_engine_hf = ChatHuggingFace(llm=llm)
# Classification model configuration
# NOTE(review): the repo name suggests a base checkpoint — confirm that it
# ships a sequence-classification head whose outputs match the five labels
# declared in `id2label` below.
CLASSIFIER_REPO_ID = "mrm8488/legal-longformer-base-8192-spanish"
tokenizer = AutoTokenizer.from_pretrained(CLASSIFIER_REPO_ID)
model = AutoModelForSequenceClassification.from_pretrained(CLASSIFIER_REPO_ID)
# Maps the predicted class index to its human-readable label.
id2label = dict(enumerate([
    "multas",
    "politicas_de_privacidad",
    "contratos",
    "denuncias",
    "otros",
]))
def read_pdf(file_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        file_path: Value passed straight to ``PyPDF2.PdfReader``.

    Returns:
        The extracted page texts concatenated in page order, with no
        separator inserted between pages.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # Iterate the pages directly and join once instead of growing a string
    # with `+=`; `or ""` keeps a page that yields no text from breaking the
    # concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def summarize(file):
    """Summarize an uploaded document as a short bulleted list.

    The document text is read from disk (PDF via ``read_pdf``, anything else
    as UTF-8 text), inserted into a fixed prompt and sent to the chat model.

    Args:
        file: Uploaded file. Either an object exposing the on-disk path as
            ``.name`` or a plain path string.

    Returns:
        The ``.content`` of the chat model's response.
    """
    # Read the content of the uploaded file.
    # Accept both an upload object with `.name` and a bare path string.
    file_path = str(getattr(file, "name", file))
    # Compare the extension case-insensitively so that "REPORT.PDF" is parsed
    # as a PDF instead of being opened as UTF-8 text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    template = '''
Please carefully read the following document:
<document>
{TEXT}
</document>
After reading through the document, identify the key points and main ideas covered in the text. Organize these key points into a concise bulleted list that summarizes the essential information from the document. The summary should have a maximum of 10 bullet points.
Your goal is to be comprehensive in capturing the core content of the document, while also being concise in how you express each summary point. Omit minor details and focus on the central themes and important facts.
'''
    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT']
    )
    formatted_prompt = prompt.format(TEXT=text)
    output_summary = llm_engine_hf.invoke(formatted_prompt)
    return output_summary.content
def classify_text(text):
    """Classify *text* and return the label of the highest-scoring class.

    The text is tokenized with truncation and padding to 4096 tokens, run
    through the module-level ``model`` without gradient tracking, and the
    arg-max class index is translated through ``id2label``.

    Args:
        text: The document text to classify.

    Returns:
        The ``id2label`` entry for the predicted class index.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=4096,
        truncation=True,
        padding="max_length",
    )
    # Put the model in evaluation mode before inference.
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
    best_class_id = logits.argmax(dim=-1).item()
    return id2label[best_class_id]
def translate(file, target_language):
    """Translate an uploaded document into *target_language*.

    The document text is read from disk (PDF via ``read_pdf``, anything else
    as UTF-8 text), inserted into a fixed prompt and sent to the chat model.

    Args:
        file: Uploaded file. Either an object exposing the on-disk path as
            ``.name`` or a plain path string.
        target_language: Language identifier substituted verbatim into the
            prompt.

    Returns:
        The ``.content`` of the chat model's response.
    """
    # Read the content of the uploaded file.
    # Accept both an upload object with `.name` and a bare path string.
    file_path = str(getattr(file, "name", file))
    # Compare the extension case-insensitively so that "REPORT.PDF" is parsed
    # as a PDF instead of being opened as UTF-8 text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    template = '''
Please translate the following document to {LANGUAGE}:
<document>
{TEXT}
</document>
Ensure that the translation is accurate and preserves the original meaning of the document.
'''
    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'LANGUAGE']
    )
    formatted_prompt = prompt.format(TEXT=text, LANGUAGE=target_language)
    translated_text = llm_engine_hf.invoke(formatted_prompt)
    return translated_text.content
def process_file(file, action, target_language=None):
    """Run the selected action on the uploaded file and return the result text.

    Args:
        file: Uploaded file (object exposing ``.name`` or a path string), or
            ``None`` when nothing has been uploaded.
        action: One of ``"Resumen"``, ``"Clasificar"`` or ``"Traducir"``.
        target_language: Language to translate into; only used for
            ``"Traducir"``.

    Returns:
        The summary, predicted label or translation, or a short
        user-facing message when the request cannot be processed.
    """
    # Reject unknown actions first, regardless of the other inputs.
    if action not in ("Resumen", "Clasificar", "Traducir"):
        return "Acción no válida"
    # Without an upload every branch below would fail on the path lookup.
    if file is None:
        return "No se ha subido ningún archivo"

    if action == "Resumen":
        return summarize(file)

    if action == "Traducir":
        # Avoid prompting the model to translate "to None".
        if not target_language:
            return "Seleccione un idioma de traducción"
        return translate(file, target_language)

    # Remaining action: "Clasificar". Load the text and classify it.
    file_path = str(getattr(file, "name", file))
    # Case-insensitive extension check so "X.PDF" is parsed as a PDF.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    return classify_text(text)
def download_text(output_text, filename='output.txt'):
    """Write *output_text* to *filename* as UTF-8 and return its path.

    Args:
        output_text: Text to save. When empty or otherwise falsy nothing is
            written.
        filename: Destination path; defaults to ``output.txt``.

    Returns:
        A ``pathlib.Path`` for the written file, or ``None`` when there was
        no text to write.
    """
    # Guard clause: nothing to save.
    if not output_text:
        return None
    destination = Path(filename)
    with destination.open('w', encoding='utf-8') as handle:
        handle.write(output_text)
    return destination
def create_download_file(output_text, filename='output.txt'):
    """Save *output_text* via ``download_text`` and return the path as a string.

    Args:
        output_text: Text to save.
        filename: Destination path; defaults to ``output.txt``.

    Returns:
        The saved file's path converted to ``str``, or ``None`` when
        ``download_text`` returned nothing.
    """
    saved_path = download_text(output_text, filename)
    if not saved_path:
        return None
    return str(saved_path)
# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Document Processor")
    with gr.Row():
        with gr.Column():
            file = gr.File(label="Subir un archivo")
            action = gr.Radio(
                label="Seleccione una acción",
                choices=["Resumen", "Clasificar", "Traducir"],
            )
            # Hidden until the "Traducir" action is selected.
            target_language = gr.Dropdown(
                label="Seleccionar idioma de traducción",
                choices=["en", "fr", "de"],
                visible=False,
            )
        with gr.Column():
            output_text = gr.Textbox(label="Resultado", lines=20)

    def update_language_dropdown(action):
        """Show the language dropdown only for the "Traducir" action."""
        return gr.update(visible=(action == "Traducir"))

    action.change(update_language_dropdown, inputs=action, outputs=target_language)

    submit_button = gr.Button("Procesar")
    submit_button.click(
        process_file,
        inputs=[file, action, target_language],
        outputs=output_text,
    )

    def generate_file(result_text, selected_action):
        """Save the current result text to a file and return its path.

        The values arrive through the click event's ``inputs``. Reading
        ``component.value`` here would only give the value each component
        was constructed with, not what is currently shown in the UI.
        """
        filename = 'translation.txt' if selected_action == 'Traducir' else 'summary.txt'
        return create_download_file(result_text, filename)

    download_button = gr.Button("Descargar Resultado")
    download_button.click(
        fn=generate_file,
        inputs=[output_text, action],
        outputs=gr.File(),
    )

# Launch the Gradio application
demo.launch(share=True)
|