Update app.py
Browse files
app.py
CHANGED
@@ -1,69 +1,168 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
import PyPDF2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
8 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
def
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
return json.dumps(data, indent=2)
|
28 |
-
return file
|
29 |
-
|
30 |
-
def generate(mode, file, file_type, instructions, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
|
31 |
-
temperature = max(float(temperature), 1e-2)
|
32 |
-
top_p = float(top_p)
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
generate_kwargs = dict(
|
38 |
-
temperature=temperature,
|
39 |
-
max_new_tokens=max_new_tokens,
|
40 |
-
top_p=top_p,
|
41 |
-
repetition_penalty=repetition_penalty,
|
42 |
-
do_sample=True,
|
43 |
-
seed=42,
|
44 |
)
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
with gr.Blocks() as demo:
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import warnings
|
2 |
+
warnings.simplefilter(action='ignore', category=FutureWarning)
|
3 |
+
|
4 |
import PyPDF2
|
5 |
+
import gradio as gr
|
6 |
+
from langchain.prompts import PromptTemplate
|
7 |
+
from langchain.chains.summarize import load_summarize_chain
|
8 |
+
from pathlib import Path
|
9 |
+
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
|
10 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
11 |
+
import torch
|
12 |
|
13 |
+
# Summarization model configuration.
# `llm` is the raw text-generation endpoint; `llm_engine_hf` wraps it in the
# chat interface used by the summarize/translate helpers below.
# NOTE(review): `temperature` is set while `do_sample` is False -- confirm
# which decoding mode is actually intended.
llm = HuggingFaceEndpoint(
    task="text-generation",
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    do_sample=False,
    temperature=0.5,
    max_new_tokens=4096,
)
llm_engine_hf = ChatHuggingFace(llm=llm)
|
22 |
+
|
23 |
+
# Classification model configuration.
# The label map is declared first so the classification head can be sized
# from it; without `num_labels` the head falls back to the config default,
# which need not match the five categories below.
# NOTE(review): this checkpoint looks like a base (not fine-tuned) model, in
# which case the classification head is freshly initialised -- confirm that a
# fine-tuned head is available before trusting the predictions.
id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}

tokenizer = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
model = AutoModelForSequenceClassification.from_pretrained(
    "mrm8488/legal-longformer-base-8192-spanish",
    num_labels=len(id2label),
    id2label=id2label,
    label2id={label: class_id for class_id, label in id2label.items()},
)
|
28 |
+
|
29 |
+
def read_pdf(file_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_path: Path of the PDF file, passed straight to
            ``PyPDF2.PdfReader``.

    Returns:
        A single string with the extracted text of all pages, in page order.
        Pages that yield no text contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # `or ""` keeps the join safe if extraction yields nothing for a page
    # (e.g. an image-only page).
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
35 |
|
36 |
+
def summarize(file):
    """Summarize an uploaded document as a short bulleted list.

    Args:
        file: Uploaded file object; its ``name`` attribute is used as the
            path to read. Paths ending in ``.pdf`` are read with
            ``read_pdf``; anything else is read as UTF-8 text.

    Returns:
        The ``content`` of the chat model's response to the summary prompt.
    """
    # Load the document text from the uploaded file.
    source_path = file.name
    if not source_path.endswith('.pdf'):
        with open(source_path, 'r', encoding='utf-8') as handle:
            document_text = handle.read()
    else:
        document_text = read_pdf(source_path)

    summary_template = '''
Please carefully read the following document:
<document>
{TEXT}
</document>
After reading through the document, identify the key points and main ideas covered in the text. Organize these key points into a concise bulleted list that summarizes the essential information from the document. The summary should have a maximum of 10 bullet points.
Your goal is to be comprehensive in capturing the core content of the document, while also being concise in how you express each summary point. Omit minor details and focus on the central themes and important facts.
'''

    summary_prompt = PromptTemplate(
        input_variables=['TEXT'],
        template=summary_template,
    )

    # Fill in the template and send it to the chat model in one step.
    response = llm_engine_hf.invoke(summary_prompt.format(TEXT=document_text))
    return response.content
|
63 |
|
64 |
+
def classify_text(text):
    """Predict the document category for a piece of text.

    Args:
        text: The document text to classify.

    Returns:
        The label from ``id2label`` whose class id has the highest logit.
    """
    # Truncate long documents to 4096 tokens. No padding is requested: only
    # one sequence is encoded, so padding it out to the maximum length would
    # just add wasted tokens to every call.
    inputs = tokenizer(text, return_tensors="pt", max_length=4096, truncation=True)
    model.eval()
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax(dim=-1).item()
    return id2label[predicted_class_id]
|
73 |
|
74 |
+
def translate(file, target_language):
    """Translate an uploaded document into ``target_language``.

    Args:
        file: Uploaded file object; its ``name`` attribute is used as the
            path to read. Paths ending in ``.pdf`` are read with
            ``read_pdf``; anything else is read as UTF-8 text.
        target_language: Language to translate into (inserted verbatim into
            the prompt).

    Returns:
        The ``content`` of the chat model's response to the translation
        prompt.

    Raises:
        ValueError: If ``target_language`` is empty or ``None``; otherwise the
            prompt would literally ask for a translation "to None".
    """
    if not target_language:
        raise ValueError("target_language is required for translation")

    # Load the document text from the uploaded file.
    file_path = file.name
    if file_path.endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    template = '''
Please translate the following document to {LANGUAGE}:
<document>
{TEXT}
</document>
Ensure that the translation is accurate and preserves the original meaning of the document.
'''

    prompt = PromptTemplate(
        template=template,
        input_variables=['TEXT', 'LANGUAGE'],
    )

    formatted_prompt = prompt.format(TEXT=text, LANGUAGE=target_language)
    translated_text = llm_engine_hf.invoke(formatted_prompt)

    return translated_text.content
|
100 |
+
|
101 |
+
def process_file(file, action, target_language=None):
    """Run the selected action on the uploaded file and return its text result.

    Args:
        file: Uploaded file object (its ``name`` attribute is the path), or
            ``None`` when nothing has been uploaded.
        action: One of ``"Resumen"``, ``"Clasificar"`` or ``"Traducir"``.
        target_language: Language to translate into; only used by
            ``"Traducir"``.

    Returns:
        The summary, predicted label or translation, or a short user-facing
        message when the action is unknown, no file was uploaded, or a
        translation was requested without a target language.
    """
    if action not in ("Resumen", "Clasificar", "Traducir"):
        return "Acción no válida"

    # Without this guard, `file.name` below raises AttributeError when the
    # user submits before uploading anything.
    if file is None:
        return "No se ha subido ningún archivo"

    if action == "Resumen":
        return summarize(file)

    if action == "Clasificar":
        file_path = file.name
        if file_path.endswith('.pdf'):
            text = read_pdf(file_path)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        return classify_text(text)

    # action == "Traducir": a language must have been chosen, otherwise the
    # prompt would ask for a translation "to None".
    if not target_language:
        return "Seleccione un idioma de traducción"
    return translate(file, target_language)
|
116 |
+
|
117 |
+
def download_text(output_text, filename='output.txt'):
    """Write ``output_text`` to ``filename`` as UTF-8 and return its path.

    Args:
        output_text: Text to save. Empty or ``None`` means there is nothing
            to save.
        filename: Destination file name or path.

    Returns:
        A ``Path`` pointing at the written file, or ``None`` when
        ``output_text`` is empty.
    """
    # Nothing to write: signal that with None instead of creating a file.
    if not output_text:
        return None

    destination = Path(filename)
    destination.write_text(output_text, encoding='utf-8')
    return destination
|
125 |
+
|
126 |
+
def create_download_file(output_text, filename='output.txt'):
    """Save ``output_text`` via ``download_text`` and return the path as a string.

    Args:
        output_text: Text to save.
        filename: Destination file name or path.

    Returns:
        The saved file's path as ``str``, or ``None`` when ``download_text``
        returned nothing.
    """
    saved_path = download_text(output_text, filename)
    if not saved_path:
        return None
    return str(saved_path)
|
129 |
+
|
130 |
+
# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## Document Processor")

    with gr.Row():
        with gr.Column():
            file = gr.File(label="Subir un archivo")
            action = gr.Radio(label="Seleccione una acción", choices=["Resumen", "Clasificar", "Traducir"])
            # Hidden until the "Traducir" action is selected.
            target_language = gr.Dropdown(label="Seleccionar idioma de traducción", choices=["en", "fr", "de"], visible=False)

        with gr.Column():
            output_text = gr.Textbox(label="Resultado", lines=20)

    def update_language_dropdown(action):
        """Show the language dropdown only for the translate action."""
        return gr.update(visible=(action == "Traducir"))

    action.change(update_language_dropdown, inputs=action, outputs=target_language)

    submit_button = gr.Button("Procesar")
    submit_button.click(process_file, inputs=[file, action, target_language], outputs=output_text)

    def generate_file(current_text, current_action):
        """Save the displayed result to a file and return its path for download.

        The current textbox content and selected action arrive as event
        inputs: reading ``component.value`` inside a callback only gives the
        value the component was constructed with, not what the user sees.
        """
        filename = 'translation.txt' if current_action == 'Traducir' else 'summary.txt'
        return create_download_file(current_text, filename)

    download_button = gr.Button("Descargar Resultado")
    download_file = gr.File()
    download_button.click(
        fn=generate_file,
        inputs=[output_text, action],
        outputs=download_file,
    )

# Launch the Gradio app.
demo.launch(share=True)
|