lik07 committed on
Commit
d697844
·
verified ·
1 Parent(s): e131b1b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from docx import Document
3
+ import os
4
+
5
def split_by_headers(file_path, headers_per_chunk=1):
    """Split a DOCX file into several documents, grouping content by heading.

    Body-level elements are walked in order. Every paragraph whose paragraph
    style id starts with ``"Heading"`` counts as a header; once a chunk already
    holds ``headers_per_chunk`` headers, the next header starts a new chunk.
    Content that precedes the first header stays in the first chunk.

    Args:
        file_path: Path (or file-like object) accepted by ``Document``.
        headers_per_chunk: Maximum number of headers stored in one chunk.

    Returns:
        A list of new ``Document`` objects, in source order. Chunks that
        received no content are not returned.

    NOTE(review): elements are moved as raw XML into documents created from
    the default template, so anything that depends on the source package
    (custom styles, images, hyperlinks) is presumably not carried over --
    confirm against real documents. Heading detection matches on the style
    *id*, which may differ from ``Heading...`` in localized templates.
    """

    def is_heading(paragraph_element):
        # Only look at the paragraph's own style, not at styles of paragraphs
        # nested deeper (the original './/w:pStyle' matched those as well).
        return any(
            (style.val or "").startswith("Heading")
            for style in paragraph_element.xpath("./w:pPr/w:pStyle")
        )

    def append_content(chunk, element):
        # A body must end with its section properties, so new content is
        # inserted in front of the chunk's trailing <w:sectPr> when present.
        body = chunk.element.body
        if len(body) and body[-1].tag.endswith("}sectPr"):
            body[-1].addprevious(element)
        else:
            body.append(element)

    source = Document(file_path)
    chunks = []
    current_chunk = Document()
    has_content = False
    header_count = 0

    # Snapshot the children first: inserting an element into another tree
    # removes it from the source body, which would otherwise make the
    # iteration skip elements.
    for element in list(source.element.body):
        if element.tag.endswith("}sectPr"):
            # Each chunk keeps its own section properties; the source's may
            # reference header/footer parts that do not exist in the chunk.
            continue

        if element.tag.endswith("}p") and is_heading(element):
            header_count += 1
            if header_count > headers_per_chunk:
                chunks.append(current_chunk)
                current_chunk = Document()
                has_content = False
                header_count = 1

        append_content(current_chunk, element)
        has_content = True

    if has_content:
        chunks.append(current_chunk)

    return chunks
27
+
28
def split_by_pages(file_path, pages_per_chunk=1, chars_per_page=3000):
    """Split a DOCX file into several documents by estimated page count.

    DOCX files do not store page boundaries, so pages are estimated from the
    amount of text: every ``chars_per_page`` characters count as one page.
    Body-level elements are never split; an element is always placed in the
    chunk that is open when it is reached, and the chunk is closed once it
    has accumulated ``pages_per_chunk`` estimated pages.

    Args:
        file_path: Path (or file-like object) accepted by ``Document``.
        pages_per_chunk: Estimated number of pages per chunk.
        chars_per_page: Number of characters treated as one page.

    Returns:
        A list of new ``Document`` objects, in source order. Chunks that
        received no content are not returned.

    Raises:
        ValueError: If ``chars_per_page`` is smaller than 1.

    NOTE(review): elements are moved as raw XML into documents created from
    the default template, so anything that depends on the source package
    (custom styles, images, hyperlinks) is presumably not carried over --
    confirm against real documents.
    """
    if chars_per_page < 1:
        raise ValueError("chars_per_page must be at least 1")

    def append_content(chunk, element):
        # A body must end with its section properties, so new content is
        # inserted in front of the chunk's trailing <w:sectPr> when present.
        body = chunk.element.body
        if len(body) and body[-1].tag.endswith("}sectPr"):
            body[-1].addprevious(element)
        else:
            body.append(element)

    source = Document(file_path)
    chunks = []
    current_chunk = Document()
    has_content = False
    page_count = 0
    char_count = 0

    # Snapshot the children first: inserting an element into another tree
    # removes it from the source body, which would otherwise make the
    # iteration skip elements.
    for element in list(source.element.body):
        if element.tag.endswith("}sectPr"):
            # Each chunk keeps its own section properties; the source's may
            # reference header/footer parts that do not exist in the chunk.
            continue

        # Place the element before evaluating the page limit so the text that
        # completes a page stays in the chunk it was counted for.
        append_content(current_chunk, element)
        has_content = True

        # Sum the text nodes (<w:t> and the like) instead of relying on
        # element.text, which for a raw XML element can be None. Tables are
        # counted as well since they also take up page space.
        char_count += sum(len(node.text or "") for node in element.iter("{*}t"))

        # Keep the remainder so long elements count for every page they fill.
        completed_pages, char_count = divmod(char_count, chars_per_page)
        page_count += completed_pages

        if page_count >= pages_per_chunk:
            chunks.append(current_chunk)
            current_chunk = Document()
            has_content = False
            page_count = 0
            char_count = 0

    if has_content:
        chunks.append(current_chunk)

    return chunks
56
+
57
def save_chunks(chunks, original_filename):
    """Save every chunk next to each other and report the file names used.

    The extension of ``original_filename`` is stripped and each chunk is
    written through its ``save`` method as ``<stem>_part<N>.docx``, with
    ``N`` starting at 1.

    Args:
        chunks: Iterable of objects exposing ``save(path)``.
        original_filename: Name whose stem is used as prefix for the outputs.

    Returns:
        The list of paths that were passed to ``save``, in order.
    """
    stem, _extension = os.path.splitext(original_filename)
    written = []

    for part_number, part in enumerate(chunks, start=1):
        target = f"{stem}_part{part_number}.docx"
        part.save(target)
        written.append(target)

    return written
67
+
68
def process_document(file, split_type, headers_or_pages):
    """Split the uploaded document and return a status message for the UI.

    Args:
        file: The uploaded file. Either an object exposing the path in its
            ``name`` attribute or the path itself as a string; ``None`` when
            nothing was uploaded.
        split_type: ``"Encabezados"`` to split by headings; any other value
            splits by estimated pages.
        headers_or_pages: Number of headings or pages per chunk; ``None``
            when the field was left empty.

    Returns:
        A message listing the files that were written, or a message asking
        the user to correct the input when it is missing or invalid.
    """
    # Guard against a missing upload; accessing ``file.name`` would raise.
    if file is None:
        return "Por favor, seleccione un archivo DOCX."

    # An emptied number field arrives as None, which cannot be compared to 1.
    if headers_or_pages is None or headers_or_pages < 1:
        return "Por favor, especifique un número positivo de encabezados o páginas por fragmento."

    # Accept both a file wrapper (path in ``.name``) and a plain path string.
    source_path = getattr(file, "name", file)

    if split_type == "Encabezados":
        chunks = split_by_headers(source_path, headers_or_pages)
    else:  # Páginas
        chunks = split_by_pages(source_path, headers_or_pages)

    saved_files = save_chunks(chunks, os.path.basename(source_path))
    return f"Documento dividido en {len(saved_files)} partes: {', '.join(saved_files)}"
79
+
80
# Gradio interface: one file upload, the split method and the chunk size go
# in; the status message produced by process_document comes out.
iface = gr.Interface(
    fn=process_document,
    inputs=[
        gr.File(label="Seleccione el archivo DOCX"),
        gr.Radio(
            choices=["Encabezados", "Páginas"],
            label="Método de división",
        ),
        gr.Number(
            label="Número de encabezados/páginas por fragmento",
            value=1,
            minimum=1,
        ),
    ],
    outputs=gr.Text(label="Resultado"),
    title="Divisor de Documentos DOCX",
    description="Divida documentos DOCX por encabezados o páginas estimadas",
)

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()