om4r932 committed
Commit 219f767 · 1 Parent(s): 0619166

First version

Files changed (4)
  1. Dockerfile +17 -0
  2. app.py +111 -0
  3. requirements.txt +8 -0
  4. spec_indexer_multi_doc.py +327 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.9
+
+ RUN apt-get update && \
+     apt-get install -y libreoffice libreoffice-writer libreoffice-calc libreoffice-impress && \
+     apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
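The image runs the FastAPI app with uvicorn on port 7860. A minimal smoke test for the running container, assuming the port is published locally (e.g. docker run -p 7860:7860 ...) and using an illustrative spec_id:

import requests

# POST to the /get_spec_content endpoint defined in app.py;
# the spec_id value below is a hypothetical placeholder.
resp = requests.post(
    "http://localhost:7860/get_spec_content",
    json={"spec_id": "103 666-1"},  # illustrative ETSI spec number
    timeout=120,
)
resp.raise_for_status()
sections = resp.json()  # {"<section title>": "<section content>", ...}
print(list(sections)[:5])
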
app.py ADDED
@@ -0,0 +1,111 @@
+ import requests, os, re, warnings, fitz
+ warnings.filterwarnings("ignore")
+ from dotenv import load_dotenv
+ from datasets import load_dataset
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+
+ load_dotenv()
+
+ app = FastAPI(title="ETSI Specification Splitter API",
+               description="API to split and display specifications by their chapters & sub-chapters",
+               docs_url="/")
+
+ origins = [
+     "*",
+ ]
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Pre-indexed spec sections, loaded once at startup
+ spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent", token=os.environ["HF_TOKEN"])
+ spec_contents = spec_contents["train"].to_list()
+
+ def is_doc_indexed(spec_id: str):
+     return any(spec_id == s["doc_id"] for s in spec_contents)
+
+ def get_full_doc(spec_id: str):
+     doc = []
+     for spec in spec_contents:
+         if spec["doc_id"] == spec_id:
+             doc.append(f"{spec['section']}\n{spec['content']}")
+     return "\n\n".join(doc)
+
+ def get_structured_doc(spec_id: str):
+     doc = {}
+     for spec in spec_contents:
+         if spec["doc_id"] == spec_id:
+             doc[spec["section"]] = spec["content"]
+     return doc
+
+
+ class SpecRequest(BaseModel):
+     spec_id: str
+
+ def get_pdf_data(request: SpecRequest):
+     specification = request.spec_id
+     url = requests.post(
+         "https://organizedprogrammers-etsidocfinder.hf.space/find",
+         verify=False,
+         headers={"Content-Type": "application/json"},
+         json={"doc_id": specification}
+     )
+
+     if url.status_code != 200:
+         raise HTTPException(404, detail="Not found")
+
+     url = url.json()['url']
+     response = requests.get(
+         url,
+         verify=False,
+         headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}
+     )
+
+     pdf = fitz.open(stream=response.content, filetype="pdf")
+     return pdf, pdf.get_toc()
+
+ @app.post("/get_spec_content")
+ def get_spec_content(request: SpecRequest):
+     # Serve the pre-indexed document directly when available; get_pdf_data
+     # always returns a (pdf, toc) pair, so the cached path is handled here
+     if is_doc_indexed(request.spec_id):
+         return get_structured_doc(request.spec_id)
+
+     def extract_sections(text, titles):
+         sections = {}
+         # Sort the titles by their position in the text
+         sorted_titles = sorted(titles, key=lambda t: text.find(t))
+         for i, title in enumerate(sorted_titles):
+             start = text.find(title)
+             end = text.find(sorted_titles[i + 1]) if i + 1 < len(sorted_titles) else len(text)
+             sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip())
+         return sections
+
+     print("\n[INFO] Attempting to retrieve the text", flush=True)
+     pdf, doc_toc = get_pdf_data(request)
+     text = []
+     first = 0
+     # Skip front matter: start at the page of the first numbered ToC entry
+     for level, title, page in doc_toc:
+         if title and title[0].isnumeric():
+             first = page - 1
+             break
+     for page in pdf[first:]:
+         text.append("\n".join(line.strip() for line in page.get_text().splitlines()))
+     text = "\n".join(text)
+
+     if not text or not doc_toc:
+         print("\n[ERROR] No text/table of contents found!")
+         return {}
+     print(f"\n[INFO] Text of {request.spec_id} retrieved", flush=True)
+     titles = []
+     for level, title, page in doc_toc:
+         # In the extracted text, a heading appears as "<number>\n<title>"
+         if title and title[0].isnumeric() and '\n'.join(title.strip().split(" ", 1)) in text:
+             titles.append('\n'.join(title.strip().split(" ", 1)))
+
+     return extract_sections(text, titles)
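The endpoint above slices the flattened PDF text between consecutive ToC titles. A self-contained sketch of that splitting step on toy input (same logic as the nested extract_sections, stdlib only):

import re

def extract_sections(text, titles):
    # Order titles by their position, then slice between consecutive matches
    sorted_titles = sorted(titles, key=lambda t: text.find(t))
    sections = {}
    for i, title in enumerate(sorted_titles):
        start = text.find(title)
        end = text.find(sorted_titles[i + 1]) if i + 1 < len(sorted_titles) else len(text)
        sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip())
    return sections

toy = "1\nScope\nThis document defines X.\n2\nReferences\nSee elsewhere."
print(extract_sections(toy, ["1\nScope", "2\nReferences"]))
# -> {'1 Scope': 'This document defines X.', '2 References': 'See elsewhere.'}
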
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ fastapi
+ uvicorn[standard]
+ requests
+ pydantic
+ lxml
+ datasets
+ python-dotenv
+ pymupdf
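Note on the last entry: app.py imports fitz, which is PyMuPDF's import name; the PyPI distribution is pymupdf, while the unrelated fitz package on PyPI does not provide the fitz.open API the app uses. A quick sanity check after installation:

# Verify that the installed "fitz" is PyMuPDF (it exposes open()):
import fitz
assert hasattr(fitz, "open"), "Wrong 'fitz' package: install 'pymupdf'"
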
spec_indexer_multi_doc.py ADDED
@@ -0,0 +1,327 @@
+ import datetime
+ import time
+ import sys
+ import json
+ import traceback
+ import requests
+ import zipfile
+ import uuid
+ import os
+ import re
+ import subprocess
+ import concurrent.futures
+ import threading
+ from io import StringIO, BytesIO
+ from typing import List, Dict, Any
+
+ import pandas as pd
+ import numpy as np
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+ # Characters used to format version codes (base-36 digits)
+ chars = "0123456789abcdefghijklmnopqrstuvwxyz"
+
+ # Locks for thread-safe operations
+ print_lock = threading.Lock()
+ dict_lock = threading.Lock()
+ scope_lock = threading.Lock()
+
+ # Global dictionaries
+ indexed_specifications = {}
+ documents_by_spec_num = {}
+ processed_count = 0
+ total_count = 0
+
+ def get_text(specification: str, version: str):
+     """Download the spec archive from 3GPP and return the document text as a list of non-empty lines."""
+     doc_id = specification
+     series = doc_id.split(".")[0]
+
+     response = requests.get(
+         f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
+         verify=False,
+         headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
+     )
+
+     if response.status_code != 200:
+         raise Exception(f"ZIP download failed for {specification}-{version}")
+
+     def convert_to_txt(doc_bytes: bytes, ext: str):
+         """Convert .doc/.docx bytes to a list of text lines via a headless LibreOffice run."""
+         temp_id = str(uuid.uuid4())
+         input_path = f"/tmp/{temp_id}.{ext}"
+         output_path = f"/tmp/{temp_id}.txt"
+
+         with open(input_path, "wb") as f:
+             f.write(doc_bytes)
+
+         subprocess.run([
+             "libreoffice",
+             "--headless",
+             "--convert-to", "txt",
+             "--outdir", "/tmp",
+             input_path
+         ], check=True)
+
+         with open(output_path, "r", encoding="utf-8") as f:
+             txt_data = [line.strip() for line in f if line.strip()]
+
+         os.remove(input_path)
+         os.remove(output_path)
+         return txt_data
+
+     zip_bytes = BytesIO(response.content)
+     with zipfile.ZipFile(zip_bytes) as zf:
+         for file_name in zf.namelist():
+             if file_name.endswith("zip"):
+                 # Some archives nest the document inside a second ZIP
+                 with zipfile.ZipFile(BytesIO(zf.read(file_name))) as inner_zf:
+                     for inner_name in inner_zf.namelist():
+                         if inner_name.endswith(("doc", "docx")):
+                             if "cover" in inner_name.lower():
+                                 continue  # Skip cover pages
+                             return convert_to_txt(inner_zf.read(inner_name), inner_name.split(".")[-1])
+             elif file_name.endswith(("doc", "docx")):
+                 if "cover" in file_name.lower():
+                     continue  # Skip cover pages
+                 return convert_to_txt(zf.read(file_name), file_name.split(".")[-1])
+
+     raise Exception(f"No .doc/.docx file found in the ZIP for {specification}-{version}")
+
+ def get_spec_content(specification: str, version: str):
+     text = get_text(specification, version)
+     # The ToC lies between the first "Foreword" mention (its ToC entry)
+     # and the second one (the actual section heading)
+     forewords = []
+     for x in range(len(text)):
+         line = text[x]
+         if "Foreword" in line:
+             forewords.append(x)
+         if len(forewords) >= 2:
+             break
+
+     if len(forewords) < 2:
+         raise Exception(f"Table of contents not found for {specification}-{version}")
+
+     toc_brut = text[forewords[0]:forewords[1]]
+     chapters = []
+     for line in toc_brut:
+         # ToC entries look like "1\tScope" or "4.2.1\tArchitecture", up to five levels deep
+         if re.search(r"^\d+(\.\d+){0,4}\t[\ \S]+", line):
+             x = line.split("\t")
+             chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
+
+     real_toc_indexes = {}
+
+     # Map each chapter heading to its line index in the document body
+     for chapter in chapters:
+         try:
+             real_toc_indexes[chapter] = text.index(chapter)
+         except ValueError:
+             # Fallback: match the first body line carrying the same section number
+             number = chapter.split("\t")[0] + "\t"
+             for line in text[forewords[1]:]:
+                 if number in line:
+                     real_toc_indexes[line] = text.index(line)
+                     break
+             else:
+                 real_toc_indexes[chapter] = -float("inf")
+
+     document = {}
+     toc = list(real_toc_indexes.keys())
+     index_toc = list(real_toc_indexes.values())
+     curr_index = 0
+     # A section's content runs from its heading up to the next heading
+     for x in range(1, len(toc)):
+         document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
+         curr_index = x
+
+     document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
+     return document
+
+ def process_specification(spec: Dict[str, Any], columns: List[str]) -> None:
+     """Process a single specification (worker for the thread pool)."""
+     global processed_count, indexed_specifications, documents_by_spec_num
+
+     try:
+         if spec.get('vers', None) is None:
+             return
+
+         doc_id = str(spec["spec_num"])
+         series = doc_id.split(".")[0]
+
+         a, b, c = str(spec["vers"]).split(".")
+
+         # Build the version code for the archive URL: one base-36 character per
+         # component up to 35, two zero-padded decimal digits per component beyond
+         if not (int(a) > 35 or int(b) > 35 or int(c) > 35):
+             version_code = f"{chars[int(a)]}{chars[int(b)]}{chars[int(c)]}"
+         else:
+             version_code = f"{int(a):02d}{int(b):02d}{int(c):02d}"
+         spec_url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
+
+         spec_key = f"{spec['spec_num']}+-+{spec['title']}+-+{spec['type']}+-+{spec['vers']}+-+{spec['WG']}+-+Rel-{spec['vers'].split('.')[0]}"
+
+         metadata = {
+             "id": str(spec["spec_num"]),
+             "title": spec["title"],
+             "type": spec["type"],
+             "release": str(spec["vers"].split(".")[0]),
+             "version": str(spec["vers"]),
+             "working_group": spec["WG"],
+             "url": spec_url
+         }
+
+         # Check whether content already exists for this spec number
+         spec_num = str(spec["spec_num"])
+
+         with scope_lock:
+             if spec_num in documents_by_spec_num:
+                 # Reuse the document already extracted for the latest release
+                 metadata["content"] = documents_by_spec_num[spec_num]
+                 with print_lock:
+                     print(f"\nReusing the document (latest release) for {spec_num}")
+             else:
+                 # Extract the content only when needed
+                 with print_lock:
+                     print(f"\nExtracting content for {spec_num} (version {version_code})")
+
+                 try:
+                     document = get_spec_content(metadata["id"], version_code)
+                     documents_by_spec_num[spec_num] = document
+                     metadata["content"] = document
+                 except Exception as e:
+                     error_msg = f"Error while extracting the content: {str(e)}"
+                     metadata["content"] = error_msg
+                     documents_by_spec_num[spec_num] = error_msg
+
+         # Update the global dictionary under its lock
+         with dict_lock:
+             indexed_specifications[spec_key] = metadata
+             processed_count += 1
+
+         # Progress display under the print lock
+         with print_lock:
+             sys.stdout.write(f"\rProcessing: {processed_count}/{total_count} specifications")
+             sys.stdout.flush()
+
+     except Exception as e:
+         with print_lock:
+             print(f"\nError while processing {spec.get('spec_num', 'unknown')}: {str(e)}")
+
+ def main():
+     global total_count, documents_by_spec_num
+     start_time = time.time()
+
+     # Fetch the specification list from the 3GPP site
+     print("Fetching specifications from 3GPP...")
+     response = requests.get(
+         'https://www.3gpp.org/dynareport?code=status-report.htm',
+         headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
+         verify=False
+     )
+
+     # Parse the HTML tables
+     dfs = pd.read_html(
+         StringIO(response.text),
+         storage_options={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
+         encoding="utf-8"
+     )
+
+     for x in range(len(dfs)):
+         dfs[x] = dfs[x].replace({np.nan: None})
+
+     # Keep only the needed columns
+     columns_needed = [0, 1, 2, 3, 4]
+     extracted_dfs = [df.iloc[:, columns_needed] for df in dfs]
+     columns = [x.replace("\xa0", "_") for x in extracted_dfs[0].columns]
+
+     # Build the list of specifications
+     specifications = []
+     for df in extracted_dfs:
+         for index, row in df.iterrows():
+             doc = row.to_list()
+             doc_dict = dict(zip(columns, doc))
+             specifications.append(doc_dict)
+
+     total_count = len(specifications)
+     print(f"Processing {total_count} specifications with multithreading...")
+
+     try:
+         # Load previously extracted documents if a cache archive exists
+         if os.path.exists("indexed_docs_content.zip"):
+             with zipfile.ZipFile("indexed_docs_content.zip") as zf:
+                 for file_name in zf.namelist():
+                     if file_name.endswith(".json"):
+                         doc_bytes = zf.read(file_name)
+                         documents_by_spec_num = json.loads(doc_bytes.decode("utf-8"))
+                         print(f"Loaded {len(documents_by_spec_num)} documents from the cache.")
+
+         # Fan the work out over a thread pool
+         with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
+             futures = [executor.submit(process_specification, spec, columns) for spec in specifications]
+             concurrent.futures.wait(futures)
+
+     finally:
+         # Save the results
+         result = {
+             "specs": indexed_specifications,
+             "last_indexed_date": datetime.datetime.today().strftime("%d-%m-%Y")
+         }
+
+         with open("indexed_documents.json", "w", encoding="utf-8") as f:
+             json.dump(documents_by_spec_num, f, indent=4, ensure_ascii=False)
+
+         with open("indexed_specifications.json", "w", encoding="utf-8") as f:
+             json.dump(result, f, indent=4, ensure_ascii=False)
+
+         elapsed_time = time.time() - start_time
+         print(f"\nProcessing finished in {elapsed_time:.2f} seconds")
+         print("Results saved to indexed_specifications.json")
+
+ if __name__ == "__main__":
+     main()
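For reference, the archive URL encodes the version with one base-36 character per component while every component is at most 35, and two zero-padded decimal digits per component otherwise. A standalone restatement of that mapping (the helper name is illustrative, not part of the script):

chars = "0123456789abcdefghijklmnopqrstuvwxyz"

def version_code(vers: str) -> str:
    # e.g. "18.3.0" -> "i30"; "36.1.0" -> "360100"
    a, b, c = (int(p) for p in vers.split("."))
    if max(a, b, c) <= 35:
        return f"{chars[a]}{chars[b]}{chars[c]}"
    return f"{a:02d}{b:02d}{c:02d}"

assert version_code("18.3.0") == "i30"
assert version_code("36.1.0") == "360100"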