om4r932 committed
Commit 219f767 · 1 Parent(s): 0619166

First version

Files changed (4)
  1. Dockerfile +17 -0
  2. app.py +111 -0
  3. requirements.txt +8 -0
  4. spec_indexer_multi_doc.py +327 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.9
+
+ RUN apt-get update && \
+     apt-get install -y libreoffice libreoffice-writer libreoffice-calc libreoffice-impress && \
+     apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
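The image runs the FastAPI app with uvicorn on port 7860. A minimal smoke test for the running container, assuming the port is published locally (e.g. docker run -p 7860:7860 ...) and using an illustrative spec_id:

import requests

# POST to the /get_spec_content endpoint defined in app.py;
# the spec_id value below is a hypothetical placeholder.
resp = requests.post(
    "http://localhost:7860/get_spec_content",
    json={"spec_id": "103 666-1"},  # illustrative ETSI spec number
    timeout=120,
)
resp.raise_for_status()
sections = resp.json()  # {"<section title>": "<section content>", ...}
print(list(sections)[:5])
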
app.py ADDED
@@ -0,0 +1,111 @@
+ import requests, os, re, warnings, fitz
+ warnings.filterwarnings("ignore")
+ from dotenv import load_dotenv
+ from datasets import load_dataset
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+
+ load_dotenv()
+
+ app = FastAPI(title="ETSI Specification Splitter API",
+               description="API to split and display specifications by their chapters & sub-chapters",
+               docs_url="/")
+
+ origins = [
+     "*",
+ ]
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Pre-indexed spec sections, loaded once at startup
+ spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent", token=os.environ["HF_TOKEN"])
+ spec_contents = spec_contents["train"].to_list()
+
+ def is_doc_indexed(spec_id: str):
+     return any(spec_id == s["doc_id"] for s in spec_contents)
+
+ def get_full_doc(spec_id: str):
+     doc = []
+     for spec in spec_contents:
+         if spec["doc_id"] == spec_id:
+             doc.append(f"{spec['section']}\n{spec['content']}")
+     return "\n\n".join(doc)
+
+ def get_structured_doc(spec_id: str):
+     doc = {}
+     for spec in spec_contents:
+         if spec["doc_id"] == spec_id:
+             doc[spec["section"]] = spec["content"]
+     return doc
+
+
+ class SpecRequest(BaseModel):
+     spec_id: str
+
+ def get_pdf_data(request: SpecRequest):
+     specification = request.spec_id
+     url = requests.post(
+         "https://organizedprogrammers-etsidocfinder.hf.space/find",
+         verify=False,
+         headers={"Content-Type": "application/json"},
+         json={"doc_id": specification}
+     )
+
+     if url.status_code != 200:
+         raise HTTPException(404, detail="Not found")
+
+     url = url.json()['url']
+     response = requests.get(
+         url,
+         verify=False,
+         headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}
+     )
+
+     pdf = fitz.open(stream=response.content, filetype="pdf")
+     return pdf, pdf.get_toc()
+
+ @app.post("/get_spec_content")
+ def get_spec_content(request: SpecRequest):
+     # Serve the pre-indexed document directly when available; get_pdf_data
+     # always returns a (pdf, toc) pair, so the cached path is handled here
+     if is_doc_indexed(request.spec_id):
+         return get_structured_doc(request.spec_id)
+
+     def extract_sections(text, titles):
+         sections = {}
+         # Sort the titles by their position in the text
+         sorted_titles = sorted(titles, key=lambda t: text.find(t))
+         for i, title in enumerate(sorted_titles):
+             start = text.find(title)
+             end = text.find(sorted_titles[i + 1]) if i + 1 < len(sorted_titles) else len(text)
+             sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip())
+         return sections
+
+     print("\n[INFO] Attempting to retrieve the text", flush=True)
+     pdf, doc_toc = get_pdf_data(request)
+     text = []
+     first = 0
+     # Skip front matter: start at the page of the first numbered ToC entry
+     for level, title, page in doc_toc:
+         if title and title[0].isnumeric():
+             first = page - 1
+             break
+     for page in pdf[first:]:
+         text.append("\n".join(line.strip() for line in page.get_text().splitlines()))
+     text = "\n".join(text)
+
+     if not text or not doc_toc:
+         print("\n[ERROR] No text/table of contents found!")
+         return {}
+     print(f"\n[INFO] Text of {request.spec_id} retrieved", flush=True)
+     titles = []
+     for level, title, page in doc_toc:
+         # In the extracted text, a heading appears as "<number>\n<title>"
+         if title and title[0].isnumeric() and '\n'.join(title.strip().split(" ", 1)) in text:
+             titles.append('\n'.join(title.strip().split(" ", 1)))
+
+     return extract_sections(text, titles)
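The endpoint above slices the flattened PDF text between consecutive ToC titles. A self-contained sketch of that splitting step on toy input (same logic as the nested extract_sections, stdlib only):

import re

def extract_sections(text, titles):
    # Order titles by their position, then slice between consecutive matches
    sorted_titles = sorted(titles, key=lambda t: text.find(t))
    sections = {}
    for i, title in enumerate(sorted_titles):
        start = text.find(title)
        end = text.find(sorted_titles[i + 1]) if i + 1 < len(sorted_titles) else len(text)
        sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip())
    return sections

toy = "1\nScope\nThis document defines X.\n2\nReferences\nSee elsewhere."
print(extract_sections(toy, ["1\nScope", "2\nReferences"]))
# -> {'1 Scope': 'This document defines X.', '2 References': 'See elsewhere.'}
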
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ fastapi
+ uvicorn[standard]
+ requests
+ pydantic
+ lxml
+ datasets
+ python-dotenv
+ pymupdf
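Note on the last entry: app.py imports fitz, which is PyMuPDF's import name; the PyPI distribution is pymupdf, while the unrelated fitz package on PyPI does not provide the fitz.open API the app uses. A quick sanity check after installation:

# Verify that the installed "fitz" is PyMuPDF (it exposes open()):
import fitz
assert hasattr(fitz, "open"), "Wrong 'fitz' package: install 'pymupdf'"
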
spec_indexer_multi_doc.py ADDED
@@ -0,0 +1,327 @@
+ import datetime
+ import time
+ import sys
+ import json
+ import traceback
+ import requests
+ import zipfile
+ import uuid
+ import os
+ import re
+ import subprocess
+ import concurrent.futures
+ import threading
+ from io import StringIO, BytesIO
+ from typing import List, Dict, Any
+
+ import pandas as pd
+ import numpy as np
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+ # Characters used to format version codes (base-36 digits)
+ chars = "0123456789abcdefghijklmnopqrstuvwxyz"
+
+ # Locks for thread-safe operations
+ print_lock = threading.Lock()
+ dict_lock = threading.Lock()
+ scope_lock = threading.Lock()
+
+ # Global dictionaries
+ indexed_specifications = {}
+ documents_by_spec_num = {}
+ processed_count = 0
+ total_count = 0
+
+ def get_text(specification: str, version: str):
+     """Download the spec archive from 3GPP and return the document text as a list of non-empty lines."""
+     doc_id = specification
+     series = doc_id.split(".")[0]
+
+     response = requests.get(
+         f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
+         verify=False,
+         headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
+     )
+
+     if response.status_code != 200:
+         raise Exception(f"ZIP download failed for {specification}-{version}")
+
+     def convert_to_txt(doc_bytes: bytes, ext: str):
+         """Convert .doc/.docx bytes to a list of text lines via a headless LibreOffice run."""
+         temp_id = str(uuid.uuid4())
+         input_path = f"/tmp/{temp_id}.{ext}"
+         output_path = f"/tmp/{temp_id}.txt"
+
+         with open(input_path, "wb") as f:
+             f.write(doc_bytes)
+
+         subprocess.run([
+             "libreoffice",
+             "--headless",
+             "--convert-to", "txt",
+             "--outdir", "/tmp",
+             input_path
+         ], check=True)
+
+         with open(output_path, "r", encoding="utf-8") as f:
+             txt_data = [line.strip() for line in f if line.strip()]
+
+         os.remove(input_path)
+         os.remove(output_path)
+         return txt_data
+
+     zip_bytes = BytesIO(response.content)
+     with zipfile.ZipFile(zip_bytes) as zf:
+         for file_name in zf.namelist():
+             if file_name.endswith("zip"):
+                 # Some archives nest the document inside a second ZIP
+                 with zipfile.ZipFile(BytesIO(zf.read(file_name))) as inner_zf:
+                     for inner_name in inner_zf.namelist():
+                         if inner_name.endswith(("doc", "docx")):
+                             if "cover" in inner_name.lower():
+                                 continue  # Skip cover pages
+                             return convert_to_txt(inner_zf.read(inner_name), inner_name.split(".")[-1])
+             elif file_name.endswith(("doc", "docx")):
+                 if "cover" in file_name.lower():
+                     continue  # Skip cover pages
+                 return convert_to_txt(zf.read(file_name), file_name.split(".")[-1])
+
+     raise Exception(f"No .doc/.docx file found in the ZIP for {specification}-{version}")
+
+ def get_spec_content(specification: str, version: str):
+     text = get_text(specification, version)
+     # The ToC lies between the first "Foreword" mention (its ToC entry)
+     # and the second one (the actual section heading)
+     forewords = []
+     for x in range(len(text)):
+         line = text[x]
+         if "Foreword" in line:
+             forewords.append(x)
+         if len(forewords) >= 2:
+             break
+
+     if len(forewords) < 2:
+         raise Exception(f"Table of contents not found for {specification}-{version}")
+
+     toc_brut = text[forewords[0]:forewords[1]]
+     chapters = []
+     for line in toc_brut:
+         # ToC entries look like "1\tScope" or "4.2.1\tArchitecture", up to five levels deep
+         if re.search(r"^\d+(\.\d+){0,4}\t[\ \S]+", line):
+             x = line.split("\t")
+             chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
+
+     real_toc_indexes = {}
+
+     # Map each chapter heading to its line index in the document body
+     for chapter in chapters:
+         try:
+             real_toc_indexes[chapter] = text.index(chapter)
+         except ValueError:
+             # Fallback: match the first body line carrying the same section number
+             number = chapter.split("\t")[0] + "\t"
+             for line in text[forewords[1]:]:
+                 if number in line:
+                     real_toc_indexes[line] = text.index(line)
+                     break
+             else:
+                 real_toc_indexes[chapter] = -float("inf")
+
+     document = {}
+     toc = list(real_toc_indexes.keys())
+     index_toc = list(real_toc_indexes.values())
+     curr_index = 0
+     # A section's content runs from its heading up to the next heading
+     for x in range(1, len(toc)):
+         document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
+         curr_index = x
+
+     document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
+     return document
+
+ def process_specification(spec: Dict[str, Any], columns: List[str]) -> None:
+     """Process a single specification (worker for the thread pool)."""
+     global processed_count, indexed_specifications, documents_by_spec_num
+
+     try:
+         if spec.get('vers', None) is None:
+             return
+
+         doc_id = str(spec["spec_num"])
+         series = doc_id.split(".")[0]
+
+         a, b, c = str(spec["vers"]).split(".")
+
+         # Build the version code for the archive URL: one base-36 character per
+         # component up to 35, two zero-padded decimal digits per component beyond
+         if not (int(a) > 35 or int(b) > 35 or int(c) > 35):
+             version_code = f"{chars[int(a)]}{chars[int(b)]}{chars[int(c)]}"
+         else:
+             version_code = f"{int(a):02d}{int(b):02d}{int(c):02d}"
+         spec_url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
+
+         spec_key = f"{spec['spec_num']}+-+{spec['title']}+-+{spec['type']}+-+{spec['vers']}+-+{spec['WG']}+-+Rel-{spec['vers'].split('.')[0]}"
+
+         metadata = {
+             "id": str(spec["spec_num"]),
+             "title": spec["title"],
+             "type": spec["type"],
+             "release": str(spec["vers"].split(".")[0]),
+             "version": str(spec["vers"]),
+             "working_group": spec["WG"],
+             "url": spec_url
+         }
+
+         # Check whether content already exists for this spec number
+         spec_num = str(spec["spec_num"])
+
+         with scope_lock:
+             if spec_num in documents_by_spec_num:
+                 # Reuse the document already extracted for the latest release
+                 metadata["content"] = documents_by_spec_num[spec_num]
+                 with print_lock:
+                     print(f"\nReusing the document (latest release) for {spec_num}")
+             else:
+                 # Extract the content only when needed
+                 with print_lock:
+                     print(f"\nExtracting content for {spec_num} (version {version_code})")
+
+                 try:
+                     document = get_spec_content(metadata["id"], version_code)
+                     documents_by_spec_num[spec_num] = document
+                     metadata["content"] = document
+                 except Exception as e:
+                     error_msg = f"Error while extracting the content: {str(e)}"
+                     metadata["content"] = error_msg
+                     documents_by_spec_num[spec_num] = error_msg
+
+         # Update the global dictionary under its lock
+         with dict_lock:
+             indexed_specifications[spec_key] = metadata
+             processed_count += 1
+
+         # Progress display under the print lock
+         with print_lock:
+             sys.stdout.write(f"\rProcessing: {processed_count}/{total_count} specifications")
+             sys.stdout.flush()
+
+     except Exception as e:
+         with print_lock:
+             print(f"\nError while processing {spec.get('spec_num', 'unknown')}: {str(e)}")
+
+ def main():
+     global total_count, documents_by_spec_num
+     start_time = time.time()
+
+     # Fetch the specification list from the 3GPP site
+     print("Fetching specifications from 3GPP...")
+     response = requests.get(
+         'https://www.3gpp.org/dynareport?code=status-report.htm',
+         headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
+         verify=False
+     )
+
+     # Parse the HTML tables
+     dfs = pd.read_html(
+         StringIO(response.text),
+         storage_options={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
+         encoding="utf-8"
+     )
+
+     for x in range(len(dfs)):
+         dfs[x] = dfs[x].replace({np.nan: None})
+
+     # Keep only the needed columns
+     columns_needed = [0, 1, 2, 3, 4]
+     extracted_dfs = [df.iloc[:, columns_needed] for df in dfs]
+     columns = [x.replace("\xa0", "_") for x in extracted_dfs[0].columns]
+
+     # Build the list of specifications
+     specifications = []
+     for df in extracted_dfs:
+         for index, row in df.iterrows():
+             doc = row.to_list()
+             doc_dict = dict(zip(columns, doc))
+             specifications.append(doc_dict)
+
+     total_count = len(specifications)
+     print(f"Processing {total_count} specifications with multithreading...")
+
+     try:
+         # Load previously extracted documents if a cache archive exists
+         if os.path.exists("indexed_docs_content.zip"):
+             with zipfile.ZipFile("indexed_docs_content.zip") as zf:
+                 for file_name in zf.namelist():
+                     if file_name.endswith(".json"):
+                         doc_bytes = zf.read(file_name)
+                         documents_by_spec_num = json.loads(doc_bytes.decode("utf-8"))
+                         print(f"Loaded {len(documents_by_spec_num)} documents from the cache.")
+
+         # Fan the work out over a thread pool
+         with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
+             futures = [executor.submit(process_specification, spec, columns) for spec in specifications]
+             concurrent.futures.wait(futures)
+
+     finally:
+         # Save the results
+         result = {
+             "specs": indexed_specifications,
+             "last_indexed_date": datetime.datetime.today().strftime("%d-%m-%Y")
+         }
+
+         with open("indexed_documents.json", "w", encoding="utf-8") as f:
+             json.dump(documents_by_spec_num, f, indent=4, ensure_ascii=False)
+
+         with open("indexed_specifications.json", "w", encoding="utf-8") as f:
+             json.dump(result, f, indent=4, ensure_ascii=False)
+
+         elapsed_time = time.time() - start_time
+         print(f"\nProcessing finished in {elapsed_time:.2f} seconds")
+         print("Results saved to indexed_specifications.json")
+
+ if __name__ == "__main__":
+     main()
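For reference, the archive URL encodes the version with one base-36 character per component while every component is at most 35, and two zero-padded decimal digits per component otherwise. A standalone restatement of that mapping (the helper name is illustrative, not part of the script):

chars = "0123456789abcdefghijklmnopqrstuvwxyz"

def version_code(vers: str) -> str:
    # e.g. "18.3.0" -> "i30"; "36.1.0" -> "360100"
    a, b, c = (int(p) for p in vers.split("."))
    if max(a, b, c) <= 35:
        return f"{chars[a]}{chars[b]}{chars[c]}"
    return f"{a:02d}{b:02d}{c:02d}"

assert version_code("18.3.0") == "i30"
assert version_code("36.1.0") == "360100"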