Omar ID EL MOUMEN
committed on
Commit
·
dab8149
1
Parent(s):
714cb03
Remove indexer options, add scope-extraction functions, and use them in single-document search
Browse files- app.py +72 -2
- static/script.js +2 -20
- static/style.css +0 -4
- templates/index.html +0 -9
app.py
CHANGED
@@ -5,9 +5,15 @@ import requests
|
|
5 |
from bs4 import BeautifulSoup
|
6 |
import json
|
7 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
import time
|
9 |
from datetime import datetime
|
10 |
-
import traceback
|
11 |
from dotenv import load_dotenv
|
12 |
import warnings
|
13 |
from fastapi import FastAPI, HTTPException
|
@@ -38,6 +44,63 @@ app.add_middleware(
|
|
38 |
allow_headers=["*"],
|
39 |
)
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
class DocRequest(BaseModel):
|
42 |
doc_id: str
|
43 |
release: Optional[int] = None
|
@@ -45,6 +108,7 @@ class DocRequest(BaseModel):
|
|
45 |
class DocResponse(BaseModel):
|
46 |
doc_id: str
|
47 |
url: str
|
|
|
48 |
search_time: float
|
49 |
|
50 |
class BatchDocRequest(BaseModel):
|
@@ -294,6 +358,7 @@ def search_spec(request: KeywordRequest):
|
|
294 |
)
|
295 |
else:
|
296 |
raise HTTPException(status_code=404, detail="Specification not found")
|
|
|
297 |
@app.post("/find", response_model=DocResponse)
|
298 |
def find_document(request: DocRequest):
|
299 |
start_time = time.time()
|
@@ -301,13 +366,18 @@ def find_document(request: DocRequest):
|
|
301 |
print(finder)
|
302 |
|
303 |
result = finder.search_document(request.doc_id, request.release)
|
304 |
-
print(result)
|
305 |
|
306 |
if "not found" not in result and "Could not" not in result and "Unable" not in result:
|
|
|
307 |
return DocResponse(
|
308 |
doc_id=request.doc_id,
|
309 |
url=result,
|
310 |
search_time=time.time() - start_time
|
|
|
|
|
|
|
|
|
|
|
311 |
)
|
312 |
else:
|
313 |
raise HTTPException(status_code=404, detail=result)
|
|
|
5 |
from bs4 import BeautifulSoup
|
6 |
import json
|
7 |
import os
|
8 |
+
import pymupdf as fitz
|
9 |
+
import uuid
|
10 |
+
import zipfile
|
11 |
+
import io
|
12 |
+
import subprocess
|
13 |
+
import os
|
14 |
+
import re
|
15 |
import time
|
16 |
from datetime import datetime
|
|
|
17 |
from dotenv import load_dotenv
|
18 |
import warnings
|
19 |
from fastapi import FastAPI, HTTPException
|
|
|
44 |
allow_headers=["*"],
|
45 |
)
|
46 |
|
47 |
+
def get_pdf_bytes(specification: str, version: str, timeout: float = 30.0):
    """Download a 3GPP specification archive and return its document as PDF.

    The ZIP for ``specification`` / ``version`` is fetched from the 3GPP
    archive, the first ``.doc``/``.docx`` member is written to a private
    temporary directory, converted to PDF with headless LibreOffice, and the
    resulting PDF is returned in memory.

    Args:
        specification: Dotted specification number (e.g. ``"23.501"``); the
            part before the first dot selects the ``<series>_series`` folder.
        version: Version code used in the archive file name
            (``<spec without dots>-<version>.zip``).
        timeout: Seconds passed to ``requests.get`` so a stalled download
            cannot block the caller forever.

    Returns:
        An ``io.BytesIO`` holding the converted PDF.

    Raises:
        Exception: If the download does not return HTTP 200, or the ZIP holds
            no ``.doc``/``.docx`` member.
        subprocess.CalledProcessError: If LibreOffice exits with an error.
        FileNotFoundError: If LibreOffice produced no PDF output.
    """
    # Local import: only this function needs it, and the file-level import
    # block is left untouched.
    import tempfile

    doc_id = specification
    series = doc_id.split(".")[0]
    url = (
        f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/"
        f"{doc_id.replace('.', '')}-{version}.zip"
    )
    # NOTE(review): TLS verification is disabled, exactly as before; confirm
    # this is still required for www.3gpp.org.
    response = requests.get(url, verify=False, timeout=timeout)
    if response.status_code != 200:
        raise Exception("Téléchargement du ZIP échoué")

    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        for file_name in zf.namelist():
            # Compare the real extension, case-insensitively, so "X.DOC" is
            # accepted and names that merely end in "doc" are not.
            ext = os.path.splitext(file_name)[1].lower()
            if ext not in (".doc", ".docx"):
                continue

            # The temporary directory is removed even when the conversion
            # fails, so no stray files are left behind.
            with tempfile.TemporaryDirectory() as work_dir:
                input_path = os.path.join(work_dir, f"source{ext}")
                # LibreOffice names the output after the input's base name.
                output_path = os.path.join(work_dir, "source.pdf")

                with open(input_path, "wb") as f:
                    f.write(zf.read(file_name))

                subprocess.run(
                    [
                        "libreoffice",
                        "--headless",
                        "--convert-to", "pdf",
                        "--outdir", work_dir,
                        input_path,
                    ],
                    check=True,
                )

                with open(output_path, "rb") as f:
                    return io.BytesIO(f.read())

    raise Exception("Aucun fichier .doc/.docx trouvé dans le ZIP")
|
84 |
+
|
85 |
+
def get_scope(specification: str, version: str):
    """Extract the text of the "Scope" clause of a specification.

    The specification is converted to PDF via ``get_pdf_bytes``; the first
    table-of-contents entry whose title contains "scope" gives the starting
    page. Page text from there on is whitespace-normalised, and the text
    between the "1 Scope" heading and the next "1 Scope" / "2 Reference"
    heading is returned.

    Args:
        specification: Dotted specification number, forwarded to
            ``get_pdf_bytes``.
        version: Version code, forwarded to ``get_pdf_bytes``.

    Returns:
        The scope text with surrounding whitespace removed, or ``None`` when
        the PDF has no "scope" TOC entry or the "1 Scope" heading cannot be
        found in the text (previously these cases crashed with
        ``UnboundLocalError`` / ``IndexError``).
    """
    scope_heading = "1 Scope"
    next_heading = "2 Reference"
    separator = " !-! "

    pdf_bytes = get_pdf_bytes(specification, version)

    # Context manager so the document is closed on every path.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        start_page = None
        for _level, title, page_number, *_ in doc.get_toc():
            # TOC page numbers are 1-based; skip entries with no valid target.
            if "scope" in title.lower() and page_number >= 1:
                start_page = page_number - 1
                break
        if start_page is None:
            return None

        page_texts = []
        headings_seen = 0
        for index in range(start_page, doc.page_count):
            page_text = re.sub(r"\s+", " ", doc[index].get_text("text"))
            page_texts.append(page_text)
            headings_seen += page_text.count(scope_heading)
            headings_seen += page_text.count(next_heading)
            # Two headings delimit the wanted segment; later pages cannot
            # change it, so stop extracting text here.
            if headings_seen >= 2:
                break

    text = re.sub(r"\s+", " ", " ".join(page_texts))
    text = text.replace(scope_heading, separator)
    text = text.replace(next_heading, separator)
    # NOTE(review): the search string appeared empty in the reviewed source,
    # which would insert "- " between every character. Presumably it was the
    # Symbol-font bullet (U+F0B7) emitted for Word list items -- confirm.
    text = text.replace("\uf0b7", "- ")

    segments = text.split(separator)
    if len(segments) < 2:
        return None
    return segments[1].strip()
|
103 |
+
|
104 |
class DocRequest(BaseModel):
|
105 |
doc_id: str
|
106 |
release: Optional[int] = None
|
|
|
108 |
class DocResponse(BaseModel):
|
109 |
doc_id: str
|
110 |
url: str
|
111 |
+
scope: Optional[str] = None
|
112 |
search_time: float
|
113 |
|
114 |
class BatchDocRequest(BaseModel):
|
|
|
358 |
)
|
359 |
else:
|
360 |
raise HTTPException(status_code=404, detail="Specification not found")
|
361 |
+
|
362 |
@app.post("/find", response_model=DocResponse)
|
363 |
def find_document(request: DocRequest):
|
364 |
start_time = time.time()
|
|
|
366 |
print(finder)
|
367 |
|
368 |
result = finder.search_document(request.doc_id, request.release)
|
|
|
369 |
|
370 |
if "not found" not in result and "Could not" not in result and "Unable" not in result:
|
371 |
+
version = result.split("/")[-1].replace(".zip", "").split("-")[-1]
|
372 |
return DocResponse(
|
373 |
doc_id=request.doc_id,
|
374 |
url=result,
|
375 |
search_time=time.time() - start_time
|
376 |
+
) if isinstance(finder, TsgDocFinder) else DocResponse(
|
377 |
+
doc_id=request.doc_id,
|
378 |
+
url=result,
|
379 |
+
search_time=time.time() - start_time,
|
380 |
+
scope=get_scope(request.doc_id, version)
|
381 |
)
|
382 |
else:
|
383 |
raise HTTPException(status_code=404, detail=result)
|
static/script.js
CHANGED
@@ -2,12 +2,10 @@
|
|
2 |
const singleModeBtn = document.getElementById('single-mode-btn');
|
3 |
const batchModeBtn = document.getElementById('batch-mode-btn');
|
4 |
const keywordModeBtn = document.getElementById("keyword-mode-btn");
|
5 |
-
const indexerModeBtn = document.getElementById("indexer-mode-btn")
|
6 |
|
7 |
const singleInput = document.querySelector('.single-input');
|
8 |
const batchInput = document.querySelector('.batch-input');
|
9 |
const keywordSearchInput = document.querySelector(".keyword-input");
|
10 |
-
const indexerButtons = document.querySelector(".indexer-buttons")
|
11 |
|
12 |
const docIdInput = document.getElementById('doc-id');
|
13 |
const batchIdsInput = document.getElementById('batch-ids');
|
@@ -28,23 +26,19 @@ singleModeBtn.addEventListener('click', () => {
|
|
28 |
singleModeBtn.classList.add('active');
|
29 |
keywordModeBtn.classList.remove("active");
|
30 |
batchModeBtn.classList.remove('active');
|
31 |
-
indexerModeBtn.classList.remove("active");
|
32 |
|
33 |
singleInput.style.display = 'block';
|
34 |
batchInput.style.display = 'none';
|
35 |
keywordSearchInput.style.display = "none";
|
36 |
-
indexerButtons.style.display = "none";
|
37 |
});
|
38 |
|
39 |
batchModeBtn.addEventListener('click', () => {
|
40 |
batchModeBtn.classList.add('active');
|
41 |
keywordModeBtn.classList.remove("active");
|
42 |
singleModeBtn.classList.remove('active');
|
43 |
-
indexerModeBtn.classList.remove("active");
|
44 |
|
45 |
batchInput.style.display = 'block';
|
46 |
keywordSearchInput.style.display = "none";
|
47 |
-
indexerButtons.style.display = "none";
|
48 |
singleInput.style.display = 'none';
|
49 |
});
|
50 |
|
@@ -52,26 +46,12 @@ keywordModeBtn.addEventListener('click', () => {
|
|
52 |
keywordModeBtn.classList.add("active");
|
53 |
singleModeBtn.classList.remove('active');
|
54 |
batchModeBtn.classList.remove("active");
|
55 |
-
indexerModeBtn.classList.remove("active");
|
56 |
|
57 |
singleInput.style.display = "none";
|
58 |
batchInput.style.display = "none";
|
59 |
-
indexerButtons.style.display = "none";
|
60 |
keywordSearchInput.style.display = "block";
|
61 |
})
|
62 |
|
63 |
-
indexerModeBtn.addEventListener('click', ()=>{
|
64 |
-
keywordModeBtn.classList.remove("active");
|
65 |
-
singleModeBtn.classList.remove('active');
|
66 |
-
batchModeBtn.classList.remove("active");
|
67 |
-
indexerModeBtn.classList.add("active");
|
68 |
-
|
69 |
-
singleInput.style.display = "none";
|
70 |
-
batchInput.style.display = "none";
|
71 |
-
indexerButtons.style.display = "block";
|
72 |
-
keywordSearchInput.style.display = "none";
|
73 |
-
})
|
74 |
-
|
75 |
keywordSearchBtn.addEventListener("click", async ()=>{
|
76 |
const keywords = keywordInput.value.trim();
|
77 |
if (!keywords) {
|
@@ -191,6 +171,7 @@ function displaySingleResult(data) {
|
|
191 |
|
192 |
const resultItem = document.createElement('div');
|
193 |
resultItem.className = 'result-item';
|
|
|
194 |
resultItem.innerHTML = `
|
195 |
<div class="result-header">
|
196 |
<div class="result-id">${data.doc_id}</div>
|
@@ -198,6 +179,7 @@ function displaySingleResult(data) {
|
|
198 |
</div>
|
199 |
<div class="result-url">
|
200 |
<a href="${data.url}" target="_blank">${data.url}</a>
|
|
|
201 |
</div>
|
202 |
`;
|
203 |
|
|
|
2 |
const singleModeBtn = document.getElementById('single-mode-btn');
|
3 |
const batchModeBtn = document.getElementById('batch-mode-btn');
|
4 |
const keywordModeBtn = document.getElementById("keyword-mode-btn");
|
|
|
5 |
|
6 |
const singleInput = document.querySelector('.single-input');
|
7 |
const batchInput = document.querySelector('.batch-input');
|
8 |
const keywordSearchInput = document.querySelector(".keyword-input");
|
|
|
9 |
|
10 |
const docIdInput = document.getElementById('doc-id');
|
11 |
const batchIdsInput = document.getElementById('batch-ids');
|
|
|
26 |
singleModeBtn.classList.add('active');
|
27 |
keywordModeBtn.classList.remove("active");
|
28 |
batchModeBtn.classList.remove('active');
|
|
|
29 |
|
30 |
singleInput.style.display = 'block';
|
31 |
batchInput.style.display = 'none';
|
32 |
keywordSearchInput.style.display = "none";
|
|
|
33 |
});
|
34 |
|
35 |
batchModeBtn.addEventListener('click', () => {
|
36 |
batchModeBtn.classList.add('active');
|
37 |
keywordModeBtn.classList.remove("active");
|
38 |
singleModeBtn.classList.remove('active');
|
|
|
39 |
|
40 |
batchInput.style.display = 'block';
|
41 |
keywordSearchInput.style.display = "none";
|
|
|
42 |
singleInput.style.display = 'none';
|
43 |
});
|
44 |
|
|
|
46 |
keywordModeBtn.classList.add("active");
|
47 |
singleModeBtn.classList.remove('active');
|
48 |
batchModeBtn.classList.remove("active");
|
|
|
49 |
|
50 |
singleInput.style.display = "none";
|
51 |
batchInput.style.display = "none";
|
|
|
52 |
keywordSearchInput.style.display = "block";
|
53 |
})
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
keywordSearchBtn.addEventListener("click", async ()=>{
|
56 |
const keywords = keywordInput.value.trim();
|
57 |
if (!keywords) {
|
|
|
171 |
|
172 |
const resultItem = document.createElement('div');
|
173 |
resultItem.className = 'result-item';
|
174 |
+
let scopeItem = data.scope ? `<p>Scope : ${data.scope}</p>` : ""
|
175 |
resultItem.innerHTML = `
|
176 |
<div class="result-header">
|
177 |
<div class="result-id">${data.doc_id}</div>
|
|
|
179 |
</div>
|
180 |
<div class="result-url">
|
181 |
<a href="${data.url}" target="_blank">${data.url}</a>
|
182 |
+
${scopeItem}
|
183 |
</div>
|
184 |
`;
|
185 |
|
static/style.css
CHANGED
@@ -171,10 +171,6 @@ header {
|
|
171 |
display: none;
|
172 |
}
|
173 |
|
174 |
-
.indexer-buttons {
|
175 |
-
display: none;
|
176 |
-
}
|
177 |
-
|
178 |
.batch-input textarea {
|
179 |
width: 100%;
|
180 |
height: 120px;
|
|
|
171 |
display: none;
|
172 |
}
|
173 |
|
|
|
|
|
|
|
|
|
174 |
.batch-input textarea {
|
175 |
width: 100%;
|
176 |
height: 120px;
|
templates/index.html
CHANGED
@@ -28,7 +28,6 @@
|
|
28 |
<button id="single-mode-btn" class="active">Single Document</button>
|
29 |
<button id="batch-mode-btn">Batch Search</button>
|
30 |
<button id="keyword-mode-btn">Keyword Search</button>
|
31 |
-
<button id="indexer-mode-btn">Indexer Options</button>
|
32 |
</div>
|
33 |
|
34 |
<div class="search-form">
|
@@ -54,14 +53,6 @@
|
|
54 |
<button id="keyword-search-btn" class="btn">Search</button>
|
55 |
</div>
|
56 |
</div>
|
57 |
-
|
58 |
-
<div class="input-group indexer-buttons">
|
59 |
-
<label for="indexerBtns">Actions</label>
|
60 |
-
<div class="input-field">
|
61 |
-
<button id="indexing-btn" class="btn">Index all files</button>
|
62 |
-
<button id="testing-btn" class="btn">Test theory</button>
|
63 |
-
</div>
|
64 |
-
</div>
|
65 |
</div>
|
66 |
|
67 |
<div class="error-message" id="error-message"></div>
|
|
|
28 |
<button id="single-mode-btn" class="active">Single Document</button>
|
29 |
<button id="batch-mode-btn">Batch Search</button>
|
30 |
<button id="keyword-mode-btn">Keyword Search</button>
|
|
|
31 |
</div>
|
32 |
|
33 |
<div class="search-form">
|
|
|
53 |
<button id="keyword-search-btn" class="btn">Search</button>
|
54 |
</div>
|
55 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
</div>
|
57 |
|
58 |
<div class="error-message" id="error-message"></div>
|