Omar ID EL MOUMEN committed on
Commit
dab8149
·
1 Parent(s): 714cb03

Remove indexer options + add scope extractor functions + use them in single-document search

Browse files
Files changed (4) hide show
  1. app.py +72 -2
  2. static/script.js +2 -20
  3. static/style.css +0 -4
  4. templates/index.html +0 -9
app.py CHANGED
@@ -5,9 +5,15 @@ import requests
5
  from bs4 import BeautifulSoup
6
  import json
7
  import os
 
 
 
 
 
 
 
8
  import time
9
  from datetime import datetime
10
- import traceback
11
  from dotenv import load_dotenv
12
  import warnings
13
  from fastapi import FastAPI, HTTPException
@@ -38,6 +44,63 @@ app.add_middleware(
38
  allow_headers=["*"],
39
  )
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  class DocRequest(BaseModel):
42
  doc_id: str
43
  release: Optional[int] = None
@@ -45,6 +108,7 @@ class DocRequest(BaseModel):
45
  class DocResponse(BaseModel):
46
  doc_id: str
47
  url: str
 
48
  search_time: float
49
 
50
  class BatchDocRequest(BaseModel):
@@ -294,6 +358,7 @@ def search_spec(request: KeywordRequest):
294
  )
295
  else:
296
  raise HTTPException(status_code=404, detail="Specification not found")
 
297
  @app.post("/find", response_model=DocResponse)
298
  def find_document(request: DocRequest):
299
  start_time = time.time()
@@ -301,13 +366,18 @@ def find_document(request: DocRequest):
301
  print(finder)
302
 
303
  result = finder.search_document(request.doc_id, request.release)
304
- print(result)
305
 
306
  if "not found" not in result and "Could not" not in result and "Unable" not in result:
 
307
  return DocResponse(
308
  doc_id=request.doc_id,
309
  url=result,
310
  search_time=time.time() - start_time
 
 
 
 
 
311
  )
312
  else:
313
  raise HTTPException(status_code=404, detail=result)
 
5
  from bs4 import BeautifulSoup
6
  import json
7
  import os
8
+ import pymupdf as fitz
9
+ import uuid
10
+ import zipfile
11
+ import io
12
+ import subprocess
13
+ import os
14
+ import re
15
  import time
16
  from datetime import datetime
 
17
  from dotenv import load_dotenv
18
  import warnings
19
  from fastapi import FastAPI, HTTPException
 
44
  allow_headers=["*"],
45
  )
46
 
47
def get_pdf_bytes(specification: str, version: str):
    """Download a 3GPP specification archive and return its document as PDF.

    The ZIP archive for ``specification`` at ``version`` is fetched from the
    3GPP FTP archive, the first Word document (``.doc`` / ``.docx``) listed in
    it is written to a scratch directory, converted to PDF by a headless
    LibreOffice process, and the resulting PDF content is returned.

    Args:
        specification: Specification identifier; the text before the first
            ``.`` selects the ``<series>_series`` directory and the identifier
            with dots removed forms the archive file name.
        version: Version code appended to the archive file name.

    Returns:
        An ``io.BytesIO`` holding the converted PDF.

    Raises:
        Exception: If the download does not answer with HTTP 200, or if the
            archive contains no ``.doc`` / ``.docx`` entry.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
            status.
    """
    # Function-scope import so the file-level import block stays untouched.
    import tempfile

    doc_id = specification
    series = doc_id.split(".")[0]
    archive_url = (
        f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/"
        f"{doc_id.replace('.', '')}-{version}.zip"
    )

    # NOTE(review): verify=False disables TLS certificate validation; it is
    # kept from the original call -- confirm it is still needed for this host.
    # The timeout keeps a stalled server from blocking the request forever.
    response = requests.get(archive_url, verify=False, timeout=30)
    if response.status_code != 200:
        raise Exception("Téléchargement du ZIP échoué")

    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        for file_name in zf.namelist():
            # Compare the real extension (case-insensitively) instead of a
            # bare "doc" suffix, which also matched names such as "foo_doc".
            ext = os.path.splitext(file_name)[1].lower()
            if ext not in (".doc", ".docx"):
                continue

            # A private scratch directory is removed on exit even when the
            # conversion fails, so no input/output files are left behind.
            with tempfile.TemporaryDirectory() as work_dir:
                input_path = os.path.join(work_dir, f"source{ext}")
                output_path = os.path.join(work_dir, "source.pdf")

                with open(input_path, "wb") as f:
                    f.write(zf.read(file_name))

                subprocess.run([
                    "libreoffice",
                    "--headless",
                    "--convert-to", "pdf",
                    "--outdir", work_dir,
                    input_path,
                ], check=True)

                with open(output_path, "rb") as f:
                    return io.BytesIO(f.read())

    raise Exception("Aucun fichier .doc/.docx trouvé dans le ZIP")
84
+
85
def get_scope(specification: str, version: str):
    """Return the text of the "Scope" clause of a 3GPP specification.

    The specification is fetched as a PDF through ``get_pdf_bytes``. The first
    table-of-contents entry whose title contains "scope" gives the page where
    extraction starts. The text from that page to the end of the document is
    whitespace-normalised, and the part found between the "1 Scope" heading
    and the next "1 Scope" / "2 Reference" heading is returned.

    Args:
        specification: Specification identifier passed to ``get_pdf_bytes``.
        version: Version code passed to ``get_pdf_bytes``.

    Returns:
        The extracted scope text (surrounding whitespace is not trimmed).

    Raises:
        ValueError: If the table of contents has no usable "scope" entry, or
            if no heading marker is found in the extracted text.
    """
    pdf_bytes = get_pdf_bytes(specification, version)

    # The context manager closes the document once the text has been read.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # TOC entries are indexed as [level, title, page]; entry[2] is used as
        # a 1-based page number. Entries below page 1 are ignored so the
        # computed index can never be negative.
        page_num = next(
            (
                entry[2] - 1
                for entry in doc.get_toc()
                if "scope" in entry[1].lower() and entry[2] >= 1
            ),
            None,
        )
        if page_num is None:
            raise ValueError(
                f"No 'Scope' entry found in the table of contents of {specification}"
            )

        pdf_full_text = " ".join(
            doc[index].get_text("text") for index in range(page_num, len(doc))
        )

    pdf_postprocess_text = re.sub(r"\s+", " ", pdf_full_text)
    pdf_postprocess_text = pdf_postprocess_text.replace("1 Scope", " !-! ")
    pdf_postprocess_text = pdf_postprocess_text.replace("2 Reference", " !-! ")
    # NOTE(review): the bullet literal was not readable in the reviewed copy
    # (it rendered as an empty string, which would insert "- " between every
    # character). U+F0B7, the usual Symbol-font bullet in Word-derived PDFs,
    # is assumed here -- TODO confirm against the original source.
    pdf_postprocess_text = pdf_postprocess_text.replace("\uf0b7", "- ")

    sections = pdf_postprocess_text.split(" !-! ")
    if len(sections) < 2:
        raise ValueError(
            f"'1 Scope' heading not found in the text of {specification}"
        )
    return sections[1]
103
+
104
  class DocRequest(BaseModel):
105
  doc_id: str
106
  release: Optional[int] = None
 
108
  class DocResponse(BaseModel):
109
  doc_id: str
110
  url: str
111
+ scope: Optional[str] = None
112
  search_time: float
113
 
114
  class BatchDocRequest(BaseModel):
 
358
  )
359
  else:
360
  raise HTTPException(status_code=404, detail="Specification not found")
361
+
362
  @app.post("/find", response_model=DocResponse)
363
  def find_document(request: DocRequest):
364
  start_time = time.time()
 
366
  print(finder)
367
 
368
  result = finder.search_document(request.doc_id, request.release)
 
369
 
370
  if "not found" not in result and "Could not" not in result and "Unable" not in result:
371
+ version = result.split("/")[-1].replace(".zip", "").split("-")[-1]
372
  return DocResponse(
373
  doc_id=request.doc_id,
374
  url=result,
375
  search_time=time.time() - start_time
376
+ ) if isinstance(finder, TsgDocFinder) else DocResponse(
377
+ doc_id=request.doc_id,
378
+ url=result,
379
+ search_time=time.time() - start_time,
380
+ scope=get_scope(request.doc_id, version)
381
  )
382
  else:
383
  raise HTTPException(status_code=404, detail=result)
static/script.js CHANGED
@@ -2,12 +2,10 @@
2
  const singleModeBtn = document.getElementById('single-mode-btn');
3
  const batchModeBtn = document.getElementById('batch-mode-btn');
4
  const keywordModeBtn = document.getElementById("keyword-mode-btn");
5
- const indexerModeBtn = document.getElementById("indexer-mode-btn")
6
 
7
  const singleInput = document.querySelector('.single-input');
8
  const batchInput = document.querySelector('.batch-input');
9
  const keywordSearchInput = document.querySelector(".keyword-input");
10
- const indexerButtons = document.querySelector(".indexer-buttons")
11
 
12
  const docIdInput = document.getElementById('doc-id');
13
  const batchIdsInput = document.getElementById('batch-ids');
@@ -28,23 +26,19 @@ singleModeBtn.addEventListener('click', () => {
28
  singleModeBtn.classList.add('active');
29
  keywordModeBtn.classList.remove("active");
30
  batchModeBtn.classList.remove('active');
31
- indexerModeBtn.classList.remove("active");
32
 
33
  singleInput.style.display = 'block';
34
  batchInput.style.display = 'none';
35
  keywordSearchInput.style.display = "none";
36
- indexerButtons.style.display = "none";
37
  });
38
 
39
  batchModeBtn.addEventListener('click', () => {
40
  batchModeBtn.classList.add('active');
41
  keywordModeBtn.classList.remove("active");
42
  singleModeBtn.classList.remove('active');
43
- indexerModeBtn.classList.remove("active");
44
 
45
  batchInput.style.display = 'block';
46
  keywordSearchInput.style.display = "none";
47
- indexerButtons.style.display = "none";
48
  singleInput.style.display = 'none';
49
  });
50
 
@@ -52,26 +46,12 @@ keywordModeBtn.addEventListener('click', () => {
52
  keywordModeBtn.classList.add("active");
53
  singleModeBtn.classList.remove('active');
54
  batchModeBtn.classList.remove("active");
55
- indexerModeBtn.classList.remove("active");
56
 
57
  singleInput.style.display = "none";
58
  batchInput.style.display = "none";
59
- indexerButtons.style.display = "none";
60
  keywordSearchInput.style.display = "block";
61
  })
62
 
63
- indexerModeBtn.addEventListener('click', ()=>{
64
- keywordModeBtn.classList.remove("active");
65
- singleModeBtn.classList.remove('active');
66
- batchModeBtn.classList.remove("active");
67
- indexerModeBtn.classList.add("active");
68
-
69
- singleInput.style.display = "none";
70
- batchInput.style.display = "none";
71
- indexerButtons.style.display = "block";
72
- keywordSearchInput.style.display = "none";
73
- })
74
-
75
  keywordSearchBtn.addEventListener("click", async ()=>{
76
  const keywords = keywordInput.value.trim();
77
  if (!keywords) {
@@ -191,6 +171,7 @@ function displaySingleResult(data) {
191
 
192
  const resultItem = document.createElement('div');
193
  resultItem.className = 'result-item';
 
194
  resultItem.innerHTML = `
195
  <div class="result-header">
196
  <div class="result-id">${data.doc_id}</div>
@@ -198,6 +179,7 @@ function displaySingleResult(data) {
198
  </div>
199
  <div class="result-url">
200
  <a href="${data.url}" target="_blank">${data.url}</a>
 
201
  </div>
202
  `;
203
 
 
2
  const singleModeBtn = document.getElementById('single-mode-btn');
3
  const batchModeBtn = document.getElementById('batch-mode-btn');
4
  const keywordModeBtn = document.getElementById("keyword-mode-btn");
 
5
 
6
  const singleInput = document.querySelector('.single-input');
7
  const batchInput = document.querySelector('.batch-input');
8
  const keywordSearchInput = document.querySelector(".keyword-input");
 
9
 
10
  const docIdInput = document.getElementById('doc-id');
11
  const batchIdsInput = document.getElementById('batch-ids');
 
26
  singleModeBtn.classList.add('active');
27
  keywordModeBtn.classList.remove("active");
28
  batchModeBtn.classList.remove('active');
 
29
 
30
  singleInput.style.display = 'block';
31
  batchInput.style.display = 'none';
32
  keywordSearchInput.style.display = "none";
 
33
  });
34
 
35
  batchModeBtn.addEventListener('click', () => {
36
  batchModeBtn.classList.add('active');
37
  keywordModeBtn.classList.remove("active");
38
  singleModeBtn.classList.remove('active');
 
39
 
40
  batchInput.style.display = 'block';
41
  keywordSearchInput.style.display = "none";
 
42
  singleInput.style.display = 'none';
43
  });
44
 
 
46
  keywordModeBtn.classList.add("active");
47
  singleModeBtn.classList.remove('active');
48
  batchModeBtn.classList.remove("active");
 
49
 
50
  singleInput.style.display = "none";
51
  batchInput.style.display = "none";
 
52
  keywordSearchInput.style.display = "block";
53
  })
54
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  keywordSearchBtn.addEventListener("click", async ()=>{
56
  const keywords = keywordInput.value.trim();
57
  if (!keywords) {
 
171
 
172
  const resultItem = document.createElement('div');
173
  resultItem.className = 'result-item';
174
+ let scopeItem = data.scope ? `<p>Scope : ${data.scope}</p>` : ""
175
  resultItem.innerHTML = `
176
  <div class="result-header">
177
  <div class="result-id">${data.doc_id}</div>
 
179
  </div>
180
  <div class="result-url">
181
  <a href="${data.url}" target="_blank">${data.url}</a>
182
+ ${scopeItem}
183
  </div>
184
  `;
185
 
static/style.css CHANGED
@@ -171,10 +171,6 @@ header {
171
  display: none;
172
  }
173
 
174
- .indexer-buttons {
175
- display: none;
176
- }
177
-
178
  .batch-input textarea {
179
  width: 100%;
180
  height: 120px;
 
171
  display: none;
172
  }
173
 
 
 
 
 
174
  .batch-input textarea {
175
  width: 100%;
176
  height: 120px;
templates/index.html CHANGED
@@ -28,7 +28,6 @@
28
  <button id="single-mode-btn" class="active">Single Document</button>
29
  <button id="batch-mode-btn">Batch Search</button>
30
  <button id="keyword-mode-btn">Keyword Search</button>
31
- <button id="indexer-mode-btn">Indexer Options</button>
32
  </div>
33
 
34
  <div class="search-form">
@@ -54,14 +53,6 @@
54
  <button id="keyword-search-btn" class="btn">Search</button>
55
  </div>
56
  </div>
57
-
58
- <div class="input-group indexer-buttons">
59
- <label for="indexerBtns">Actions</label>
60
- <div class="input-field">
61
- <button id="indexing-btn" class="btn">Index all files</button>
62
- <button id="testing-btn" class="btn">Test theory</button>
63
- </div>
64
- </div>
65
  </div>
66
 
67
  <div class="error-message" id="error-message"></div>
 
28
  <button id="single-mode-btn" class="active">Single Document</button>
29
  <button id="batch-mode-btn">Batch Search</button>
30
  <button id="keyword-mode-btn">Keyword Search</button>
 
31
  </div>
32
 
33
  <div class="search-form">
 
53
  <button id="keyword-search-btn" class="btn">Search</button>
54
  </div>
55
  </div>
 
 
 
 
 
 
 
 
56
  </div>
57
 
58
  <div class="error-message" id="error-message"></div>