Omar ID EL MOUMEN
committed on
Commit
·
acaccf0
1
Parent(s):
8840360
Add specification search
Browse files- app.py +60 -22
- static/script.js +3 -3
app.py
CHANGED
@@ -32,15 +32,17 @@ app.add_middleware(
|
|
32 |
)
|
33 |
|
34 |
class DocRequest(BaseModel):
|
35 |
-
|
|
|
36 |
|
37 |
class DocResponse(BaseModel):
|
38 |
-
|
39 |
url: str
|
40 |
search_time: float
|
41 |
|
42 |
class BatchDocRequest(BaseModel):
|
43 |
-
|
|
|
44 |
|
45 |
class BatchDocResponse(BaseModel):
|
46 |
results: Dict[str, str]
|
@@ -65,12 +67,12 @@ class TsgDocFinder:
|
|
65 |
with open(self.indexer_file, "w", encoding="utf-8") as f:
|
66 |
json.dump(self.indexer, f, indent=4, ensure_ascii=False)
|
67 |
|
68 |
-
def get_workgroup(self,
|
69 |
-
main_tsg = "tsg_ct" if
|
70 |
if main_tsg is None:
|
71 |
return None, None, None
|
72 |
-
workgroup = f"WG{int(
|
73 |
-
return main_tsg, workgroup,
|
74 |
|
75 |
def find_workgroup_url(self, main_tsg, workgroup):
|
76 |
"""Find the URL for the specific workgroup"""
|
@@ -94,25 +96,25 @@ class TsgDocFinder:
|
|
94 |
print(f"Error accessing {url}: {e}")
|
95 |
return []
|
96 |
|
97 |
-
def search_document(self,
|
98 |
"""Search for a specific document by its ID"""
|
99 |
-
original_id =
|
100 |
|
101 |
# Check if already indexed
|
102 |
if original_id in self.indexer:
|
103 |
return self.indexer[original_id]
|
104 |
|
105 |
# Parse the document ID
|
106 |
-
main_tsg, workgroup,
|
107 |
if not main_tsg:
|
108 |
-
return f"Could not parse document ID: {
|
109 |
|
110 |
-
print(f"Searching for {original_id} (parsed as {
|
111 |
|
112 |
# Find the workgroup URL
|
113 |
wg_url = self.find_workgroup_url(main_tsg, workgroup)
|
114 |
if not wg_url:
|
115 |
-
return f"Could not find workgroup for {
|
116 |
|
117 |
# Search in the workgroup directories
|
118 |
meeting_folders = self.get_docs_from_url(wg_url)
|
@@ -128,7 +130,7 @@ class TsgDocFinder:
|
|
128 |
|
129 |
# Check for the document in the main Docs folder
|
130 |
for file in files:
|
131 |
-
if
|
132 |
doc_url = f"{docs_url}/{file}"
|
133 |
self.indexer[original_id] = doc_url
|
134 |
self.save_indexer()
|
@@ -141,16 +143,47 @@ class TsgDocFinder:
|
|
141 |
zip_files = self.get_docs_from_url(zip_url)
|
142 |
|
143 |
for file in zip_files:
|
144 |
-
if
|
145 |
doc_url = f"{zip_url}/{file}"
|
146 |
self.indexer[original_id] = doc_url
|
147 |
self.save_indexer()
|
148 |
return doc_url
|
149 |
|
150 |
-
return f"Document {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
|
152 |
-
# Create a global instance of the finder
|
153 |
-
finder = TsgDocFinder()
|
154 |
|
155 |
@app.get("/")
|
156 |
async def main_menu():
|
@@ -159,12 +192,15 @@ async def main_menu():
|
|
159 |
@app.post("/find", response_model=DocResponse)
|
160 |
def find_document(request: DocRequest):
|
161 |
start_time = time.time()
|
|
|
|
|
162 |
|
163 |
-
result = finder.search_document(request.
|
164 |
-
|
|
|
165 |
if "not found" not in result and "Could not" not in result:
|
166 |
return DocResponse(
|
167 |
-
|
168 |
url=result,
|
169 |
search_time=time.time() - start_time
|
170 |
)
|
@@ -178,7 +214,8 @@ def find_documents_batch(request: BatchDocRequest):
|
|
178 |
results = {}
|
179 |
missing = []
|
180 |
|
181 |
-
for doc_id in request.
|
|
|
182 |
result = finder.search_document(doc_id)
|
183 |
if "not found" not in result and "Could not" not in result:
|
184 |
results[doc_id] = result
|
@@ -193,4 +230,5 @@ def find_documents_batch(request: BatchDocRequest):
|
|
193 |
|
194 |
@app.get("/indexed", response_model=List[str])
|
195 |
def get_indexed_documents():
|
|
|
196 |
return list(finder.indexer.keys())
|
|
|
32 |
)
|
33 |
|
34 |
class DocRequest(BaseModel):
    # Request body for a single-document lookup.

    # Identifier of the document or specification to look up.
    doc_id: str

    # Release number to look the document up for; None when the caller
    # does not specify one.
    release: Optional[int] = None
|
37 |
|
38 |
class DocResponse(BaseModel):
    # Response body for a successful single-document lookup.

    # Identifier that was requested, echoed back to the client.
    doc_id: str

    # URL at which the document was found.
    url: str

    # Elapsed lookup time, measured with time.time() by the handler.
    search_time: float
|
42 |
|
43 |
class BatchDocRequest(BaseModel):
    # Request body for a batch lookup.

    # Identifiers of the documents or specifications to look up.
    doc_ids: List[str]

    # Release number sent with the batch; None when the caller does not
    # specify one.
    release: Optional[int] = None
|
46 |
|
47 |
class BatchDocResponse(BaseModel):
|
48 |
results: Dict[str, str]
|
|
|
67 |
with open(self.indexer_file, "w", encoding="utf-8") as f:
|
68 |
json.dump(self.indexer, f, indent=4, ensure_ascii=False)
|
69 |
|
70 |
+
def get_workgroup(self, doc):
    """Derive the TSG and working group that a document ID belongs to.

    The first character selects the TSG ("C" -> "tsg_ct", "S" -> "tsg_sa").
    The second character selects the working group: a decimal digit N gives
    "WG<N>", anything else gives the upper-cased TSG name.

    Args:
        doc: Document ID string.

    Returns:
        A ``(main_tsg, workgroup, doc)`` tuple, or ``(None, None, None)``
        when the ID is not a string, is shorter than two characters, or
        does not start with a known TSG letter.
    """
    # Both the prefix and the working-group character are required; bail out
    # before indexing so malformed IDs are reported as unparseable by the
    # caller instead of raising IndexError.
    if not isinstance(doc, str) or len(doc) < 2:
        return None, None, None

    tsg_by_prefix = {"C": "tsg_ct", "S": "tsg_sa"}
    main_tsg = tsg_by_prefix.get(doc[0])
    if main_tsg is None:
        return None, None, None

    wg_char = doc[1]
    # isdecimal() (unlike isnumeric()) only accepts characters that int()
    # can actually convert, so e.g. a superscript digit cannot raise here.
    if wg_char.isdecimal():
        workgroup = f"WG{int(wg_char)}"
    else:
        workgroup = main_tsg.upper()
    return main_tsg, workgroup, doc
|
76 |
|
77 |
def find_workgroup_url(self, main_tsg, workgroup):
|
78 |
"""Find the URL for the specific workgroup"""
|
|
|
96 |
print(f"Error accessing {url}: {e}")
|
97 |
return []
|
98 |
|
99 |
+
def search_document(self, doc_id, release = None):
|
100 |
"""Search for a specific document by its ID"""
|
101 |
+
original_id = doc_id
|
102 |
|
103 |
# Check if already indexed
|
104 |
if original_id in self.indexer:
|
105 |
return self.indexer[original_id]
|
106 |
|
107 |
# Parse the document ID
|
108 |
+
main_tsg, workgroup, doc = self.get_workgroup(doc_id)
|
109 |
if not main_tsg:
|
110 |
+
return f"Could not parse document ID: {doc_id}"
|
111 |
|
112 |
+
print(f"Searching for {original_id} (parsed as {doc}) in {main_tsg}/{workgroup}...")
|
113 |
|
114 |
# Find the workgroup URL
|
115 |
wg_url = self.find_workgroup_url(main_tsg, workgroup)
|
116 |
if not wg_url:
|
117 |
+
return f"Could not find workgroup for {doc_id}"
|
118 |
|
119 |
# Search in the workgroup directories
|
120 |
meeting_folders = self.get_docs_from_url(wg_url)
|
|
|
130 |
|
131 |
# Check for the document in the main Docs folder
|
132 |
for file in files:
|
133 |
+
if doc in file.lower() or original_id in file:
|
134 |
doc_url = f"{docs_url}/{file}"
|
135 |
self.indexer[original_id] = doc_url
|
136 |
self.save_indexer()
|
|
|
143 |
zip_files = self.get_docs_from_url(zip_url)
|
144 |
|
145 |
for file in zip_files:
|
146 |
+
if doc in file.lower() or original_id in file:
|
147 |
doc_url = f"{zip_url}/{file}"
|
148 |
self.indexer[original_id] = doc_url
|
149 |
self.save_indexer()
|
150 |
return doc_url
|
151 |
|
152 |
+
return f"Document {doc_id} not found"
|
153 |
+
|
154 |
+
|
155 |
+
class SpecDocFinder:
    """Find the archive URL of a specification in the 3GPP FTP listing.

    Archive entries are matched by their link text, which is expected to
    look like ``<spec number without dots>-<version characters>.zip``; the
    first version character encodes the release as a position in
    ``self.chars``.
    """

    def __init__(self):
        # Alphabet used to encode numbers as single characters in archive
        # file names: a character's index is the number it stands for.
        self.chars = "0123456789abcdefghijklmnopqrstuvwxyz"

    def _series_url(self, doc_id):
        """Return the archive listing URL for *doc_id* (e.g. "23.501")."""
        # The series is the part before the first dot, left-padded to two
        # characters ("3.01" lives under "03_series").
        series = doc_id.split(".")[0].rjust(2, "0")
        return f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}"

    def _release_marker(self, doc_id, release):
        """Return the link-text prefix identifying *release* of *doc_id*.

        Returns None when *release* is not an integer that ``self.chars``
        can encode (negative, too large, or not a number).
        """
        try:
            release_index = int(release)
        except (TypeError, ValueError):
            return None
        if not 0 <= release_index < len(self.chars):
            return None
        return f"{doc_id.replace('.', '')}-{self.chars[release_index]}"

    def search_document(self, doc_id, release=None):
        """Search for a specification archive by its number.

        Args:
            doc_id: Specification number, e.g. "23.501".
            release: Release to restrict the search to. When None, the last
                entry of the listing is used.

        Returns:
            The ``href`` of the last matching listing entry, or the string
            ``"Specification <doc_id> not found"`` when nothing matches,
            the release cannot be encoded, or the listing cannot be fetched.
        """
        not_found = f"Specification {doc_id} not found"

        marker = None
        if release is not None:
            marker = self._release_marker(doc_id, release)
            if marker is None:
                return not_found

        url = self._series_url(doc_id)
        try:
            # NOTE(review): certificate verification is disabled, as in the
            # original code — confirm whether this is still required.
            # The timeout keeps a stalled server from hanging the request.
            response = requests.get(url, verify=False, timeout=30)
        except requests.RequestException as e:
            print(f"Error accessing {url}: {e}")
            return not_found

        soup = BeautifulSoup(response.text, 'html.parser')
        # The first table row is skipped (treated as the header row); rows
        # without a link cannot be a downloadable entry.
        links = [row.find("a") for row in soup.find_all("tr")[1:]]
        links = [link for link in links if link is not None]
        if marker is not None:
            links = [link for link in links if marker in link.get_text()]

        if not links:
            return not_found

        # The last entry wins, matching the original choice of the final
        # row / final match in the listing.
        return links[-1].get("href") or not_found
|
186 |
|
|
|
|
|
187 |
|
188 |
@app.get("/")
|
189 |
async def main_menu():
|
|
|
192 |
@app.post("/find", response_model=DocResponse)
|
193 |
def find_document(request: DocRequest):
|
194 |
start_time = time.time()
|
195 |
+
finder = TsgDocFinder() if request.doc_id[0].isalpha() else SpecDocFinder()
|
196 |
+
print(finder)
|
197 |
|
198 |
+
result = finder.search_document(request.doc_id, request.release)
|
199 |
+
print(result)
|
200 |
+
|
201 |
if "not found" not in result and "Could not" not in result:
|
202 |
return DocResponse(
|
203 |
+
doc_id=request.doc_id,
|
204 |
url=result,
|
205 |
search_time=time.time() - start_time
|
206 |
)
|
|
|
214 |
results = {}
|
215 |
missing = []
|
216 |
|
217 |
+
for doc_id in request.doc_ids:
|
218 |
+
# Pick the finder from the ID currently being processed: the batch request
# model only has `doc_ids`, so `request.doc_id` would raise AttributeError.
# Slicing keeps an empty ID from raising IndexError.
finder = TsgDocFinder() if doc_id[:1].isalpha() else SpecDocFinder()
# Forward the batch-wide release so specification lookups receive it.
result = finder.search_document(doc_id, request.release)
|
220 |
if "not found" not in result and "Could not" not in result:
|
221 |
results[doc_id] = result
|
|
|
230 |
|
231 |
@app.get("/indexed", response_model=List[str])
|
232 |
def get_indexed_documents():
|
233 |
+
finder = TsgDocFinder()
|
234 |
return list(finder.indexer.keys())
|
static/script.js
CHANGED
@@ -48,7 +48,7 @@ searchBtn.addEventListener('click', async () => {
|
|
48 |
headers: {
|
49 |
'Content-Type': 'application/json'
|
50 |
},
|
51 |
-
body: JSON.stringify({
|
52 |
});
|
53 |
|
54 |
const data = await response.json();
|
@@ -93,7 +93,7 @@ batchSearchBtn.addEventListener('click', async () => {
|
|
93 |
headers: {
|
94 |
'Content-Type': 'application/json'
|
95 |
},
|
96 |
-
body: JSON.stringify({
|
97 |
});
|
98 |
|
99 |
const data = await response.json();
|
@@ -120,7 +120,7 @@ function displaySingleResult(data) {
|
|
120 |
resultItem.className = 'result-item';
|
121 |
resultItem.innerHTML = `
|
122 |
<div class="result-header">
|
123 |
-
<div class="result-id">${data.
|
124 |
<div class="result-status status-found">Found</div>
|
125 |
</div>
|
126 |
<div class="result-url">
|
|
|
48 |
headers: {
|
49 |
'Content-Type': 'application/json'
|
50 |
},
|
51 |
+
body: JSON.stringify({ doc_id: docId, release: null })
|
52 |
});
|
53 |
|
54 |
const data = await response.json();
|
|
|
93 |
headers: {
|
94 |
'Content-Type': 'application/json'
|
95 |
},
|
96 |
+
body: JSON.stringify({ doc_ids: docIds })
|
97 |
});
|
98 |
|
99 |
const data = await response.json();
|
|
|
120 |
resultItem.className = 'result-item';
|
121 |
resultItem.innerHTML = `
|
122 |
<div class="result-header">
|
123 |
+
<div class="result-id">${data.doc_id}</div>
|
124 |
<div class="result-status status-found">Found</div>
|
125 |
</div>
|
126 |
<div class="result-url">
|