Omar ID EL MOUMEN committed on
Commit
acaccf0
·
1 Parent(s): 8840360

Add specification search

Browse files
Files changed (2) hide show
  1. app.py +60 -22
  2. static/script.js +3 -3
app.py CHANGED
@@ -32,15 +32,17 @@ app.add_middleware(
32
  )
33
 
34
  class DocRequest(BaseModel):
35
- tsg_doc_id: str
 
36
 
37
  class DocResponse(BaseModel):
38
- tsg_doc_id: str
39
  url: str
40
  search_time: float
41
 
42
  class BatchDocRequest(BaseModel):
43
- tsg_doc_ids: List[str]
 
44
 
45
  class BatchDocResponse(BaseModel):
46
  results: Dict[str, str]
@@ -65,12 +67,12 @@ class TsgDocFinder:
65
  with open(self.indexer_file, "w", encoding="utf-8") as f:
66
  json.dump(self.indexer, f, indent=4, ensure_ascii=False)
67
 
68
- def get_workgroup(self, tsg_doc):
69
- main_tsg = "tsg_ct" if tsg_doc[0] == "C" else "tsg_sa" if tsg_doc[0] == "S" else None
70
  if main_tsg is None:
71
  return None, None, None
72
- workgroup = f"WG{int(tsg_doc[1])}" if tsg_doc[1].isnumeric() else main_tsg.upper()
73
- return main_tsg, workgroup, tsg_doc
74
 
75
  def find_workgroup_url(self, main_tsg, workgroup):
76
  """Find the URL for the specific workgroup"""
@@ -94,25 +96,25 @@ class TsgDocFinder:
94
  print(f"Error accessing {url}: {e}")
95
  return []
96
 
97
- def search_document(self, tsg_doc_id):
98
  """Search for a specific document by its ID"""
99
- original_id = tsg_doc_id
100
 
101
  # Check if already indexed
102
  if original_id in self.indexer:
103
  return self.indexer[original_id]
104
 
105
  # Parse the document ID
106
- main_tsg, workgroup, tsg_doc = self.get_workgroup(tsg_doc_id)
107
  if not main_tsg:
108
- return f"Could not parse document ID: {tsg_doc_id}"
109
 
110
- print(f"Searching for {original_id} (parsed as {tsg_doc}) in {main_tsg}/{workgroup}...")
111
 
112
  # Find the workgroup URL
113
  wg_url = self.find_workgroup_url(main_tsg, workgroup)
114
  if not wg_url:
115
- return f"Could not find workgroup for {tsg_doc_id}"
116
 
117
  # Search in the workgroup directories
118
  meeting_folders = self.get_docs_from_url(wg_url)
@@ -128,7 +130,7 @@ class TsgDocFinder:
128
 
129
  # Check for the document in the main Docs folder
130
  for file in files:
131
- if tsg_doc in file.lower() or original_id in file:
132
  doc_url = f"{docs_url}/{file}"
133
  self.indexer[original_id] = doc_url
134
  self.save_indexer()
@@ -141,16 +143,47 @@ class TsgDocFinder:
141
  zip_files = self.get_docs_from_url(zip_url)
142
 
143
  for file in zip_files:
144
- if tsg_doc in file.lower() or original_id in file:
145
  doc_url = f"{zip_url}/{file}"
146
  self.indexer[original_id] = doc_url
147
  self.save_indexer()
148
  return doc_url
149
 
150
- return f"Document {tsg_doc_id} not found"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- # Create a global instance of the finder
153
- finder = TsgDocFinder()
154
 
155
  @app.get("/")
156
  async def main_menu():
@@ -159,12 +192,15 @@ async def main_menu():
159
  @app.post("/find", response_model=DocResponse)
160
  def find_document(request: DocRequest):
161
  start_time = time.time()
 
 
162
 
163
- result = finder.search_document(request.tsg_doc_id)
164
-
 
165
  if "not found" not in result and "Could not" not in result:
166
  return DocResponse(
167
- tsg_doc_id=request.tsg_doc_id,
168
  url=result,
169
  search_time=time.time() - start_time
170
  )
@@ -178,7 +214,8 @@ def find_documents_batch(request: BatchDocRequest):
178
  results = {}
179
  missing = []
180
 
181
- for doc_id in request.tsg_doc_ids:
 
182
  result = finder.search_document(doc_id)
183
  if "not found" not in result and "Could not" not in result:
184
  results[doc_id] = result
@@ -193,4 +230,5 @@ def find_documents_batch(request: BatchDocRequest):
193
 
194
  @app.get("/indexed", response_model=List[str])
195
  def get_indexed_documents():
 
196
  return list(finder.indexer.keys())
 
32
  )
33
 
34
  class DocRequest(BaseModel):
35
+ doc_id: str
36
+ release: Optional[int] = None
37
 
38
  class DocResponse(BaseModel):
39
+ doc_id: str
40
  url: str
41
  search_time: float
42
 
43
  class BatchDocRequest(BaseModel):
44
+ doc_ids: List[str]
45
+ release: Optional[int] = None
46
 
47
  class BatchDocResponse(BaseModel):
48
  results: Dict[str, str]
 
67
  with open(self.indexer_file, "w", encoding="utf-8") as f:
68
  json.dump(self.indexer, f, indent=4, ensure_ascii=False)
69
 
70
def get_workgroup(self, doc):
    """Derive the TSG folder and workgroup name from a document ID.

    The first character selects the TSG ("C" -> "tsg_ct", "S" -> "tsg_sa").
    The second character selects the workgroup: a decimal digit N gives
    "WG<N>", anything else gives the upper-cased TSG name.

    Args:
        doc: Document ID string.

    Returns:
        A ``(main_tsg, workgroup, doc)`` tuple, or ``(None, None, None)``
        when the ID is too short to parse or its first character is not a
        recognised TSG letter.
    """
    # Both the first and the second character are inspected below, so an
    # empty or one-character ID cannot be parsed (previously an IndexError).
    if not doc or len(doc) < 2:
        return None, None, None

    main_tsg = {"C": "tsg_ct", "S": "tsg_sa"}.get(doc[0])
    if main_tsg is None:
        return None, None, None

    # isdecimal() (not isnumeric()) guarantees that int() will accept the
    # character; isnumeric() is also true for characters such as
    # superscript digits and fractions, on which int() raises ValueError.
    if doc[1].isdecimal():
        workgroup = f"WG{int(doc[1])}"
    else:
        workgroup = main_tsg.upper()
    return main_tsg, workgroup, doc
76
 
77
  def find_workgroup_url(self, main_tsg, workgroup):
78
  """Find the URL for the specific workgroup"""
 
96
  print(f"Error accessing {url}: {e}")
97
  return []
98
 
99
+ def search_document(self, doc_id, release = None):
100
  """Search for a specific document by its ID"""
101
+ original_id = doc_id
102
 
103
  # Check if already indexed
104
  if original_id in self.indexer:
105
  return self.indexer[original_id]
106
 
107
  # Parse the document ID
108
+ main_tsg, workgroup, doc = self.get_workgroup(doc_id)
109
  if not main_tsg:
110
+ return f"Could not parse document ID: {doc_id}"
111
 
112
+ print(f"Searching for {original_id} (parsed as {doc}) in {main_tsg}/{workgroup}...")
113
 
114
  # Find the workgroup URL
115
  wg_url = self.find_workgroup_url(main_tsg, workgroup)
116
  if not wg_url:
117
+ return f"Could not find workgroup for {doc_id}"
118
 
119
  # Search in the workgroup directories
120
  meeting_folders = self.get_docs_from_url(wg_url)
 
130
 
131
  # Check for the document in the main Docs folder
132
  for file in files:
133
+ if doc in file.lower() or original_id in file:
134
  doc_url = f"{docs_url}/{file}"
135
  self.indexer[original_id] = doc_url
136
  self.save_indexer()
 
143
  zip_files = self.get_docs_from_url(zip_url)
144
 
145
  for file in zip_files:
146
+ if doc in file.lower() or original_id in file:
147
  doc_url = f"{zip_url}/{file}"
148
  self.indexer[original_id] = doc_url
149
  self.save_indexer()
150
  return doc_url
151
 
152
+ return f"Document {doc_id} not found"
153
+
154
+
155
class SpecDocFinder:
    """Locate a 3GPP specification archive on the 3GPP FTP listing page.

    The listing for a specification is fetched from
    ``https://www.3gpp.org/ftp/Specs/archive/<series>_series/<doc_id>`` and
    the link of one table row is returned.
    """

    # Seconds to wait for the listing page before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # Alphabet used to encode a release number as a single character in
        # the listed file names (index 10 -> "a", 15 -> "f", ...).
        self.chars = "0123456789abcdefghijklmnopqrstuvwxyz"

    def search_document(self, doc_id, release=None):
        """Return the URL of a specification, or a "not found" message.

        Args:
            doc_id: Specification number such as ``"38.331"``; the part
                before the first dot is the series.
            release: Optional release number. When ``None`` the link of the
                last listed row is returned; otherwise the last row whose
                link text contains ``<doc_id without dots>-<release char>``.

        Returns:
            The ``href`` of the selected row, or the string
            ``"Specification <doc_id> not found"`` when the page cannot be
            fetched, lists nothing, or has no row for the release.
        """
        not_found = f"Specification {doc_id} not found"

        # Series folders are at least two characters wide ("5" -> "05").
        series = doc_id.split(".")[0].rjust(2, "0")
        url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}"

        try:
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept as in the original, confirm that this is intentional.
            response = requests.get(url, verify=False, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"Error accessing {url}: {e}")
            return not_found

        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        # The first table row is skipped (presumably a header row).
        for row in soup.find_all("tr")[1:]:
            anchor = row.find("a")
            if anchor is not None and anchor.get("href"):
                links.append((anchor.get_text(), anchor.get("href")))

        spec_url = self._select_link(links, doc_id, release)
        return spec_url if spec_url is not None else not_found

    def _select_link(self, links, doc_id, release):
        """Pick the href to return from ``(text, href)`` pairs, or ``None``."""
        if not links:
            return None
        if release is None:
            # No release requested: take the last listed entry.
            return links[-1][1]

        try:
            index = int(release)
        except (TypeError, ValueError):
            return None
        # Reject values outside the alphabet instead of wrapping around
        # (negative index) or raising IndexError.
        if not 0 <= index < len(self.chars):
            return None

        wanted = f"{doc_id.replace('.', '')}-{self.chars[index]}"
        matches = [href for text, href in links if wanted in text]
        # Several versions can belong to one release; keep the last listed.
        return matches[-1] if matches else None
186
 
 
 
187
 
188
  @app.get("/")
189
  async def main_menu():
 
192
  @app.post("/find", response_model=DocResponse)
193
  def find_document(request: DocRequest):
194
  start_time = time.time()
195
+ finder = TsgDocFinder() if request.doc_id[0].isalpha() else SpecDocFinder()
196
+ print(finder)
197
 
198
+ result = finder.search_document(request.doc_id, request.release)
199
+ print(result)
200
+
201
  if "not found" not in result and "Could not" not in result:
202
  return DocResponse(
203
+ doc_id=request.doc_id,
204
  url=result,
205
  search_time=time.time() - start_time
206
  )
 
214
  results = {}
215
  missing = []
216
 
217
+ for doc_id in request.doc_ids:
218
+ finder = TsgDocFinder() if request.doc_id[0].isalpha() else SpecDocFinder()
219
  result = finder.search_document(doc_id)
220
  if "not found" not in result and "Could not" not in result:
221
  results[doc_id] = result
 
230
 
231
  @app.get("/indexed", response_model=List[str])
232
  def get_indexed_documents():
233
+ finder = TsgDocFinder()
234
  return list(finder.indexer.keys())
static/script.js CHANGED
@@ -48,7 +48,7 @@ searchBtn.addEventListener('click', async () => {
48
  headers: {
49
  'Content-Type': 'application/json'
50
  },
51
- body: JSON.stringify({ tsg_doc_id: docId })
52
  });
53
 
54
  const data = await response.json();
@@ -93,7 +93,7 @@ batchSearchBtn.addEventListener('click', async () => {
93
  headers: {
94
  'Content-Type': 'application/json'
95
  },
96
- body: JSON.stringify({ tsg_doc_ids: docIds })
97
  });
98
 
99
  const data = await response.json();
@@ -120,7 +120,7 @@ function displaySingleResult(data) {
120
  resultItem.className = 'result-item';
121
  resultItem.innerHTML = `
122
  <div class="result-header">
123
- <div class="result-id">${data.tsg_doc_id}</div>
124
  <div class="result-status status-found">Found</div>
125
  </div>
126
  <div class="result-url">
 
48
  headers: {
49
  'Content-Type': 'application/json'
50
  },
51
+ body: JSON.stringify({ doc_id: docId, release: null })
52
  });
53
 
54
  const data = await response.json();
 
93
  headers: {
94
  'Content-Type': 'application/json'
95
  },
96
+ body: JSON.stringify({ doc_ids: docIds })
97
  });
98
 
99
  const data = await response.json();
 
120
  resultItem.className = 'result-item';
121
  resultItem.innerHTML = `
122
  <div class="result-header">
123
+ <div class="result-id">${data.doc_id}</div>
124
  <div class="result-status status-found">Found</div>
125
  </div>
126
  <div class="result-url">