Omar ID EL MOUMEN
committed on
Commit
·
3ae60ae
1
Parent(s):
5f91632
Reuse of standard IO method of indexing + update indexer
Browse files
- .gitattributes +1 -0
- app.py +18 -40
- indexed_docs.json +3 -8
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -3,8 +3,8 @@ from bs4 import BeautifulSoup
|
|
3 |
import json
|
4 |
import os
|
5 |
import time
|
|
|
6 |
import traceback
|
7 |
-
import psycopg2
|
8 |
from dotenv import load_dotenv
|
9 |
import warnings
|
10 |
from fastapi import FastAPI, HTTPException
|
@@ -56,46 +56,23 @@ class BatchDocResponse(BaseModel):
|
|
56 |
class TsgDocFinder:
|
57 |
def __init__(self):
|
58 |
self.main_ftp_url = "https://www.3gpp.org/ftp"
|
59 |
-
self.
|
60 |
-
self.
|
61 |
-
self.user = os.environ.get("PGSQL_USER")
|
62 |
-
self.password = os.environ.get("PGSQL_PASSWORD")
|
63 |
-
self.database = os.environ.get("PGSQL_DATABASE")
|
64 |
-
self.conn = self.connect()
|
65 |
-
self.indexer = self.load_indexer()
|
66 |
|
67 |
-
def connect(self):
|
68 |
-
"""Établit une connexion à la base de données PostgreSQL"""
|
69 |
-
try:
|
70 |
-
self.conn = psycopg2.connect(
|
71 |
-
host=self.host,
|
72 |
-
port=self.port,
|
73 |
-
user=self.user,
|
74 |
-
password=self.password,
|
75 |
-
dbname=self.database
|
76 |
-
)
|
77 |
-
return self.conn
|
78 |
-
except Exception as e:
|
79 |
-
print(f"Erreur de connexion à la base de données: {e}")
|
80 |
-
return None
|
81 |
-
|
82 |
def load_indexer(self):
|
83 |
"""Load existing index if available"""
|
84 |
-
if
|
85 |
-
self.
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
finally:
|
97 |
-
cursor.close()
|
98 |
-
return dict(sorted(doc.items()))
|
99 |
|
100 |
def get_workgroup(self, doc):
|
101 |
main_tsg = "tsg_ct" if doc[0] == "C" else "tsg_sa" if doc[0] == "S" else None
|
@@ -179,6 +156,7 @@ class TsgDocFinder:
|
|
179 |
if doc in file.lower() or original_id in file:
|
180 |
doc_url = f"{zip_url}/{file}"
|
181 |
self.indexer[original_id] = doc_url
|
|
|
182 |
return doc_url
|
183 |
|
184 |
return f"Document {doc_id} not found"
|
@@ -253,9 +231,9 @@ def find_documents_batch(request: BatchDocRequest):
|
|
253 |
missing = []
|
254 |
|
255 |
for doc_id in request.doc_ids:
|
256 |
-
finder =
|
257 |
result = finder.search_document(doc_id)
|
258 |
-
if "not found" not in result and "Could not" not in result:
|
259 |
results[doc_id] = result
|
260 |
else:
|
261 |
missing.append(doc_id)
|
|
|
3 |
import json
|
4 |
import os
|
5 |
import time
|
6 |
+
from datetime import datetime
|
7 |
import traceback
|
|
|
8 |
from dotenv import load_dotenv
|
9 |
import warnings
|
10 |
from fastapi import FastAPI, HTTPException
|
|
|
56 |
class TsgDocFinder:
|
57 |
def __init__(self):
|
58 |
self.main_ftp_url = "https://www.3gpp.org/ftp"
|
59 |
+
self.indexer_file = "indexed_docs.json"
|
60 |
+
self.indexer, self.last_indexer_date = self.load_indexer()
|
|
|
|
|
|
|
|
|
|
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
def load_indexer(self):
|
63 |
"""Load existing index if available"""
|
64 |
+
if os.path.exists(self.indexer_file):
|
65 |
+
with open(self.indexer_file, "r", encoding="utf-8") as f:
|
66 |
+
x = json.load(f)
|
67 |
+
return x["docs"], x["last_indexed_date"]
|
68 |
+
return {}, None
|
69 |
+
|
70 |
+
def save_indexer(self):
|
71 |
+
"""Save the updated index"""
|
72 |
+
with open(self.indexer_file, "w", encoding="utf-8") as f:
|
73 |
+
today = datetime.today()
|
74 |
+
output = {"docs": self.indexer, "last_indexed_date": today.strftime("%d/%m/%Y-%H:%M:%S")}
|
75 |
+
json.dump(output, f, indent=4, ensure_ascii=False)
|
|
|
|
|
|
|
76 |
|
77 |
def get_workgroup(self, doc):
|
78 |
main_tsg = "tsg_ct" if doc[0] == "C" else "tsg_sa" if doc[0] == "S" else None
|
|
|
156 |
if doc in file.lower() or original_id in file:
|
157 |
doc_url = f"{zip_url}/{file}"
|
158 |
self.indexer[original_id] = doc_url
|
159 |
+
self.save_indexer()
|
160 |
return doc_url
|
161 |
|
162 |
return f"Document {doc_id} not found"
|
|
|
231 |
missing = []
|
232 |
|
233 |
for doc_id in request.doc_ids:
|
234 |
+
finder = finder_tsg if request.doc_id[0].isalpha() else finder_spec
|
235 |
result = finder.search_document(doc_id)
|
236 |
+
if "not found" not in result and "Could not" not in result and "Unable" not in result:
|
237 |
results[doc_id] = result
|
238 |
else:
|
239 |
missing.append(doc_id)
|
indexed_docs.json
CHANGED
@@ -1,8 +1,3 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
"SP-000183": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000183.zip",
|
5 |
-
"SP-000184": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000184.zip",
|
6 |
-
"SP-000185": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000185.zip",
|
7 |
-
"SP-090017": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_43/Docs/SP-090017.zip"
|
8 |
-
}
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:27de1ffcbc0301e7eab274481928eff7dcf327b2943cbd47b37b531785c433b1
|
3 |
+
size 68386245
|
|
|
|
|
|
|
|
|
|