Omar ID EL MOUMEN committed on
Commit 3ae60ae · 1 Parent(s): 5f91632

Reuse of standard IO method of indexing + update indexer

Files changed (3)
  1. .gitattributes +1 -0
  2. app.py +18 -40
  3. indexed_docs.json +3 -8
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -3,8 +3,8 @@ from bs4 import BeautifulSoup
 import json
 import os
 import time
+from datetime import datetime
 import traceback
-import psycopg2
 from dotenv import load_dotenv
 import warnings
 from fastapi import FastAPI, HTTPException
@@ -56,46 +56,23 @@ class BatchDocResponse(BaseModel):
 class TsgDocFinder:
     def __init__(self):
         self.main_ftp_url = "https://www.3gpp.org/ftp"
-        self.host = os.environ.get("PGSQL_HOST")
-        self.port = os.environ.get("PGSQL_PORT")
-        self.user = os.environ.get("PGSQL_USER")
-        self.password = os.environ.get("PGSQL_PASSWORD")
-        self.database = os.environ.get("PGSQL_DATABASE")
-        self.conn = self.connect()
-        self.indexer = self.load_indexer()
+        self.indexer_file = "indexed_docs.json"
+        self.indexer, self.last_indexer_date = self.load_indexer()
 
-    def connect(self):
-        """Établit une connexion à la base de données PostgreSQL"""
-        try:
-            self.conn = psycopg2.connect(
-                host=self.host,
-                port=self.port,
-                user=self.user,
-                password=self.password,
-                dbname=self.database
-            )
-            return self.conn
-        except Exception as e:
-            print(f"Erreur de connexion à la base de données: {e}")
-            return None
-
     def load_indexer(self):
         """Load existing index if available"""
-        if not self.conn:
-            self.conn = self.connect()
-            if self.conn is None:
-                raise HTTPException(status_code=500, detail="Connexion à la base de donnée impossible")
-
-        cursor = self.conn.cursor()
-        try:
-            cursor.execute("SELECT doc_id, url FROM document_position")
-            rows = cursor.fetchall()
-            doc = {doc_id: url for doc_id, url in rows}
-        except:
-            raise HTTPException(status_code=500, detail="Erreur lors de la récupération")
-        finally:
-            cursor.close()
-        return dict(sorted(doc.items()))
+        if os.path.exists(self.indexer_file):
+            with open(self.indexer_file, "r", encoding="utf-8") as f:
+                x = json.load(f)
+            return x["docs"], x["last_indexed_date"]
+        return {}, None
+
+    def save_indexer(self):
+        """Save the updated index"""
+        with open(self.indexer_file, "w", encoding="utf-8") as f:
+            today = datetime.today()
+            output = {"docs": self.indexer, "last_indexed_date": today.strftime("%d/%m/%Y-%H:%M:%S")}
+            json.dump(output, f, indent=4, ensure_ascii=False)
 
     def get_workgroup(self, doc):
         main_tsg = "tsg_ct" if doc[0] == "C" else "tsg_sa" if doc[0] == "S" else None
@@ -179,6 +156,7 @@ class TsgDocFinder:
             if doc in file.lower() or original_id in file:
                 doc_url = f"{zip_url}/{file}"
                 self.indexer[original_id] = doc_url
+                self.save_indexer()
                 return doc_url
 
         return f"Document {doc_id} not found"
@@ -253,9 +231,9 @@ def find_documents_batch(request: BatchDocRequest):
     missing = []
 
     for doc_id in request.doc_ids:
-        finder = TsgDocFinder() if doc_id[0].isalpha() else SpecDocFinder()
+        finder = finder_tsg if request.doc_id[0].isalpha() else finder_spec
         result = finder.search_document(doc_id)
-        if "not found" not in result and "Could not" not in result:
+        if "not found" not in result and "Could not" not in result and "Unable" not in result:
             results[doc_id] = result
         else:
             missing.append(doc_id)
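
The app.py change above swaps the PostgreSQL-backed index for plain JSON file IO. As a rough, self-contained sketch of that round trip (not repo code: load_index/save_index stand in for the TsgDocFinder methods, and the example entry is taken from the old indexed_docs.json), assuming the file layout written by save_indexer():

# Minimal sketch of the JSON-based indexer IO introduced above (illustrative only).
import json
import os
from datetime import datetime

INDEXER_FILE = "indexed_docs.json"

def load_index():
    """Return (docs, last_indexed_date), or ({}, None) if the file is missing."""
    if os.path.exists(INDEXER_FILE):
        with open(INDEXER_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data["docs"], data["last_indexed_date"]
    return {}, None

def save_index(docs):
    """Write the mapping plus a 'dd/mm/YYYY-HH:MM:SS' timestamp, as save_indexer() does."""
    output = {"docs": docs, "last_indexed_date": datetime.today().strftime("%d/%m/%Y-%H:%M:%S")}
    with open(INDEXER_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=4, ensure_ascii=False)

# Example round trip using an entry from the old index file:
docs, last_date = load_index()
docs["SP-000182"] = "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000182.zip"
save_index(docs)

Note that search_document() now calls save_indexer() whenever it caches a new URL, so each newly resolved document rewrites the whole JSON file.
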
indexed_docs.json CHANGED
@@ -1,8 +1,3 @@
-{
-    "S4-110084": "https://www.3gpp.org/ftp/tsg_sa/WG4_CODEC/TSGS4_62/Docs/S4-110084.zip",
-    "SP-000182": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000182.zip",
-    "SP-000183": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000183.zip",
-    "SP-000184": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000184.zip",
-    "SP-000185": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_08/Docs/ZIP/SP-000185.zip",
-    "SP-090017": "https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_43/Docs/SP-090017.zip"
-}
+version https://git-lfs.github.com/spec/v1
+oid sha256:27de1ffcbc0301e7eab274481928eff7dcf327b2943cbd47b37b531785c433b1
+size 68386245
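
With the new *.json rule in .gitattributes, indexed_docs.json is stored through Git LFS, so what Git tracks is the 3-line pointer above while the real index weighs in around 68 MB. A checkout made without git-lfs leaves that pointer on disk, and json.load() inside load_indexer() would then fail on it. The guard below is a hypothetical addition, not part of this commit, sketching how a caller could detect that situation:

# Hypothetical helper (not in the repo): detect a Git LFS pointer file before json.load().
def is_lfs_pointer(path="indexed_docs.json"):
    try:
        with open(path, "r", encoding="utf-8") as f:
            first_line = f.readline()
        return first_line.startswith("version https://git-lfs.github.com/spec/v1")
    except (OSError, UnicodeDecodeError):
        return False

if is_lfs_pointer():
    raise RuntimeError("indexed_docs.json is an LFS pointer; run 'git lfs pull' to fetch the real index")
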