Spaces:

OrganizedProgrammers
/

3GPPindexers

Running

om4r932 committed 1 day ago

Commit

6bceb52

1 Parent(s): db30fd8

Update regex

Files changed (1) hide show

spec_doc_indexer_multi.py CHANGED Viewed

@@ -35,6 +35,8 @@ documents_by_spec_num = {}
 processed_count = 0
 total_count = 0
 def get_text(specification: str, version: str):
     """Récupère les bytes du PDF à partir d'une spécification et d'une version."""
     doc_id = specification
@@ -132,7 +134,7 @@ def get_spec_content(specification: str, version: str):
     chapters = []
     for line in toc_brut:
         x = line.split("\t")
-        m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
         if m and any(line in c for c in text[forewords[0]:forewords[1]]):
             chapters.append(line)
             print(line)

 processed_count = 0
 total_count = 0
+regex = r"^(\d+[a-z]?(?:\.\d+)*)\t[\ \S]+$"
 def get_text(specification: str, version: str):
     """Récupère les bytes du PDF à partir d'une spécification et d'une version."""
     doc_id = specification
     chapters = []
     for line in toc_brut:
         x = line.split("\t")
+        m = re.search(regex, line)
         if m and any(line in c for c in text[forewords[0]:forewords[1]]):
             chapters.append(line)
             print(line)