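Update regex: define the chapter-line pattern as a module-level `regex` constant and pass it to `re.search` in `get_spec_content`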
Browse files
spec_doc_indexer_multi.py
CHANGED
@@ -35,6 +35,8 @@ documents_by_spec_num = {}
|
|
35 |
processed_count = 0
|
36 |
total_count = 0
|
37 |
|
|
|
|
|
38 |
def get_text(specification: str, version: str):
|
39 |
"""Récupère les bytes du PDF à partir d'une spécification et d'une version."""
|
40 |
doc_id = specification
|
@@ -132,7 +134,7 @@ def get_spec_content(specification: str, version: str):
|
|
132 |
chapters = []
|
133 |
for line in toc_brut:
|
134 |
x = line.split("\t")
|
135 |
-
m = re.search(
|
136 |
if m and any(line in c for c in text[forewords[0]:forewords[1]]):
|
137 |
chapters.append(line)
|
138 |
print(line)
|
|
|
35 |
processed_count = 0
|
36 |
total_count = 0
|
37 |
|
38 |
+
regex = r"^(\d+[a-z]?(?:\.\d+)*)\t[\ \S]+$"
|
39 |
+
|
40 |
def get_text(specification: str, version: str):
|
41 |
"""Récupère les bytes du PDF à partir d'une spécification et d'une version."""
|
42 |
doc_id = specification
|
|
|
134 |
chapters = []
|
135 |
for line in toc_brut:
|
136 |
x = line.split("\t")
|
137 |
+
m = re.search(regex, line)
|
138 |
if m and any(line in c for c in text[forewords[0]:forewords[1]]):
|
139 |
chapters.append(line)
|
140 |
print(line)
|