om4r932 committed on
Commit
6bceb52
·
1 Parent(s): db30fd8

Update regex

Browse files
Files changed (1) hide show
  1. spec_doc_indexer_multi.py +3 -1
spec_doc_indexer_multi.py CHANGED
@@ -35,6 +35,8 @@ documents_by_spec_num = {}
35
  processed_count = 0
36
  total_count = 0
37
 
 
 
38
  def get_text(specification: str, version: str):
39
  """Récupère les bytes du PDF à partir d'une spécification et d'une version."""
40
  doc_id = specification
@@ -132,7 +134,7 @@ def get_spec_content(specification: str, version: str):
132
  chapters = []
133
  for line in toc_brut:
134
  x = line.split("\t")
135
- m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
136
  if m and any(line in c for c in text[forewords[0]:forewords[1]]):
137
  chapters.append(line)
138
  print(line)
 
35
  processed_count = 0
36
  total_count = 0
37
 
38
+ regex = r"^(\d+[a-z]?(?:\.\d+)*)\t[\ \S]+$"
39
+
40
  def get_text(specification: str, version: str):
41
  """Récupère les bytes du PDF à partir d'une spécification et d'une version."""
42
  doc_id = specification
 
134
  chapters = []
135
  for line in toc_brut:
136
  x = line.split("\t")
137
+ m = re.search(regex, line)
138
  if m and any(line in c for c in text[forewords[0]:forewords[1]]):
139
  chapters.append(line)
140
  print(line)