om4r932 committed on
Commit
db30fd8
·
1 Parent(s): f2f17e7

Update chapter extraction method

Browse files
Files changed (1) hide show
  1. spec_doc_indexer_multi.py +12 -27
spec_doc_indexer_multi.py CHANGED
@@ -7,6 +7,7 @@ import requests
7
  import zipfile
8
  import uuid
9
  import os
 
10
  import re
11
  import subprocess
12
  import concurrent.futures
@@ -48,13 +49,13 @@ def get_text(specification: str, version: str):
48
  if response.status_code != 200:
49
  raise Exception(f"Téléchargement du ZIP échoué pour {specification}-{version}")
50
 
51
- zip_bytes = BytesIO(response.content)
52
 
53
  with zipfile.ZipFile(zip_bytes) as zf:
54
  for file_name in zf.namelist():
55
  if file_name.endswith("zip"):
56
  print("Another ZIP !")
57
- zip_bytes = BytesIO(zf.read(file_name))
58
  zf = zipfile.ZipFile(zip_bytes)
59
  for file_name2 in zf.namelist():
60
  if file_name2.endswith("doc") or file_name2.endswith("docx"):
@@ -127,37 +128,20 @@ def get_spec_content(specification: str, version: str):
127
  if len(forewords) >= 2:
128
  break
129
 
130
- toc_brut = text[forewords[0]:forewords[1]]
131
  chapters = []
132
  for line in toc_brut:
133
  x = line.split("\t")
134
- if re.search(r"^\d+\t[\ \S]+", line):
135
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
136
- if re.search(r"^\d+\.\d+\t[\ \S]+", line):
137
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
138
- if re.search(r"^\d+\.\d+\.\d+\t[\ \S]+", line):
139
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
140
- if re.search(r"^\d+\.\d+\.\d+.\d+\t[\ \S]+", line):
141
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
142
- if re.search(r"^\d+\.\d+\.\d+.\d+.\d+\t[\ \S]+", line):
143
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
144
 
145
  real_toc_indexes = {}
146
 
147
  for chapter in chapters:
148
- try:
149
- x = text.index(chapter)
150
- real_toc_indexes[chapter] = x
151
- except ValueError as e:
152
- try:
153
- number = chapter.split("\t")[0] + "\t"
154
- for line in text[forewords[1]:]:
155
- if number in line:
156
- x = text.index(line)
157
- real_toc_indexes[line] = x
158
- break
159
- except:
160
- real_toc_indexes[chapter] = -float("inf")
161
 
162
  document = {}
163
  toc = list(real_toc_indexes.keys())
@@ -167,7 +151,8 @@ def get_spec_content(specification: str, version: str):
167
  document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
168
  curr_index = x
169
 
170
- document[toc[curr_index].replace("\t"," ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
 
171
  return document
172
 
173
  def process_specification(spec: Dict[str, Any], columns: List[str]) -> None:
 
7
  import zipfile
8
  import uuid
9
  import os
10
+ import io
11
  import re
12
  import subprocess
13
  import concurrent.futures
 
49
  if response.status_code != 200:
50
  raise Exception(f"Téléchargement du ZIP échoué pour {specification}-{version}")
51
 
52
+ zip_bytes = io.BytesIO(response.content)
53
 
54
  with zipfile.ZipFile(zip_bytes) as zf:
55
  for file_name in zf.namelist():
56
  if file_name.endswith("zip"):
57
  print("Another ZIP !")
58
+ zip_bytes = io.BytesIO(zf.read(file_name))
59
  zf = zipfile.ZipFile(zip_bytes)
60
  for file_name2 in zf.namelist():
61
  if file_name2.endswith("doc") or file_name2.endswith("docx"):
 
128
  if len(forewords) >= 2:
129
  break
130
 
131
+ toc_brut = text[forewords[1]:]
132
  chapters = []
133
  for line in toc_brut:
134
  x = line.split("\t")
135
+ m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
136
+ if m and any(line in c for c in text[forewords[0]:forewords[1]]):
137
+ chapters.append(line)
138
+ print(line)
 
 
 
 
 
 
139
 
140
  real_toc_indexes = {}
141
 
142
  for chapter in chapters:
143
+ x = text.index(chapter)
144
+ real_toc_indexes[chapter] = x
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  document = {}
147
  toc = list(real_toc_indexes.keys())
 
151
  document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
152
  curr_index = x
153
 
154
+ document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
155
+ print(len(toc)-1, toc[curr_index], curr_index)
156
  return document
157
 
158
  def process_specification(spec: Dict[str, Any], columns: List[str]) -> None: