Update chapter extraction method
Browse files
app.py
CHANGED
@@ -160,37 +160,20 @@ def get_spec_content(specification: str, version: str):
|
|
160 |
if len(forewords) >= 2:
|
161 |
break
|
162 |
|
163 |
-
toc_brut = text[forewords[
|
164 |
chapters = []
|
165 |
for line in toc_brut:
|
166 |
x = line.split("\t")
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
if re.search(r"^\d+\.\d+\.\d+\t[\ \S]+", line):
|
172 |
-
chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
|
173 |
-
if re.search(r"^\d+\.\d+\.\d+.\d+\t[\ \S]+", line):
|
174 |
-
chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
|
175 |
-
if re.search(r"^\d+\.\d+\.\d+.\d+.\d+\t[\ \S]+", line):
|
176 |
-
chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
|
177 |
|
178 |
real_toc_indexes = {}
|
179 |
|
180 |
for chapter in chapters:
|
181 |
-
|
182 |
-
|
183 |
-
real_toc_indexes[chapter] = x
|
184 |
-
except ValueError as e:
|
185 |
-
try:
|
186 |
-
number = chapter.split("\t")[0] + "\t"
|
187 |
-
for line in text[forewords[1]:]:
|
188 |
-
if number in line:
|
189 |
-
x = text.index(line)
|
190 |
-
real_toc_indexes[line] = x
|
191 |
-
break
|
192 |
-
except:
|
193 |
-
real_toc_indexes[chapter] = -float("inf")
|
194 |
|
195 |
document = {}
|
196 |
toc = list(real_toc_indexes.keys())
|
@@ -200,7 +183,8 @@ def get_spec_content(specification: str, version: str):
|
|
200 |
document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
|
201 |
curr_index = x
|
202 |
|
203 |
-
document[toc[curr_index].replace("\t"," ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
|
|
|
204 |
return document
|
205 |
|
206 |
def caseSensitive(string: str, sensitive: bool):
|
|
|
160 |
if len(forewords) >= 2:
|
161 |
break
|
162 |
|
163 |
+
toc_brut = text[forewords[1]:]
|
164 |
chapters = []
|
165 |
for line in toc_brut:
|
166 |
x = line.split("\t")
|
167 |
+
m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
|
168 |
+
if m and any(line in c for c in text[forewords[0]:forewords[1]]):
|
169 |
+
chapters.append(line)
|
170 |
+
print(line)
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
real_toc_indexes = {}
|
173 |
|
174 |
for chapter in chapters:
|
175 |
+
x = text.index(chapter)
|
176 |
+
real_toc_indexes[chapter] = x
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
document = {}
|
179 |
toc = list(real_toc_indexes.keys())
|
|
|
183 |
document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
|
184 |
curr_index = x
|
185 |
|
186 |
+
document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
|
187 |
+
print(len(toc)-1, toc[curr_index], curr_index)
|
188 |
return document
|
189 |
|
190 |
def caseSensitive(string: str, sensitive: bool):
|