Update chapter extraction method
Browse files
app.py
CHANGED
@@ -260,37 +260,20 @@ def get_file_from_spec_id_version(req: SpecRequest) -> Dict[str, str]:
|
|
260 |
if len(forewords) >= 2:
|
261 |
break
|
262 |
|
263 |
-
toc_brut = text[forewords[
|
264 |
chapters = []
|
265 |
for line in toc_brut:
|
266 |
x = line.split("\t")
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
if re.search(r"^\d+\.\d+\.\d+\t[\ \S]+", line):
|
272 |
-
chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
|
273 |
-
if re.search(r"^\d+\.\d+\.\d+.\d+\t[\ \S]+", line):
|
274 |
-
chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
|
275 |
-
if re.search(r"^\d+\.\d+\.\d+.\d+.\d+\t[\ \S]+", line):
|
276 |
-
chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
|
277 |
|
278 |
real_toc_indexes = {}
|
279 |
|
280 |
for chapter in chapters:
|
281 |
-
|
282 |
-
|
283 |
-
real_toc_indexes[chapter] = x
|
284 |
-
except ValueError as e:
|
285 |
-
try:
|
286 |
-
number = chapter.split("\t")[0] + "\t"
|
287 |
-
for line in text[forewords[1]:]:
|
288 |
-
if number in line:
|
289 |
-
x = text.index(line)
|
290 |
-
real_toc_indexes[line] = x
|
291 |
-
break
|
292 |
-
except:
|
293 |
-
real_toc_indexes[chapter] = -float("inf")
|
294 |
|
295 |
document = {}
|
296 |
toc = list(real_toc_indexes.keys())
|
@@ -300,7 +283,7 @@ def get_file_from_spec_id_version(req: SpecRequest) -> Dict[str, str]:
|
|
300 |
document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
|
301 |
curr_index = x
|
302 |
|
303 |
-
document[toc[curr_index].replace("\t"," ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
|
304 |
return document
|
305 |
|
306 |
@app.post("/online")
|
|
|
260 |
if len(forewords) >= 2:
|
261 |
break
|
262 |
|
263 |
+
toc_brut = text[forewords[1]:]
|
264 |
chapters = []
|
265 |
for line in toc_brut:
|
266 |
x = line.split("\t")
|
267 |
+
m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
|
268 |
+
if m and any(line in c for c in text[forewords[0]:forewords[1]]):
|
269 |
+
chapters.append(line)
|
270 |
+
print(line)
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
|
272 |
real_toc_indexes = {}
|
273 |
|
274 |
for chapter in chapters:
|
275 |
+
x = text.index(chapter)
|
276 |
+
real_toc_indexes[chapter] = x
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
277 |
|
278 |
document = {}
|
279 |
toc = list(real_toc_indexes.keys())
|
|
|
283 |
document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
|
284 |
curr_index = x
|
285 |
|
286 |
+
document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
|
287 |
return document
|
288 |
|
289 |
@app.post("/online")
|