Update chapter extraction method 2
Browse files
app.py
CHANGED
@@ -302,36 +302,28 @@ def get_file_from_spec_id_version(req: SpecRequest) -> Dict:
|
|
302 |
if len(forewords) >= 2:
|
303 |
break
|
304 |
|
305 |
-
toc_brut = text[forewords[
|
306 |
chapters = []
|
307 |
for line in toc_brut:
|
308 |
x = line.split("\t")
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
if re.search(r"^\d+\.\d+\.\d+\t[\ \S]+", line):
|
314 |
-
chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
|
315 |
-
if re.search(r"^\d+\.\d+\.\d+.\d+\t[\ \S]+", line):
|
316 |
-
chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
|
317 |
-
if re.search(r"^\d+\.\d+\.\d+.\d+.\d+\t[\ \S]+", line):
|
318 |
-
chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
|
319 |
|
320 |
real_toc_indexes = {}
|
321 |
|
322 |
for chapter in chapters:
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
break
|
334 |
-
except:
|
335 |
-
real_toc_indexes[chapter] = -float("inf")
|
336 |
|
|
|
337 |
return create_nested_structure(chapters, text, real_toc_indexes)
|
|
|
302 |
if len(forewords) >= 2:
|
303 |
break
|
304 |
|
305 |
+
toc_brut = text[forewords[1]:]
|
306 |
chapters = []
|
307 |
for line in toc_brut:
|
308 |
x = line.split("\t")
|
309 |
+
m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
|
310 |
+
if m and any(line in c for c in text[forewords[0]:forewords[1]]):
|
311 |
+
chapters.append(line)
|
312 |
+
print(line)
|
|
|
|
|
|
|
|
|
|
|
|
|
313 |
|
314 |
real_toc_indexes = {}
|
315 |
|
316 |
for chapter in chapters:
|
317 |
+
x = text.index(chapter)
|
318 |
+
real_toc_indexes[chapter] = x
|
319 |
+
|
320 |
+
document = {}
|
321 |
+
toc = list(real_toc_indexes.keys())
|
322 |
+
index_toc = list(real_toc_indexes.values())
|
323 |
+
curr_index = 0
|
324 |
+
for x in range(1, len(toc)):
|
325 |
+
document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
|
326 |
+
curr_index = x
|
|
|
|
|
|
|
327 |
|
328 |
+
document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
|
329 |
return create_nested_structure(chapters, text, real_toc_indexes)
|