om4r932 committed on
Commit
00d0e4e
·
1 Parent(s): 857632f

Update chapter extraction method 2

Browse files
Files changed (1) hide show
  1. app.py +16 -24
app.py CHANGED
@@ -302,36 +302,28 @@ def get_file_from_spec_id_version(req: SpecRequest) -> Dict:
302
  if len(forewords) >= 2:
303
  break
304
 
305
- toc_brut = text[forewords[0]:forewords[1]]
306
  chapters = []
307
  for line in toc_brut:
308
  x = line.split("\t")
309
- if re.search(r"^\d+\t[\ \S]+", line):
310
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
311
- if re.search(r"^\d+\.\d+\t[\ \S]+", line):
312
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
313
- if re.search(r"^\d+\.\d+\.\d+\t[\ \S]+", line):
314
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
315
- if re.search(r"^\d+\.\d+\.\d+.\d+\t[\ \S]+", line):
316
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
317
- if re.search(r"^\d+\.\d+\.\d+.\d+.\d+\t[\ \S]+", line):
318
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
319
 
320
  real_toc_indexes = {}
321
 
322
  for chapter in chapters:
323
- try:
324
- x = text.index(chapter)
325
- real_toc_indexes[chapter] = x
326
- except ValueError as e:
327
- try:
328
- number = chapter.split("\t")[0] + "\t"
329
- for line in text[forewords[1]:]:
330
- if number in line:
331
- x = text.index(line)
332
- real_toc_indexes[line] = x
333
- break
334
- except:
335
- real_toc_indexes[chapter] = -float("inf")
336
 
 
337
  return create_nested_structure(chapters, text, real_toc_indexes)
 
302
  if len(forewords) >= 2:
303
  break
304
 
305
+ toc_brut = text[forewords[1]:]
306
  chapters = []
307
  for line in toc_brut:
308
  x = line.split("\t")
309
+ m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
310
+ if m and any(line in c for c in text[forewords[0]:forewords[1]]):
311
+ chapters.append(line)
312
+ print(line)
 
 
 
 
 
 
313
 
314
  real_toc_indexes = {}
315
 
316
  for chapter in chapters:
317
+ x = text.index(chapter)
318
+ real_toc_indexes[chapter] = x
319
+
320
+ document = {}
321
+ toc = list(real_toc_indexes.keys())
322
+ index_toc = list(real_toc_indexes.values())
323
+ curr_index = 0
324
+ for x in range(1, len(toc)):
325
+ document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
326
+ curr_index = x
 
 
 
327
 
328
+ document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
329
  return create_nested_structure(chapters, text, real_toc_indexes)