om4r932 committed on
Commit
857632f
·
1 Parent(s): cec5f67

Update chapter extraction method

Browse files
Files changed (1) hide show
  1. app.py +8 -25
app.py CHANGED
@@ -260,37 +260,20 @@ def get_file_from_spec_id_version(req: SpecRequest) -> Dict[str, str]:
260
  if len(forewords) >= 2:
261
  break
262
 
263
- toc_brut = text[forewords[0]:forewords[1]]
264
  chapters = []
265
  for line in toc_brut:
266
  x = line.split("\t")
267
- if re.search(r"^\d+\t[\ \S]+", line):
268
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
269
- if re.search(r"^\d+\.\d+\t[\ \S]+", line):
270
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
271
- if re.search(r"^\d+\.\d+\.\d+\t[\ \S]+", line):
272
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
273
- if re.search(r"^\d+\.\d+\.\d+.\d+\t[\ \S]+", line):
274
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
275
- if re.search(r"^\d+\.\d+\.\d+.\d+.\d+\t[\ \S]+", line):
276
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
277
 
278
  real_toc_indexes = {}
279
 
280
  for chapter in chapters:
281
- try:
282
- x = text.index(chapter)
283
- real_toc_indexes[chapter] = x
284
- except ValueError as e:
285
- try:
286
- number = chapter.split("\t")[0] + "\t"
287
- for line in text[forewords[1]:]:
288
- if number in line:
289
- x = text.index(line)
290
- real_toc_indexes[line] = x
291
- break
292
- except:
293
- real_toc_indexes[chapter] = -float("inf")
294
 
295
  document = {}
296
  toc = list(real_toc_indexes.keys())
@@ -300,7 +283,7 @@ def get_file_from_spec_id_version(req: SpecRequest) -> Dict[str, str]:
300
  document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
301
  curr_index = x
302
 
303
- document[toc[curr_index].replace("\t"," ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
304
  return document
305
 
306
  @app.post("/online")
 
260
  if len(forewords) >= 2:
261
  break
262
 
263
+ toc_brut = text[forewords[1]:]
264
  chapters = []
265
  for line in toc_brut:
266
  x = line.split("\t")
267
+ m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
268
+ if m and any(line in c for c in text[forewords[0]:forewords[1]]):
269
+ chapters.append(line)
270
+ print(line)
 
 
 
 
 
 
271
 
272
  real_toc_indexes = {}
273
 
274
  for chapter in chapters:
275
+ x = text.index(chapter)
276
+ real_toc_indexes[chapter] = x
 
 
 
 
 
 
 
 
 
 
 
277
 
278
  document = {}
279
  toc = list(real_toc_indexes.keys())
 
283
  document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
284
  curr_index = x
285
 
286
+ document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
287
  return document
288
 
289
  @app.post("/online")