om4r932 committed on
Commit
b55f83e
·
1 Parent(s): 97e5996

Update chapter extraction method

Browse files
Files changed (1) hide show
  1. app.py +9 -25
app.py CHANGED
@@ -160,37 +160,20 @@ def get_spec_content(specification: str, version: str):
160
  if len(forewords) >= 2:
161
  break
162
 
163
- toc_brut = text[forewords[0]:forewords[1]]
164
  chapters = []
165
  for line in toc_brut:
166
  x = line.split("\t")
167
- if re.search(r"^\d+\t[\ \S]+", line):
168
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
169
- if re.search(r"^\d+\.\d+\t[\ \S]+", line):
170
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
171
- if re.search(r"^\d+\.\d+\.\d+\t[\ \S]+", line):
172
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
173
- if re.search(r"^\d+\.\d+\.\d+.\d+\t[\ \S]+", line):
174
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
175
- if re.search(r"^\d+\.\d+\.\d+.\d+.\d+\t[\ \S]+", line):
176
- chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
177
 
178
  real_toc_indexes = {}
179
 
180
  for chapter in chapters:
181
- try:
182
- x = text.index(chapter)
183
- real_toc_indexes[chapter] = x
184
- except ValueError as e:
185
- try:
186
- number = chapter.split("\t")[0] + "\t"
187
- for line in text[forewords[1]:]:
188
- if number in line:
189
- x = text.index(line)
190
- real_toc_indexes[line] = x
191
- break
192
- except:
193
- real_toc_indexes[chapter] = -float("inf")
194
 
195
  document = {}
196
  toc = list(real_toc_indexes.keys())
@@ -200,7 +183,8 @@ def get_spec_content(specification: str, version: str):
200
  document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
201
  curr_index = x
202
 
203
- document[toc[curr_index].replace("\t"," ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
 
204
  return document
205
 
206
  def caseSensitive(string: str, sensitive: bool):
 
160
  if len(forewords) >= 2:
161
  break
162
 
163
+ toc_brut = text[forewords[1]:]
164
  chapters = []
165
  for line in toc_brut:
166
  x = line.split("\t")
167
+ m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
168
+ if m and any(line in c for c in text[forewords[0]:forewords[1]]):
169
+ chapters.append(line)
170
+ print(line)
 
 
 
 
 
 
171
 
172
  real_toc_indexes = {}
173
 
174
  for chapter in chapters:
175
+ x = text.index(chapter)
176
+ real_toc_indexes[chapter] = x
 
 
 
 
 
 
 
 
 
 
 
177
 
178
  document = {}
179
  toc = list(real_toc_indexes.keys())
 
183
  document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
184
  curr_index = x
185
 
186
+ document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
187
+ print(len(toc)-1, toc[curr_index], curr_index)
188
  return document
189
 
190
  def caseSensitive(string: str, sensitive: bool):