Spaces:

OrganizedProgrammers
/

3GPPSpecSplitter

Running

App Files Files Community

om4r932 committed on May 30

Commit

00d0e4e

1 Parent(s): 857632f

Update chapter extraction method 2

Browse files

Files changed (1) hide show

app.py +16 -24

app.py CHANGED Viewed

@@ -302,36 +302,28 @@ def get_file_from_spec_id_version(req: SpecRequest) -> Dict:
         if len(forewords) >= 2:
             break
-    toc_brut = text[forewords[0]:forewords[1]]
     chapters = []
     for line in toc_brut:
         x = line.split("\t")
-        if re.search(r"^\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\.\d+.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\.\d+.\d+.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
     real_toc_indexes = {}
     for chapter in chapters:
-        try:
-            x = text.index(chapter)
-            real_toc_indexes[chapter] = x
-        except ValueError as e:
-            try:
-                number = chapter.split("\t")[0] + "\t"
-                for line in text[forewords[1]:]:
-                    if number in line:
-                        x = text.index(line)
-                        real_toc_indexes[line] = x
-                        break
-            except:
-                real_toc_indexes[chapter] = -float("inf")
     return create_nested_structure(chapters, text, real_toc_indexes)

         if len(forewords) >= 2:
             break
+    toc_brut = text[forewords[1]:]
     chapters = []
     for line in toc_brut:
         x = line.split("\t")
+        m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
+        if m and any(line in c for c in text[forewords[0]:forewords[1]]):
+            chapters.append(line)
+            print(line)
     real_toc_indexes = {}
     for chapter in chapters:
+        x = text.index(chapter)
+        real_toc_indexes[chapter] = x
+    document = {}
+    toc = list(real_toc_indexes.keys())
+    index_toc = list(real_toc_indexes.values())
+    curr_index = 0
+    for x in range(1, len(toc)):
+        document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
+        curr_index = x
+    document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
     return create_nested_structure(chapters, text, real_toc_indexes)