Spaces:

OrganizedProgrammers
/

3GPPDocFinder

Running

App Files Community

om4r932 committed 3 days ago

Commit

b55f83e

1 Parent(s): 97e5996

Update chapter extraction method

Browse files

Files changed (1) hide show

app.py +9 -25

app.py CHANGED Viewed

@@ -160,37 +160,20 @@ def get_spec_content(specification: str, version: str):
         if len(forewords) >= 2:
             break
-    toc_brut = text[forewords[0]:forewords[1]]
     chapters = []
     for line in toc_brut:
         x = line.split("\t")
-        if re.search(r"^\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\.\d+.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\.\d+.\d+.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
     real_toc_indexes = {}
     for chapter in chapters:
-        try:
-            x = text.index(chapter)
-            real_toc_indexes[chapter] = x
-        except ValueError as e:
-            try:
-                number = chapter.split("\t")[0] + "\t"
-                for line in text[forewords[1]:]:
-                    if number in line:
-                        x = text.index(line)
-                        real_toc_indexes[line] = x
-                        break
-            except:
-                real_toc_indexes[chapter] = -float("inf")
     document = {}
     toc = list(real_toc_indexes.keys())
@@ -200,7 +183,8 @@ def get_spec_content(specification: str, version: str):
         document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
         curr_index = x
-    document[toc[curr_index].replace("\t"," ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
     return document
 def caseSensitive(string: str, sensitive: bool):

         if len(forewords) >= 2:
             break
+    toc_brut = text[forewords[1]:]
     chapters = []
     for line in toc_brut:
         x = line.split("\t")
+        m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
+        if m and any(line in c for c in text[forewords[0]:forewords[1]]):
+            chapters.append(line)
+            print(line)
     real_toc_indexes = {}
     for chapter in chapters:
+        x = text.index(chapter)
+        real_toc_indexes[chapter] = x
     document = {}
     toc = list(real_toc_indexes.keys())
         document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
         curr_index = x
+    document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
+    print(len(toc)-1, toc[curr_index], curr_index)
     return document
 def caseSensitive(string: str, sensitive: bool):