Spaces:
Runtime error
Runtime error
Commit
·
bb33f0e
1
Parent(s):
632bc83
Update app.py
Browse files
app.py
CHANGED
@@ -33,12 +33,17 @@ def pdf_to_text(path, start_page=1, end_page=None):
|
|
33 |
doc.close()
|
34 |
return text_list
|
35 |
|
36 |
-
def text_to_chunks(texts, word_length=150, start_page=1):
|
37 |
text_toks = [t.split(' ') for t in texts]
|
38 |
page_nums = []
|
39 |
chunks = []
|
40 |
|
|
|
|
|
41 |
for idx, words in enumerate(text_toks):
|
|
|
|
|
|
|
42 |
for i in range(0, len(words), word_length):
|
43 |
chunk = words[i:i+word_length]
|
44 |
if (i+word_length) > len(words) and (len(chunk) < word_length) and (
|
@@ -46,10 +51,11 @@ def text_to_chunks(texts, word_length=150, start_page=1):
|
|
46 |
text_toks[idx+1] = chunk + text_toks[idx+1]
|
47 |
continue
|
48 |
chunk = ' '.join(chunk).strip()
|
49 |
-
chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
|
50 |
chunks.append(chunk)
|
51 |
return chunks
|
52 |
|
|
|
53 |
class SemanticSearch:
|
54 |
|
55 |
def __init__(self):
|
|
|
33 |
doc.close()
|
34 |
return text_list
|
35 |
|
36 |
+
def text_to_chunks(texts, file_names, word_length=150, start_page=1):
|
37 |
text_toks = [t.split(' ') for t in texts]
|
38 |
page_nums = []
|
39 |
chunks = []
|
40 |
|
41 |
+
current_file_idx = 0
|
42 |
+
|
43 |
for idx, words in enumerate(text_toks):
|
44 |
+
if idx > 0 and idx % len(file_names) == 0:
|
45 |
+
current_file_idx += 1
|
46 |
+
|
47 |
for i in range(0, len(words), word_length):
|
48 |
chunk = words[i:i+word_length]
|
49 |
if (i+word_length) > len(words) and (len(chunk) < word_length) and (
|
|
|
51 |
text_toks[idx+1] = chunk + text_toks[idx+1]
|
52 |
continue
|
53 |
chunk = ' '.join(chunk).strip()
|
54 |
+
chunk = f'[{file_names[current_file_idx]}, Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
|
55 |
chunks.append(chunk)
|
56 |
return chunks
|
57 |
|
58 |
+
|
59 |
class SemanticSearch:
|
60 |
|
61 |
def __init__(self):
|