bhaskartripathi committed on
Commit
bb33f0e
·
1 Parent(s): 632bc83

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -2
app.py CHANGED
@@ -33,12 +33,17 @@ def pdf_to_text(path, start_page=1, end_page=None):
33
  doc.close()
34
  return text_list
35
 
36
- def text_to_chunks(texts, word_length=150, start_page=1):
37
  text_toks = [t.split(' ') for t in texts]
38
  page_nums = []
39
  chunks = []
40
 
 
 
41
  for idx, words in enumerate(text_toks):
 
 
 
42
  for i in range(0, len(words), word_length):
43
  chunk = words[i:i+word_length]
44
  if (i+word_length) > len(words) and (len(chunk) < word_length) and (
@@ -46,10 +51,11 @@ def text_to_chunks(texts, word_length=150, start_page=1):
46
  text_toks[idx+1] = chunk + text_toks[idx+1]
47
  continue
48
  chunk = ' '.join(chunk).strip()
49
- chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
50
  chunks.append(chunk)
51
  return chunks
52
 
 
53
  class SemanticSearch:
54
 
55
  def __init__(self):
 
33
  doc.close()
34
  return text_list
35
 
36
+ def text_to_chunks(texts, file_names, word_length=150, start_page=1):
37
  text_toks = [t.split(' ') for t in texts]
38
  page_nums = []
39
  chunks = []
40
 
41
+ current_file_idx = 0
42
+
43
  for idx, words in enumerate(text_toks):
44
+ if idx > 0 and idx % len(file_names) == 0:
45
+ current_file_idx += 1
46
+
47
  for i in range(0, len(words), word_length):
48
  chunk = words[i:i+word_length]
49
  if (i+word_length) > len(words) and (len(chunk) < word_length) and (
 
51
  text_toks[idx+1] = chunk + text_toks[idx+1]
52
  continue
53
  chunk = ' '.join(chunk).strip()
54
+ chunk = f'[{file_names[current_file_idx]}, Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
55
  chunks.append(chunk)
56
  return chunks
57
 
58
+
59
  class SemanticSearch:
60
 
61
  def __init__(self):