Spaces:
Runtime error
Runtime error
Commit
·
7ff879d
1
Parent(s):
7f9e142
Update app.py
Browse files
app.py
CHANGED
|
@@ -38,11 +38,18 @@ def text_to_chunks(texts, file_names, word_length=150, start_page=1):
|
|
| 38 |
page_nums = []
|
| 39 |
chunks = []
|
| 40 |
|
| 41 |
-
|
| 42 |
|
| 43 |
for idx, words in enumerate(text_toks):
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
for i in range(0, len(words), word_length):
|
| 48 |
chunk = words[i:i+word_length]
|
|
@@ -51,11 +58,12 @@ def text_to_chunks(texts, file_names, word_length=150, start_page=1):
|
|
| 51 |
text_toks[idx+1] = chunk + text_toks[idx+1]
|
| 52 |
continue
|
| 53 |
chunk = ' '.join(chunk).strip()
|
| 54 |
-
chunk = f'[{file_names[current_file_idx]}, Page no. {
|
| 55 |
chunks.append(chunk)
|
| 56 |
return chunks
|
| 57 |
|
| 58 |
|
|
|
|
| 59 |
class SemanticSearch:
|
| 60 |
|
| 61 |
def __init__(self):
|
|
@@ -91,18 +99,19 @@ class SemanticSearch:
|
|
| 91 |
def load_recommender(paths, start_page=1):
|
| 92 |
global recommender
|
| 93 |
all_texts = []
|
| 94 |
-
file_names =
|
| 95 |
|
| 96 |
for path in paths:
|
| 97 |
texts = pdf_to_text(path, start_page=start_page)
|
| 98 |
all_texts.extend(texts)
|
| 99 |
-
file_names
|
| 100 |
|
| 101 |
chunks = text_to_chunks(all_texts, file_names, start_page=start_page)
|
| 102 |
recommender.fit(chunks)
|
| 103 |
return 'Corpus Loaded.'
|
| 104 |
|
| 105 |
|
|
|
|
| 106 |
|
| 107 |
def generate_text(openAI_key, prompt, engine="text-davinci-003"):
|
| 108 |
openai.api_key = openAI_key
|
|
|
|
| 38 |
page_nums = []
|
| 39 |
chunks = []
|
| 40 |
|
| 41 |
+
total_pages = len(texts)
|
| 42 |
|
| 43 |
for idx, words in enumerate(text_toks):
|
| 44 |
+
current_file_idx = 0
|
| 45 |
+
current_page = idx + start_page
|
| 46 |
+
|
| 47 |
+
for i, num_pages in enumerate(file_names.values()):
|
| 48 |
+
if current_page > num_pages:
|
| 49 |
+
current_page -= num_pages
|
| 50 |
+
current_file_idx += 1
|
| 51 |
+
else:
|
| 52 |
+
break
|
| 53 |
|
| 54 |
for i in range(0, len(words), word_length):
|
| 55 |
chunk = words[i:i+word_length]
|
|
|
|
| 58 |
text_toks[idx+1] = chunk + text_toks[idx+1]
|
| 59 |
continue
|
| 60 |
chunk = ' '.join(chunk).strip()
|
| 61 |
+
chunk = f'[{list(file_names.keys())[current_file_idx]}, Page no. {current_page}]' + ' ' + '"' + chunk + '"'
|
| 62 |
chunks.append(chunk)
|
| 63 |
return chunks
|
| 64 |
|
| 65 |
|
| 66 |
+
|
| 67 |
class SemanticSearch:
|
| 68 |
|
| 69 |
def __init__(self):
|
|
|
|
| 99 |
def load_recommender(paths, start_page=1):
    """Build the retrieval corpus from one or more PDF files.

    Extracts the text of every PDF in *paths*, remembers how many pages
    each file contributed, chunks the combined text, and fits the global
    recommender on those chunks.

    Args:
        paths: Iterable of PDF file paths to ingest.
        start_page: 1-based page number at which extraction starts for
            each file (forwarded to both pdf_to_text and text_to_chunks).

    Returns:
        The status string 'Corpus Loaded.' once fitting is done.
    """
    global recommender

    corpus = []
    # Maps file basename -> number of extracted pages; text_to_chunks
    # uses this to attribute each chunk to the right file and page.
    # NOTE(review): two paths sharing a basename would overwrite each
    # other's entry here — confirm callers never pass duplicate names.
    file_names = {}

    for pdf_path in paths:
        pages = pdf_to_text(pdf_path, start_page=start_page)
        corpus.extend(pages)
        file_names[os.path.basename(pdf_path)] = len(pages)

    recommender.fit(text_to_chunks(corpus, file_names, start_page=start_page))
    return 'Corpus Loaded.'
|
| 112 |
|
| 113 |
|
| 114 |
+
|
| 115 |
|
| 116 |
def generate_text(openAI_key, prompt, engine="text-davinci-003"):
|
| 117 |
openai.api_key = openAI_key
|