Update helper/utils.py

helper/utils.py  (+6 -5)
@@ -58,7 +58,7 @@ import PyPDF2
 
 
 def read_and_textify(
-    files: List[str], chunk_size: int = 50
+    files: List[str], chunk_size: int = 2  # Default chunk size set to 50
 ) -> Tuple[List[str], List[str]]:
     """
     Reads PDF files and extracts text from each page, breaking the text into specified segments.
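The semantics of chunk_size change here: the old code counted words per chunk, the new code counts sentences per chunk, with the default dropping to 2. The inline comment ("set to 50") appears to be left over from the word-based default. A minimal usage sketch, not part of the commit: 'uploaded_files' is a hypothetical stand-in, and despite the List[str] hint the function reads file.name, so it expects file-like objects (e.g. Streamlit uploads).

from helper.utils import read_and_textify

# chunk_size now counts sentences per chunk, not words
texts, sources = read_and_textify(uploaded_files)                 # 2 sentences per chunk
texts, sources = read_and_textify(uploaded_files, chunk_size=5)   # 5 sentences per chunk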
@@ -89,9 +89,9 @@ def read_and_textify(
         text = pageObj.extract_text()  # Extract text from the page
         if text:
             # Split text into chunks of approximately 'chunk_size' words
-            words = text.split()
+            words = text.split('. ')
             for j in range(0, len(words), chunk_size):
-                chunk = " ".join(words[j : j + chunk_size])
+                chunk = ". ".join(words[j : j + chunk_size]) + '.'
                 text_list.append(chunk)
                 # Create a source identifier for each chunk and add it to the list
                 sources_list.append(f"{file.name}_page_{i}_chunk_{j // chunk_size}")
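The splitter changes from whitespace (words) to the literal delimiter '. ' (sentences). A standalone sketch of the new behavior on illustrative input, including one quirk: the last sentence keeps its own period, so re-appending '.' doubles it, and sentences ending in '!' or '?' are never split apart.

text = "First sentence. Second one. Third here. Fourth ends."
words = text.split('. ')  # ['First sentence', 'Second one', 'Third here', 'Fourth ends.']
chunk_size = 2
for j in range(0, len(words), chunk_size):
    chunk = ". ".join(words[j : j + chunk_size]) + '.'
    print(chunk)
# First sentence. Second one.
# Third here. Fourth ends..   <- trailing period doubled on the final chunk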
@@ -237,7 +237,7 @@ def query_search(
     scores = [
         [
             sentences[i],  # The sentence itself
-            query_database[i],  # Embedding of the sentence
+            # query_database[i],  # Embedding of the sentence
             sources[i],  # Source of the sentence
             quantized_influence(
                 prompt_embed_[0], query_database[i], k=levels, use_dagger=False
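With the embedding commented out, each row of 'scores' now carries three elements: the sentence, its source, and the quantized_influence score. A toy sketch of the resulting row shape (hypothetical values standing in for the real data and scoring call):

sentences = ["Cats are mammals.", "Paris is in France."]
sources = ["doc.pdf_page_0_chunk_0", "doc.pdf_page_1_chunk_0"]
fake_qim = [0.91, 0.42]  # stand-in for quantized_influence(...)

scores = [[sentences[i], sources[i], fake_qim[i]] for i in range(len(sentences))]
# [['Cats are mammals.', 'doc.pdf_page_0_chunk_0', 0.91], ...]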
@@ -250,7 +250,8 @@ def query_search(
     refs = pd.DataFrame(scores)
     # Rename columns for clarity
     refs = refs.rename(
-        columns={0: "sentences", 1: "query_embeddings", 2: "page no", 3: "qim"}
+        # columns={0: "sentences", 1: "query_embeddings", 2: "page no", 3: "qim"}
+        columns={0: "sentences", 1: "page no", 2: "qim"}
     )
     # Sort the DataFrame based on the 'qim' score in descending order
     refs = refs.sort_values(by="qim", ascending=False)
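Dropping the embedding column shifts every later positional column down by one, which is what the new rename mapping reflects; the old 4-key mapping is kept above it as a comment. A runnable toy continuing the 'scores' sketch above:

import pandas as pd

scores = [
    ["Cats are mammals.", "doc.pdf_page_0_chunk_0", 0.91],
    ["Paris is in France.", "doc.pdf_page_1_chunk_0", 0.42],
]
refs = pd.DataFrame(scores)
refs = refs.rename(columns={0: "sentences", 1: "page no", 2: "qim"})
refs = refs.sort_values(by="qim", ascending=False)
# refs.columns: ['sentences', 'page no', 'qim'], rows ordered by qim, highest first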