Update helper/utils.py

helper/utils.py  (+6 -5)
@@ -58,7 +58,7 @@ import PyPDF2
 
 
 def read_and_textify(
-    files: List[str], chunk_size: int = 50
+    files: List[str], chunk_size: int = 2  # Default chunk size set to 50
 ) -> Tuple[List[str], List[str]]:
     """
     Reads PDF files and extracts text from each page, breaking the text into specified segments.
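The semantics of chunk_size change here: the old code counted words per chunk, the new code counts sentences per chunk, with the default dropping to 2. The inline comment ("set to 50") appears to be left over from the word-based default. A minimal usage sketch, not part of the commit: 'uploaded_files' is a hypothetical stand-in, and despite the List[str] hint the function reads file.name, so it expects file-like objects (e.g. Streamlit uploads).

from helper.utils import read_and_textify

# chunk_size now counts sentences per chunk, not words
texts, sources = read_and_textify(uploaded_files)                 # 2 sentences per chunk
texts, sources = read_and_textify(uploaded_files, chunk_size=5)   # 5 sentences per chunk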
@@ -89,9 +89,9 @@ def read_and_textify(
         text = pageObj.extract_text()  # Extract text from the page
         if text:
             # Split text into chunks of approximately 'chunk_size' words
-            words = text.split()
+            words = text.split('. ')
             for j in range(0, len(words), chunk_size):
-                chunk = " ".join(words[j : j + chunk_size])
+                chunk = ". ".join(words[j : j + chunk_size]) + '.'
                 text_list.append(chunk)
                 # Create a source identifier for each chunk and add it to the list
                 sources_list.append(f"{file.name}_page_{i}_chunk_{j // chunk_size}")
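The splitter changes from whitespace (words) to the literal delimiter '. ' (sentences). A standalone sketch of the new behavior on illustrative input, including one quirk: the last sentence keeps its own period, so re-appending '.' doubles it, and sentences ending in '!' or '?' are never split apart.

text = "First sentence. Second one. Third here. Fourth ends."
words = text.split('. ')  # ['First sentence', 'Second one', 'Third here', 'Fourth ends.']
chunk_size = 2
for j in range(0, len(words), chunk_size):
    chunk = ". ".join(words[j : j + chunk_size]) + '.'
    print(chunk)
# First sentence. Second one.
# Third here. Fourth ends..   <- trailing period doubled on the final chunk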
@@ -237,7 +237,7 @@ def query_search(
     scores = [
         [
             sentences[i],  # The sentence itself
-            query_database[i],  # Embedding of the sentence
+            # query_database[i],  # Embedding of the sentence
             sources[i],  # Source of the sentence
             quantized_influence(
                 prompt_embed_[0], query_database[i], k=levels, use_dagger=False
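With the embedding commented out, each row of 'scores' now carries three elements: the sentence, its source, and the quantized_influence score. A toy sketch of the resulting row shape (hypothetical values standing in for the real data and scoring call):

sentences = ["Cats are mammals.", "Paris is in France."]
sources = ["doc.pdf_page_0_chunk_0", "doc.pdf_page_1_chunk_0"]
fake_qim = [0.91, 0.42]  # stand-in for quantized_influence(...)

scores = [[sentences[i], sources[i], fake_qim[i]] for i in range(len(sentences))]
# [['Cats are mammals.', 'doc.pdf_page_0_chunk_0', 0.91], ...]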
@@ -250,7 +250,8 @@ def query_search(
     refs = pd.DataFrame(scores)
     # Rename columns for clarity
     refs = refs.rename(
-        columns={0: "sentences", 1: "query_embeddings", 2: "page no", 3: "qim"}
+        # columns={0: "sentences", 1: "query_embeddings", 2: "page no", 3: "qim"}
+        columns={0: "sentences", 1: "page no", 2: "qim"}
     )
     # Sort the DataFrame based on the 'qim' score in descending order
     refs = refs.sort_values(by="qim", ascending=False)
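Dropping the embedding column shifts every later positional column down by one, which is what the new rename mapping reflects; the old 4-key mapping is kept above it as a comment. A runnable toy continuing the 'scores' sketch above:

import pandas as pd

scores = [
    ["Cats are mammals.", "doc.pdf_page_0_chunk_0", 0.91],
    ["Paris is in France.", "doc.pdf_page_1_chunk_0", 0.42],
]
refs = pd.DataFrame(scores)
refs = refs.rename(columns={0: "sentences", 1: "page no", 2: "qim"})
refs = refs.sort_values(by="qim", ascending=False)
# refs.columns: ['sentences', 'page no', 'qim'], rows ordered by qim, highest first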