read and textify using token
Browse files- helper/utils.py +57 -14
helper/utils.py
CHANGED
|
@@ -14,26 +14,61 @@ def current_year():
|
|
| 14 |
return now.year
|
| 15 |
|
| 16 |
|
| 17 |
-
def read_and_textify(
|
| 18 |
-
files: List[str],
|
| 19 |
-
) -> Tuple[List[str], List[str]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
"""
|
| 21 |
-
Reads PDF files and extracts text from each page.
|
| 22 |
|
| 23 |
This function iterates over a list of uploaded PDF files, extracts text from each page,
|
| 24 |
-
and compiles a list of texts and corresponding source information.
|
| 25 |
|
| 26 |
Args:
|
| 27 |
files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
|
| 28 |
|
| 29 |
Returns:
|
| 30 |
Tuple[List[str], List[str]]: A tuple containing two lists:
|
| 31 |
-
1. A list of strings, where each string is the text extracted from a PDF page.
|
| 32 |
-
2. A list of strings indicating the source of each text (file name and page number).
|
| 33 |
"""
|
| 34 |
|
| 35 |
-
# Initialize lists to store extracted texts and their sources
|
| 36 |
-
text_list = [] # List to store extracted text
|
| 37 |
sources_list = [] # List to store source information
|
| 38 |
|
| 39 |
# Iterate over each file
|
|
@@ -43,13 +78,21 @@ def read_and_textify(
|
|
| 43 |
for i in range(len(pdfReader.pages)):
|
| 44 |
pageObj = pdfReader.pages[i] # Get the page object
|
| 45 |
text = pageObj.extract_text() # Extract text from the page
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
pageObj.clear() # Clear the page object (optional, for memory management)
|
| 47 |
-
text_list.append(text) # Add extracted text to the list
|
| 48 |
-
# Create a source identifier and add it to the list
|
| 49 |
-
sources_list.append(file.name + "_page_" + str(i))
|
| 50 |
|
| 51 |
-
# Return the lists of texts and sources
|
| 52 |
-
return [text_list, sources_list]
|
| 53 |
|
| 54 |
|
| 55 |
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
|
|
|
|
| 14 |
return now.year
|
| 15 |
|
| 16 |
|
| 17 |
+
# def read_and_textify(
|
| 18 |
+
# files: List[str],
|
| 19 |
+
# ) -> Tuple[List[str], List[str]]:
|
| 20 |
+
# """
|
| 21 |
+
# Reads PDF files and extracts text from each page.
|
| 22 |
+
|
| 23 |
+
# This function iterates over a list of uploaded PDF files, extracts text from each page,
|
| 24 |
+
# and compiles a list of texts and corresponding source information.
|
| 25 |
+
|
| 26 |
+
# Args:
|
| 27 |
+
# files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
|
| 28 |
+
|
| 29 |
+
# Returns:
|
| 30 |
+
# Tuple[List[str], List[str]]: A tuple containing two lists:
|
| 31 |
+
# 1. A list of strings, where each string is the text extracted from a PDF page.
|
| 32 |
+
# 2. A list of strings indicating the source of each text (file name and page number).
|
| 33 |
+
# """
|
| 34 |
+
|
| 35 |
+
# # Initialize lists to store extracted texts and their sources
|
| 36 |
+
# text_list = [] # List to store extracted text
|
| 37 |
+
# sources_list = [] # List to store source information
|
| 38 |
+
|
| 39 |
+
# # Iterate over each file
|
| 40 |
+
# for file in files:
|
| 41 |
+
# pdfReader = PyPDF2.PdfReader(file) # Create a PDF reader object
|
| 42 |
+
# # Iterate over each page in the PDF
|
| 43 |
+
# for i in range(len(pdfReader.pages)):
|
| 44 |
+
# pageObj = pdfReader.pages[i] # Get the page object
|
| 45 |
+
# text = pageObj.extract_text() # Extract text from the page
|
| 46 |
+
# pageObj.clear() # Clear the page object (optional, for memory management)
|
| 47 |
+
# text_list.append(text) # Add extracted text to the list
|
| 48 |
+
# # Create a source identifier and add it to the list
|
| 49 |
+
# sources_list.append(file.name + "_page_" + str(i))
|
| 50 |
+
|
| 51 |
+
# # Return the lists of texts and sources
|
| 52 |
+
# return [text_list, sources_list]
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def read_and_textify(files: List[str]) -> Tuple[List[str], List[str]]:
|
| 56 |
"""
|
| 57 |
+
Reads PDF files and extracts text from each page, breaking the text into segments of about 50 words.
|
| 58 |
|
| 59 |
This function iterates over a list of uploaded PDF files, extracts text from each page,
|
| 60 |
+
and compiles a list of texts and corresponding source information, segmented into smaller parts.
|
| 61 |
|
| 62 |
Args:
|
| 63 |
files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
|
| 64 |
|
| 65 |
Returns:
|
| 66 |
Tuple[List[str], List[str]]: A tuple containing two lists:
|
| 67 |
+
1. A list of strings, where each string is a segment of text extracted from a PDF page.
|
| 68 |
+
2. A list of strings indicating the source of each text segment (file name, page number, and segment number).
|
| 69 |
"""
|
| 70 |
|
| 71 |
+
text_list = [] # List to store extracted text segments
|
|
|
|
| 72 |
sources_list = [] # List to store source information
|
| 73 |
|
| 74 |
# Iterate over each file
|
|
|
|
| 78 |
for i in range(len(pdfReader.pages)):
|
| 79 |
pageObj = pdfReader.pages[i] # Get the page object
|
| 80 |
text = pageObj.extract_text() # Extract text from the page
|
| 81 |
+
if text:
|
| 82 |
+
# Split text into approximately 50-word chunks
|
| 83 |
+
words = text.split()
|
| 84 |
+
for j in range(0, len(words), 50):
|
| 85 |
+
chunk = ' '.join(words[j:j+50])
|
| 86 |
+
text_list.append(chunk)
|
| 87 |
+
# Create a source identifier for each chunk and add it to the list
|
| 88 |
+
sources_list.append(f"{file.name}_page_{i}_chunk_{j//50}")
|
| 89 |
+
else:
|
| 90 |
+
# If no text extracted, still add a placeholder
|
| 91 |
+
text_list.append('')
|
| 92 |
+
sources_list.append(f"{file.name}_page_{i}_chunk_0")
|
| 93 |
pageObj.clear() # Clear the page object (optional, for memory management)
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
+
return text_list, sources_list
|
|
|
|
| 96 |
|
| 97 |
|
| 98 |
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
|