Spaces:

eagle0504
/

document-search-q-series

Running

App Files Community

eagle0504 committed on Apr 20, 2024

Commit

cb4becf

1 Parent(s): 6eb23fe

read and textify using token

Browse files

Files changed (1) hide show

helper/utils.py +57 -14

helper/utils.py CHANGED Viewed

@@ -14,26 +14,61 @@ def current_year():
     return now.year
-def read_and_textify(
-    files: List[str],
-) -> Tuple[List[str], List[str]]:
     """
-    Reads PDF files and extracts text from each page.
     This function iterates over a list of uploaded PDF files, extracts text from each page,
-    and compiles a list of texts and corresponding source information.
     Args:
     files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
     Returns:
     Tuple[List[str], List[str]]: A tuple containing two lists:
-        1. A list of strings, where each string is the text extracted from a PDF page.
-        2. A list of strings indicating the source of each text (file name and page number).
     """
-    # Initialize lists to store extracted texts and their sources
-    text_list = []  # List to store extracted text
     sources_list = []  # List to store source information
     # Iterate over each file
@@ -43,13 +78,21 @@ def read_and_textify(
         for i in range(len(pdfReader.pages)):
             pageObj = pdfReader.pages[i]  # Get the page object
             text = pageObj.extract_text()  # Extract text from the page
             pageObj.clear()  # Clear the page object (optional, for memory management)
-            text_list.append(text)  # Add extracted text to the list
-            # Create a source identifier and add it to the list
-            sources_list.append(file.name + "_page_" + str(i))
-    # Return the lists of texts and sources
-    return [text_list, sources_list]
 client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

     return now.year
+# def read_and_textify(
+#     files: List[str],
+# ) -> Tuple[List[str], List[str]]:
+#     """
+#     Reads PDF files and extracts text from each page.
+#     This function iterates over a list of uploaded PDF files, extracts text from each page,
+#     and compiles a list of texts and corresponding source information.
+#     Args:
+#     files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
+#     Returns:
+#     Tuple[List[str], List[str]]: A tuple containing two lists:
+#         1. A list of strings, where each string is the text extracted from a PDF page.
+#         2. A list of strings indicating the source of each text (file name and page number).
+#     """
+#     # Initialize lists to store extracted texts and their sources
+#     text_list = []  # List to store extracted text
+#     sources_list = []  # List to store source information
+#     # Iterate over each file
+#     for file in files:
+#         pdfReader = PyPDF2.PdfReader(file)  # Create a PDF reader object
+#         # Iterate over each page in the PDF
+#         for i in range(len(pdfReader.pages)):
+#             pageObj = pdfReader.pages[i]  # Get the page object
+#             text = pageObj.extract_text()  # Extract text from the page
+#             pageObj.clear()  # Clear the page object (optional, for memory management)
+#             text_list.append(text)  # Add extracted text to the list
+#             # Create a source identifier and add it to the list
+#             sources_list.append(file.name + "_page_" + str(i))
+#     # Return the lists of texts and sources
+#     return [text_list, sources_list]
+def read_and_textify(files: List[str]) -> Tuple[List[str], List[str]]:
     """
+    Reads PDF files and extracts text from each page, breaking the text into segments of about 50 words.
     This function iterates over a list of uploaded PDF files, extracts text from each page,
+    and compiles a list of texts and corresponding source information, segmented into smaller parts.
     Args:
     files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
     Returns:
     Tuple[List[str], List[str]]: A tuple containing two lists:
+        1. A list of strings, where each string is a segment of text extracted from a PDF page.
+        2. A list of strings indicating the source of each text segment (file name, page number, and segment number).
     """
+    text_list = []  # List to store extracted text segments
     sources_list = []  # List to store source information
     # Iterate over each file
         for i in range(len(pdfReader.pages)):
             pageObj = pdfReader.pages[i]  # Get the page object
             text = pageObj.extract_text()  # Extract text from the page
+            if text:
+                # Split text into approximately 50-word chunks
+                words = text.split()
+                for j in range(0, len(words), 50):
+                    chunk = ' '.join(words[j:j+50])
+                    text_list.append(chunk)
+                    # Create a source identifier for each chunk and add it to the list
+                    sources_list.append(f"{file.name}_page_{i}_chunk_{j//50}")
+            else:
+                # If no text extracted, still add a placeholder
+                text_list.append('')
+                sources_list.append(f"{file.name}_page_{i}_chunk_0")
             pageObj.clear()  # Clear the page object (optional, for memory management)
+    return text_list, sources_list
 client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])