Spaces:

tony-42069
/

cre-chatbot-rag

Sleeping

File size: 3,168 Bytes

fbfbbd7
afb405a
0881f45
fbfbbd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0881f45
fbfbbd7
afb405a
 
 
 
 
 
 
0881f45
afb405a
0881f45
 
 
afb405a
0881f45
 
 
 
 
 
 
 
 
 
afb405a
0881f45
 
 
 
 
 
 
 
afb405a
0881f45
 
afb405a
0881f45
 
 
 
 
 
 
 
 
 
 
afb405a
0881f45
 
 
afb405a

from typing import List, Dict
import os
import pypdf
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

class PDFProcessor:
    """Turn a PDF file into a list of text chunks tagged with their page.

    Extraction is attempted with langchain's ``PyPDFLoader`` first; if that
    raises for any reason, ``pypdf`` is used directly as a fallback.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Build the text splitter shared by every call to ``process_pdf``.

        Args:
            chunk_size (int): ``chunk_size`` handed to
                ``RecursiveCharacterTextSplitter`` (lengths measured by ``len``).
            chunk_overlap (int): ``chunk_overlap`` handed to the same splitter.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Process a PDF file and return chunks of text with metadata.

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            List[Dict]: One dict per chunk, shaped
                ``{'text': <chunk>, 'metadata': {'page': <page number>}}``.
                Page numbers are zero-based on both extraction paths.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            RuntimeError: If both PyPDFLoader and pypdf fail; the message
                contains both error texts and the pypdf error is attached as
                ``__cause__``.
        """
        print(f"Processing PDF at: {os.path.abspath(pdf_path)}")

        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        print(f"PDF file exists, size: {os.path.getsize(pdf_path)} bytes")

        try:
            return self._chunks_with_loader(pdf_path)
        except Exception as e:
            # Deliberately broad: any loader failure should trigger the fallback.
            print(f"Error with PyPDFLoader: {str(e)}")
            print("Trying alternative PDF processing method...")

            try:
                return self._chunks_with_pypdf(pdf_path)
            except Exception as e2:
                error_msg = f"Failed to process PDF with both methods.\nPyPDFLoader error: {str(e)}\npypdf error: {str(e2)}"
                print(error_msg)
                # RuntimeError is still caught by ``except Exception``; chaining
                # keeps the underlying pypdf traceback attached to the error.
                raise RuntimeError(error_msg) from e2

    def _chunk_page(self, text: str, page_number: int) -> List[Dict]:
        """Split one page's text and wrap each piece with its page metadata."""
        if not text:
            # Nothing extractable on this page, so it contributes no chunks.
            return []
        return [
            {'text': piece, 'metadata': {'page': page_number}}
            for piece in self.text_splitter.split_text(text)
        ]

    def _chunks_with_loader(self, pdf_path: str) -> List[Dict]:
        """Extract and chunk every page using langchain's PyPDFLoader."""
        print("Attempting to use PyPDFLoader...")
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")

        chunks = []
        for page in pages:
            chunks.extend(self._chunk_page(page.page_content, page.metadata['page']))
        print(f"Created {len(chunks)} chunks from PyPDFLoader method")
        return chunks

    def _chunks_with_pypdf(self, pdf_path: str) -> List[Dict]:
        """Extract and chunk every page by reading the file with pypdf."""
        print("Attempting to use pypdf directly...")
        # Pages are consumed while the file is still open.
        with open(pdf_path, 'rb') as file:
            pdf = pypdf.PdfReader(file)
            print(f"Successfully opened PDF with {len(pdf.pages)} pages")

            chunks = []
            for page_index, page in enumerate(pdf.pages):
                # Normalise a missing extraction result to an empty string.
                text = page.extract_text() or ""
                # Zero-based index, matching the page number PyPDFLoader
                # reports, so both paths label pages identically.
                chunks.extend(self._chunk_page(text, page_index))
        print(f"Created {len(chunks)} chunks from direct pypdf method")
        return chunks