Spaces:

Penality
/

pdf-something

Sleeping

File size: 2,391 Bytes

fe36699
cf10f44
 
92757b3
 
cf10f44
fe36699
cf10f44
05fe66e
fe36699
 
cf10f44
 
 
 
 
 
 
fe36699
cf10f44
 
 
 
 
 
 
 
 
fe36699
cf10f44
 
 
 
 
 
fe36699
cf10f44
 
 
 
 
 
 
 
 
 
 
 
fe36699
cf10f44
fe36699
cf10f44
 
b22305d
cf10f44
 
fe36699
cf10f44
 
fe36699
cf10f44
 
 
 
 
 
fe36699
 
cf10f44
5f6bee3

import gradio as gr
import pdfplumber
import openai
import re
import unicodedata
import os

# Read the OpenAI API key from the environment instead of hard-coding it.
# A key committed to source control is readable by anyone with access to the
# repository and must be treated as compromised: revoke it and issue a new one,
# then provide the new key through the OPENAI_API_KEY environment variable
# (for example as a secret in the hosting platform's settings).
openai.api_key = os.environ.get("OPENAI_API_KEY")


def clean_text(text):
    """Clean extracted text for better processing by the model.

    Steps, in order:
      1. Unicode NFKC normalization (e.g. ligatures become plain letters).
      2. Every character other than ASCII letters, digits and basic
         punctuation is replaced by a space.
      3. "page <number>" markers are removed (case-insensitive).
      4. Runs of whitespace are collapsed to one space and the result is
         stripped.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode characters
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\'\"()\-]', ' ', text)  # Keep basic punctuation
    text = re.sub(r'(?i)(page\s*\d+)', '', text)  # Remove page numbers
    # Collapse whitespace last: the two substitutions above introduce or leave
    # behind extra spaces, so doing this first would not yield clean output.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def extract_text_from_pdf(pdf_file):
    """Return the cleaned text of all pages of the PDF, joined by spaces.

    Pages for which no text is extracted are skipped. If anything goes wrong
    while opening or reading the PDF, the error is printed and None is
    returned.
    """
    try:
        with pdfplumber.open(pdf_file) as document:
            cleaned_pages = []
            for page in document.pages:
                raw = page.extract_text()
                if raw:
                    cleaned_pages.append(clean_text(raw))
            combined = " ".join(cleaned_pages)
        return combined
    except Exception as err:
        print(f"Error extracting text: {err}")
        return None

def split_text(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` characters.

    Chunks are cut purely by character count, so a chunk boundary may fall in
    the middle of a word.

    Args:
        text: The text to split. ``None`` or an empty string yields no chunks.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunks in original order; the last one may be shorter.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Treat missing text (e.g. a failed extraction) as "nothing to split"
    # instead of failing on len(None).
    if not text:
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

def chatbot(pdf_file, user_question):
    """Answer a question about an uploaded PDF using the OpenAI chat API.

    Only the first chunk of the extracted text is sent to the model (to
    reduce token usage), so content beyond that chunk is not considered.

    Args:
        pdf_file: The uploaded PDF as provided by the file input; it is passed
            unchanged to ``extract_text_from_pdf``. ``None`` if nothing was
            uploaded.
        user_question: The question entered by the user.

    Returns:
        The model's answer, or a short explanatory message when the upload or
        question is missing or no text could be extracted from the PDF.
    """
    # Validate the inputs before doing any work.
    if pdf_file is None:
        return "Please upload a PDF file."
    if not user_question or not user_question.strip():
        return "Please enter a question."

    # Step 1: Extract text from the PDF. The extractor returns None when it
    # fails, which must not be passed on to split_text().
    text = extract_text_from_pdf(pdf_file)
    if not text:
        return "Could not extract any text from the PDF."

    # Step 2: Split into chunks
    chunks = split_text(text)

    # Step 3: Use only the first chunk for now (to reduce token usage)
    if not chunks:
        return "Could not extract any text from the PDF."

    prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"

    # Step 4: Send to the OpenAI chat completion endpoint (gpt-4o-mini).
    # NOTE(review): openai.ChatCompletion is the interface of the pre-1.0
    # openai package - confirm which version the deployment installs.
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    )

    # Step 5: Return the chatbot's response
    return response["choices"][0]["message"]["content"]

# Gradio interface: a PDF upload and a question box go in, the answer comes out.
iface = gr.Interface(
    title="PDF Q&A Chatbot",
    fn=chatbot,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Ask a Question"),
    ],
    outputs=gr.Textbox(label="Answer"),
)

# Start the web app as soon as this module is executed.
iface.launch()