import os

import gradio as gr
import torch
from PIL import Image
from pdf2image import convert_from_path
from transformers import AutoModel, AutoTokenizer, ViTFeatureExtractor, ViTModel
# CSS styles (note: a component only picks up the .button rule if it is
# created with elem_classes=["button"]; gr.Interface's default submit
# button does not use this class)
css = """
.button {
    padding: 10px 20px;
    background: #007BFF;
    color: white;
    border: none;
    cursor: pointer;
    font-size: 16px;
    margin: 10px;
}
"""
# Note: the UI layout is defined by gr.Interface at the bottom of this file.
# A standalone list of gr.Row(...) components would only render inside a
# gr.Blocks context, so no separate layout definition is used here.
# Function to get image embeddings using ViT
# (ViTFeatureExtractor is deprecated in recent transformers releases in
# favor of ViTImageProcessor, but it still works)
def get_image_embeddings(image_path, model_name='google/vit-base-patch16-224'):
    feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
    model = ViTModel.from_pretrained(model_name)
    # Ensure a 3-channel image; pdf2image output is usually RGB already
    image = Image.open(image_path).convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # mean pooling over patch tokens
    return embeddings
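
# Example usage (a sketch; the path is hypothetical). For the base ViT
# model the hidden size is 768, so each page yields a (1, 768) tensor:
#   emb = get_image_embeddings("pdf_images/page_1.png")
#   print(emb.shape)  # torch.Size([1, 768])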
# Function to convert PDF pages to images; returns the saved paths in page order
def pdf_to_images(pdf_file, img_dir):
    # Note: pdf2image requires the poppler utilities to be installed on the system
    images = convert_from_path(pdf_file)
    # Create the output directory if it doesn't exist
    os.makedirs(img_dir, exist_ok=True)
    image_paths = []
    for i, image in enumerate(images):
        image_path = f"{img_dir}/page_{i + 1}.png"
        image.save(image_path, "PNG")
        image_paths.append(image_path)
    print(f"Converted {len(images)} pages to images and saved in {img_dir}")
    return image_paths
# Function to get text embeddings using a transformer model
def get_text_embeddings(text, model_name='bert-base-uncased'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():  # inference only
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # mean pooling over tokens
    return embeddings
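
# Example usage (a sketch): bert-base-uncased also has a hidden size of 768,
# which is what allows the torch.cat over image and text embeddings below:
#   emb = get_text_embeddings("hello world")
#   print(emb.shape)  # torch.Size([1, 768])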
# Function to process an uploaded PDF and generate a response
def process_pdf_and_generate_response(pdf_file):
    try:
        # With type="filepath" below, Gradio passes the path of the uploaded
        # file, so it can be handed straight to pdf2image
        img_dir = "pdf_images"
        image_paths = pdf_to_images(pdf_file, img_dir)
        # Generate embeddings for each page image, in page order
        image_embeddings = [get_image_embeddings(path) for path in image_paths]
        # Perform some text analysis on the PDF content (replace with your logic)
        pdf_text = "PDF content analysis placeholder"
        text_embeddings = get_text_embeddings(pdf_text)
        # Combine image and text embeddings into a (num_pages + 1, 768) tensor
        # and generate a response (replace with your logic)
        combined_embeddings = torch.cat([*image_embeddings, text_embeddings], dim=0)
        response = "Response based on the processed PDF"
    except Exception as e:
        response = f"An error occurred: {str(e)}"
    return response
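
# A minimal sketch of one thing the page embeddings support (not part of the
# original logic above, and the function name is our own): pick the most
# "representative" page, i.e. the page whose ViT embedding has the highest
# mean cosine similarity to all pages. Comparing ViT embeddings to each other
# is meaningful; comparing them to BERT embeddings would not be, since the
# two models embed into unrelated spaces.
def most_representative_page(image_embeddings):
    pages = torch.cat(image_embeddings, dim=0)            # (num_pages, 768)
    normed = torch.nn.functional.normalize(pages, dim=1)  # unit-length rows
    sims = normed @ normed.T                              # pairwise cosine similarities
    return int(sims.mean(dim=1).argmax())                 # 0-based page index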
iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=gr.File(label="Upload PDF", type="filepath"),
    outputs=gr.Textbox(label="Insights", placeholder="Insights generated from the PDF will appear here"),
    title="pdf-chatbot",
    description="Upload a PDF and receive insights based on its content.",
    css=css
)
if __name__ == "__main__":
iface.launch()