import gradio as gr
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import docx
from pptx import Presentation
import openpyxl
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Load the generator model and tokenizer.
# Note: a LLaMA checkpoint is decoder-only and cannot be loaded with
# AutoModelForSeq2SeqLM; a seq2seq model (Flan-T5 assumed here) is used so the code runs.
model_name = "google/flan-t5-base"
rag_tokenizer = AutoTokenizer.from_pretrained(model_name)
rag_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Define a function to read text from uploaded documents
def read_text_from_document(file):
    """Extract plain text from an uploaded .txt, .pdf, .docx, .pptx or .xlsx file."""
    text = ''
    if file.name.endswith('.txt'):
        text = file.read().decode('utf-8')
    elif file.name.endswith('.pdf'):
        reader = PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() or ''
    elif file.name.endswith('.docx'):
        doc = docx.Document(file)
        for para in doc.paragraphs:
            text += para.text + '\n'
    elif file.name.endswith('.pptx'):
        presentation = Presentation(file)
        for slide in presentation.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text += shape.text + '\n'
    elif file.name.endswith('.xlsx'):
        wb = openpyxl.load_workbook(file)
        sheet = wb.active
        for row in sheet.rows:
            for cell in row:
                if cell.value is not None:
                    text += str(cell.value) + ' '
    return text
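
# Illustrative only (not part of the app flow): the reader can be exercised
# directly on a local file; "sample.pdf" is a hypothetical path.
#   with open("sample.pdf", "rb") as f:
#       print(read_text_from_document(f)[:500])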
# Define a function to scrape text from a URL
def scrape_url(url):
    """Fetch a web page and return its visible text; on failure, return the error message."""
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()
    except Exception as e:
        return str(e)
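
# Illustrative only: scraping a page returns its visible text, e.g.
#   print(scrape_url("https://example.com")[:200])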
# Define a function to answer questions based on the provided context using the seq2seq model
def answer_questions(data, question):
    if data:
        # Truncate long contexts to the model's maximum input length.
        inputs = rag_tokenizer.encode(
            "Question: " + question + " Context: " + data,
            return_tensors="pt",
            truncation=True,
        )
        outputs = rag_model.generate(inputs, max_length=100)
        answer = rag_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return answer
    else:
        return "No data provided"
# Gradio interface: use the uploaded document if one is provided, otherwise scrape the URL
demo = gr.Interface(
    fn=lambda data, url, question: answer_questions(
        read_text_from_document(data) if data else scrape_url(url), question
    ),
    inputs=[
        gr.File(label="Upload Document (.txt, .pdf, .docx, .pptx, .xlsx)"),
        gr.Textbox(label="Enter URL"),
        gr.Textbox(label="Ask a question"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="RAG Chat",
    description="Upload a document or enter a URL and ask a question",
)
# Launch the demo
demo.launch()
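
# Illustrative only: when running locally, demo.launch(share=True) can be used
# instead of the plain launch() above to get a temporary public link.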