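"""Streamlit QA app over the NVIDIA CUDA documentation.

Pipeline: crawl pages under https://docs.nvidia.com/cuda/, chunk the text
by sentence, embed the chunks with sentence-transformers/all-MiniLM-L6-v2,
index them in FAISS, and answer questions with Gemini through a LangChain
RetrievalQA chain.
"""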
import os
import re
import time
from collections import deque
from urllib.parse import urljoin

import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAI
# Breadth-first crawl of the CUDA documentation, restricted to
# https://docs.nvidia.com/cuda/ and capped by depth and frontier size.
def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1):
    visited = set()
    results = []
    queue = deque([(start_url, 0)])
    crawled_urls = []
    while queue:
        url, depth = queue.popleft()
        if depth > max_depth or url in visited:
            continue
        visited.add(url)
        crawled_urls.append(url)
        try:
            time.sleep(delay)  # be polite to the server
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Collapse whitespace runs in the extracted page text.
            text = re.sub(r'\s+', ' ', soup.get_text()).strip()
            results.append((url, text))
            if depth < max_depth:
                for link in soup.find_all('a', href=True):
                    next_url = urljoin(url, link['href'])
                    if next_url.startswith('https://docs.nvidia.com/cuda/') and next_url not in visited:
                        queue.append((next_url, depth + 1))
                    # Cap the frontier so a single crawl stays small.
                    if len(queue) > 10:
                        break
        except Exception as e:
            print(f"Error crawling {url}: {e}")
    return results, crawled_urls
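# Usage sketch (not executed at import time): a shallow crawl of the docs
# landing page and its direct links.
#   pages, urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1)
#   # pages is a list of (url, cleaned_text) tuples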
# Greedy sentence-level chunking: sentences are packed into chunks of at
# most max_chunk_size characters.
def chunk_text(text: str, max_chunk_size: int = 1000):
    chunks = []
    current_chunk = ""
    # Split on whitespace that follows sentence-ending punctuation.
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
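# Worked example (hypothetical input): with max_chunk_size=12,
#   chunk_text("One. Two. Three words here.", 12)
# packs "One." and "Two." into the first chunk (9 characters) and starts a
# new chunk at "Three words here.", since adding it would exceed the budget.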
# Streamlit UI
st.title("CUDA Documentation QA System")

# Initialize session state
if 'vector_store' not in st.session_state:
    st.session_state.vector_store = None
if 'documents_loaded' not in st.session_state:
    st.session_state.documents_loaded = False

# Crawl and process the documentation
if st.button('Crawl CUDA Documentation'):
    with st.spinner('Crawling CUDA documentation...'):
        crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)
        st.write(f"Processed {len(crawled_data)} pages.")
        texts = []
        metadatas = []
        for url, text in crawled_data:
            chunks = chunk_text(text, max_chunk_size=1024)
            texts.extend(chunks)
            # Track the source URL of each chunk so answers can cite it.
            metadatas.extend({"source": url} for _ in chunks)
        st.success("Crawling and processing completed.")
        # Embed each chunk with a small CPU-friendly sentence-transformer
        embeddings = HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-MiniLM-L6-v2',
            model_kwargs={'device': 'cpu'})
        # Index the embeddings in FAISS for similarity search
        st.session_state.vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
        st.session_state.documents_loaded = True
        st.write("Embeddings stored in FAISS.")
# Question answering
query = st.text_input("Enter your question about CUDA:")
if query and st.session_state.documents_loaded:
    with st.spinner('Searching for an answer...'):
        # Initialize Google Generative AI. The key is read from the
        # environment (assumes a GOOGLE_API_KEY variable is set) rather
        # than hard-coded into the source.
        llm = GoogleGenerativeAI(model='gemini-1.0-pro',
                                 google_api_key=os.environ.get("GOOGLE_API_KEY"))
        # Prompt used to answer from the retrieved context
        qa_prompt = PromptTemplate(
            template="Answer the following question based on the context provided:\n\n{context}\n\nQuestion: {question}\nAnswer:",
            input_variables=["context", "question"])
        # Build the retrieval QA chain. "stuff" concatenates the retrieved
        # chunks into the prompt (map_rerank would need a prompt with an
        # output parser); the custom prompt goes in via chain_type_kwargs.
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=st.session_state.vector_store.as_retriever(),
            chain_type_kwargs={"prompt": qa_prompt},
            return_source_documents=True,
        )
        response = qa_chain({"query": query})
        st.write("**Answer:**")
        st.write(response['result'])
        st.write("**Sources:**")
        for source in sorted({doc.metadata.get("source", "") for doc in response['source_documents']}):
            st.write(source)
elif query:
    st.warning("Please crawl the CUDA documentation first.")
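# To run locally (assuming this file is saved as app.py and the
# GOOGLE_API_KEY environment variable is set):
#   streamlit run app.py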