|
import os |
|
import gradio as gr |
|
from langchain.document_loaders import PyPDFLoader, YoutubeLoader, TextLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_openai import OpenAIEmbeddings |
|
from langchain_community.vectorstores import FAISS |
|
from langchain.chains import RetrievalQA |
|
from langchain.chat_models import init_chat_model |
|
|
|
|
|
# Resolve the OpenAI key from the environment. The conventional upper-case
# variable is consulted first; the lower-case "openai" name is the fallback.
OPENAI_API_KEY = next(
    (value for value in map(os.getenv, ("OPENAI_API_KEY", "openai")) if value),
    None,
)

# Fail fast at import time: nothing below can work without a key.
if OPENAI_API_KEY is None:
    raise ValueError("β OPENAI API Key not found. Please add it in Hugging Face secrets as 'OPENAI_API_KEY' or 'openai'.")
|
|
|
|
|
def _uploaded_path(upload):
    """Return the filesystem path of an uploaded file, or None if absent.

    Accepts either an object that exposes its path as ``.name`` or a plain
    path string, so the caller does not depend on which of the two the UI
    layer hands over.
    """
    if upload is None:
        return None
    return getattr(upload, "name", upload)


def process_inputs(pdf_file, youtube_url, txt_file, query):
    """Answer ``query`` from an uploaded PDF plus an optional transcript.

    The PDF is mandatory. A YouTube transcript is loaded when a URL is
    given; the ``.txt`` upload is only used when no YouTube transcript was
    loaded (no URL, or the transcript fetch failed).

    Args:
        pdf_file: Uploaded PDF (object with ``.name`` or a path string).
        youtube_url: Optional YouTube video URL; blank values are ignored.
        txt_file: Optional transcript upload used as a fallback.
        query: The user's question.

    Returns:
        The answer text on success, otherwise a human-readable error
        message. This function reports failures through its return value
        and does not raise for the handled cases.
    """
    # NOTE(review): the leading glyph in the messages below looks like a
    # mis-decoded emoji; confirm the file's encoding before changing it.

    # Validate the cheap things first so no loader, embedding or LLM call is
    # made for a request that cannot be answered anyway.
    question = (query or "").strip()
    if not question:
        return "β Please enter a question."

    pdf_path = _uploaded_path(pdf_file)
    if not pdf_path:
        return "β Please upload a PDF file."

    docs = []

    try:
        docs.extend(PyPDFLoader(pdf_path).load())
    except Exception as e:
        return f"β Failed to load PDF: {e}"

    # The YouTube transcript is best-effort: a failure is logged and the
    # .txt upload (if any) is used instead.
    yt_loaded = False
    video_url = (youtube_url or "").strip()
    if video_url:
        try:
            yt_loader = YoutubeLoader.from_youtube_url(video_url, add_video_info=False)
            docs.extend(yt_loader.load())
            yt_loaded = True
        except Exception as e:
            print(f"β οΈ YouTube transcript not loaded: {e}")

    txt_path = _uploaded_path(txt_file)
    if not yt_loaded and txt_path:
        try:
            docs.extend(TextLoader(txt_path).load())
        except Exception as e:
            return f"β Failed to load transcript file: {e}"

    if not docs:
        return "β No documents could be loaded. Please check your inputs."

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    splits = splitter.split_documents(docs)
    if not splits:
        # Documents without extractable text (e.g. image-only PDFs) yield no
        # chunks, and an empty chunk list cannot be indexed.
        return "β No text could be extracted from the provided documents."

    # Embedding and model setup talk to the OpenAI API; surface those
    # failures as a message instead of an unhandled exception.
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
        db = FAISS.from_documents(splits, embedding)
        llm = init_chat_model("gpt-4o-mini", model_provider="openai", api_key=OPENAI_API_KEY)
        qa = RetrievalQA.from_chain_type(llm, retriever=db.as_retriever())
    except Exception as e:
        return f"β Failed to build the search index: {e}"

    try:
        result = qa.invoke({"query": question})
        return result["result"]
    except Exception as e:
        return f"β Retrieval failed: {e}"
|
|
|
|
|
# Gradio front end: three source inputs, a question box, an answer box, and
# a button that feeds everything to process_inputs.
with gr.Blocks() as demo:
    gr.Markdown("## π Ask Questions from PDF + YouTube Transcript or .txt Upload")

    # Source material, laid out side by side.
    with gr.Row():
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
        )
        yt_input = gr.Textbox(
            label="YouTube URL (Optional)",
            placeholder="https://www.youtube.com/watch?v=...",
        )
        txt_input = gr.File(
            label="Upload Transcript .txt (Optional fallback)",
            file_types=[".txt"],
        )

    # Question in, answer out.
    query_input = gr.Textbox(
        label="Your Question",
        placeholder="e.g., What did the document say about X?",
    )
    output = gr.Textbox(label="Answer")

    # The input order here must match process_inputs' parameter order.
    run_button = gr.Button("Get Answer")
    run_button.click(
        fn=process_inputs,
        inputs=[pdf_input, yt_input, txt_input, query_input],
        outputs=output,
    )


# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|