# NOTE(review): the original header here was Hugging Face Spaces page-scrape
# residue (status banner, file size, commit hash, line-number gutter) — it was
# not Python source and broke parsing; replaced with this comment.
import os
import gradio as gr
from openai import OpenAI
import weaviate
from weaviate.classes.init import Auth
import pypdf # Replaced PyPDF2
import docx
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from prompt_template import (
Prompt_template_translation,
Prompt_template_LLM_Generation,
Prompt_template_Reranker,
Prompt_template_Wisal,
Prompt_template_Halluciations,
Prompt_template_paraphrasing,
Prompt_template_Translate_to_original,
Prompt_template_relevance
)
from query_utils import process_query_for_rewrite, get_non_autism_response
# ─── Configuration ─────────────────────────────────────────────────────────────
# Load secrets from the environment / .env file instead of hard-coding them.
# SECURITY: a previous revision embedded live API keys directly in source
# control — those keys must be considered compromised and rotated.
load_dotenv()

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY")
DEEPINFRA_BASE_URL = os.getenv("DEEPINFRA_BASE_URL", "https://api.deepinfra.com/v1/openai")

# DeepInfra exposes an OpenAI-compatible endpoint, so the OpenAI SDK client
# is reused with a custom base_url (previously the URL was duplicated inline).
openai = OpenAI(
    api_key=DEEPINFRA_API_KEY,
    base_url=DEEPINFRA_BASE_URL,
)

# Initialize Weaviate client.
# skip_init_checks disables the gRPC health check, which some managed
# clusters block and which would otherwise make startup fail spuriously.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    skip_init_checks=True,
)
# ─── Utility: Extract raw text ──────────────────────────────────────────────────
def extract_text(file_path: str) -> str:
    """Return the plain-text content of a PDF, DOCX, or TXT file.

    Args:
        file_path: Path to the document on disk.

    Returns:
        The extracted text (PDF pages are newline-separated).

    Raises:
        ValueError: If the file extension is not .pdf, .docx, or .txt.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix == ".pdf":
        with open(file_path, "rb") as handle:
            pages = pypdf.PdfReader(handle).pages
            # extract_text() may return None for image-only pages; treat as "".
            return "".join((page.extract_text() or "") + "\n" for page in pages)
    if suffix == ".docx":
        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    if suffix == ".txt":
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read()
    raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
# ─── Chunker & Embed ──────────────────────────────────────────────────────────
# Shared splitter: ~1000-character chunks with 200-character overlap so
# context straddling a chunk boundary is not lost; prefers paragraph, then
# line, then word boundaries when choosing a split point.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " "],
)
def embed_texts(texts: list[str], batch_size: int = 50) -> list[list[float]]:
    """Embed texts in batches to avoid API limits.

    Args:
        texts: Strings to embed.
        batch_size: Maximum number of inputs sent per API call.

    Returns:
        One embedding vector per input text, in the same order.
    """
    embeddings: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        response = openai.embeddings.create(
            model="Qwen/Qwen3-Embedding-8B",
            input=texts[start:start + batch_size],
            encoding_format="float",
        )
        embeddings.extend(item.embedding for item in response.data)
    return embeddings
# ─── Ingest & Index ───────────────────────────────────────────────────────────
def ingest_file(file_path: str) -> str:
    """Extract, chunk, embed, and index one document into the 'Books' collection.

    Args:
        file_path: Path to a PDF, DOCX, or TXT document.

    Returns:
        A human-readable status string for the UI.
    """
    raw = extract_text(file_path)
    # split_text already returns list[str]; the previous copy via a
    # comprehension was redundant.
    texts = splitter.split_text(raw)
    vectors = embed_texts(texts)
    # Batch insert with the v4 client API; the context manager flushes on exit.
    # (A stray, unused collections.get("Books") lookup was removed — add_object
    # targets the collection by name.)
    with client.batch.dynamic() as batch:
        for txt, vec in zip(texts, vectors):
            batch.add_object(
                collection="Books",
                properties={"text": txt},
                vector=vec,
            )
    return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"
# ─── Query & Answer ───────────────────────────────────────────────────────────
def answer_question(question: str) -> str:
    """Answer a user question from the 'Books' collection, restricted to autism topics.

    Pipeline: query rewrite + relevance gate -> vector retrieval (top 5)
    -> LLM generation -> answer-level autism-relevance gate.

    Args:
        question: Raw user question.

    Returns:
        The generated answer, or a canned refusal when the question or the
        generated answer is not sufficiently autism-related.
    """
    # Imported lazily (as in the original) to avoid import-order issues
    # with query_utils at module load time.
    from query_utils import check_answer_autism_relevance, get_non_autism_answer_response

    corrected_query, is_autism_related, rewritten_query = process_query_for_rewrite(question)
    # Reject out-of-scope questions before doing any retrieval work.
    if not is_autism_related:
        return get_non_autism_response()

    # Retrieve the 5 nearest chunks for the corrected query's embedding.
    q_vec = embed_texts([corrected_query])[0]
    documents = client.collections.get("Books")
    response = documents.query.near_vector(
        near_vector=q_vec,
        limit=5,
        return_metadata=["distance"],
    )
    context = "\n\n".join(hit.properties["text"] for hit in response.objects)
    # (A leftover debug print of the retrieved context was removed.)

    wisal_prompt = Prompt_template_Wisal.format(new_query=corrected_query, document=context)
    chat = openai.chat.completions.create(
        model="Qwen/Qwen3-32B",
        messages=[{"role": "user", "content": wisal_prompt}],
        temperature=0,
        reasoning_effort="none",
    )
    initial_answer = chat.choices[0].message.content

    # Gate the generated answer: refuse anything scoring below 50% relevance.
    if check_answer_autism_relevance(initial_answer) < 50:
        return get_non_autism_answer_response()
    return initial_answer
# ─── Gradio Interface ─────────────────────────────────────────────────────────
with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
    gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")
    with gr.Row():
        up = gr.File(label="Select document")
        btn = gr.Button("Ingest")
    out = gr.Textbox(label="Status", interactive=False)
    # Guard against clicking "Ingest" before a file is selected — the previous
    # lambda raised AttributeError on f.name when f was None.
    btn.click(
        fn=lambda f: ingest_file(f.name) if f is not None else "No file selected.",
        inputs=up,
        outputs=out,
    )
    with gr.Row():
        q = gr.Textbox(placeholder="Your question...", lines=2)
        ask = gr.Button("Ask")
    ans = gr.Textbox(label="Answer", lines=6, interactive=False)
    ask.click(fn=answer_question, inputs=q, outputs=ans)

if __name__ == "__main__":
    # debug=True enables verbose errors/hot reload; disable for production.
    # (A stray trailing "|" page-scrape artifact was removed from this line.)
    demo.launch(debug=True)