Delete rag_demo

- rag_demo/__init__.py +0 -3
- rag_demo/__pycache__/__init__.cpython-311.pyc +0 -0
- rag_demo/__pycache__/pipeline.cpython-311.pyc +0 -0
- rag_demo/__pycache__/settings.cpython-311.pyc +0 -0
- rag_demo/app.py +0 -81
- rag_demo/data/test.pdf +0 -0
- rag_demo/data/test2.pdf +0 -3
- rag_demo/infra/__pycache__/qdrant.cpython-311.pyc +0 -0
- rag_demo/infra/qdrant.py +0 -25
- rag_demo/pipeline.py +0 -13
- rag_demo/preprocessing/__init__.py +0 -5
- rag_demo/preprocessing/__pycache__/__init__.cpython-311.pyc +0 -0
- rag_demo/preprocessing/__pycache__/chunking.cpython-311.pyc +0 -0
- rag_demo/preprocessing/__pycache__/embed.cpython-311.pyc +0 -0
- rag_demo/preprocessing/__pycache__/load_to_vectordb.cpython-311.pyc +0 -0
- rag_demo/preprocessing/__pycache__/pdf_conversion.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/__init__.py +0 -12
- rag_demo/preprocessing/base/__pycache__/__init__.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/__pycache__/chunk.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/__pycache__/document.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/__pycache__/embedded_chunk.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/__pycache__/vectordb.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/chunk.py +0 -13
- rag_demo/preprocessing/base/document.py +0 -19
- rag_demo/preprocessing/base/embedded_chunk.py +0 -34
- rag_demo/preprocessing/base/embeddings.py +0 -45
- rag_demo/preprocessing/base/vectordb.py +0 -289
- rag_demo/preprocessing/chunking.py +0 -26
- rag_demo/preprocessing/embed.py +0 -57
- rag_demo/preprocessing/load_to_vectordb.py +0 -30
- rag_demo/preprocessing/pdf_conversion.py +0 -33
- rag_demo/rag/__pycache__/prompt_templates.cpython-311.pyc +0 -0
- rag_demo/rag/__pycache__/query_expansion.cpython-311.pyc +0 -0
- rag_demo/rag/__pycache__/reranker.cpython-311.pyc +0 -0
- rag_demo/rag/__pycache__/retriever.cpython-311.pyc +0 -0
- rag_demo/rag/base/__init__.py +0 -3
- rag_demo/rag/base/__pycache__/__init__.cpython-311.pyc +0 -0
- rag_demo/rag/base/__pycache__/query.cpython-311.pyc +0 -0
- rag_demo/rag/base/__pycache__/template_factory.cpython-311.pyc +0 -0
- rag_demo/rag/base/base.py +0 -22
- rag_demo/rag/base/query.py +0 -29
- rag_demo/rag/base/template_factory.py +0 -22
- rag_demo/rag/prompt_templates.py +0 -38
- rag_demo/rag/query_expansion.py +0 -39
- rag_demo/rag/reranker.py +0 -24
- rag_demo/rag/retriever.py +0 -133
- rag_demo/settings.py +0 -40
- rag_demo/static/Matriv-white.png +0 -0
- rag_demo/templates/chat.html +0 -333
- rag_demo/templates/upload.html +0 -193
rag_demo/__init__.py
DELETED
@@ -1,3 +0,0 @@
-from .infra.qdrant import connection
-
-__all__ = ["connection"]
rag_demo/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (177 Bytes)
rag_demo/__pycache__/pipeline.cpython-311.pyc
DELETED
Binary file (738 Bytes)
rag_demo/__pycache__/settings.cpython-311.pyc
DELETED
Binary file (1.99 kB)
rag_demo/app.py
DELETED
@@ -1,81 +0,0 @@
-from fastapi import FastAPI, File, UploadFile, Request
-from fastapi.templating import Jinja2Templates
-from fastapi.responses import HTMLResponse
-from fastapi.staticfiles import StaticFiles
-from pydantic import BaseModel
-import os
-from pipeline import process_pdf
-import nest_asyncio
-from rag.retriever import RAGPipeline
-from loguru import logger
-
-app = FastAPI()
-
-# Apply nest_asyncio at the start of the application
-nest_asyncio.apply()
-
-# Serve Jinja2 templates from the templates/ directory
-templates = Jinja2Templates(directory="templates")
-
-app.mount("/static", StaticFiles(directory="static"), name="static")
-
-
-class ChatRequest(BaseModel):
-    question: str
-
-
-@app.get("/", response_class=HTMLResponse)
-async def upload_page(request: Request):
-    return templates.TemplateResponse("upload.html", {"request": request})
-
-
-@app.get("/chat", response_class=HTMLResponse)
-async def chat_page(request: Request):
-    return templates.TemplateResponse("chat.html", {"request": request})
-
-
-@app.post("/upload")
-async def upload_pdf(request: Request, file: UploadFile = File(...)):
-    try:
-        # Create the data directory if it doesn't exist
-        os.makedirs("data", exist_ok=True)
-
-        file_path = f"data/{file.filename}"
-        with open(file_path, "wb") as buffer:
-            content = await file.read()
-            buffer.write(content)
-
-        # Process the PDF file (process_pdf is a coroutine, so it is awaited)
-        await process_pdf(file_path)
-
-        # Return template response with success message
-        return templates.TemplateResponse(
-            "upload.html",
-            {
-                "request": request,
-                "message": f"Successfully processed {file.filename}",
-                "processing": False,
-            },
-        )
-    except Exception as e:
-        return templates.TemplateResponse(
-            "upload.html", {"request": request, "error": str(e), "processing": False}
-        )
-
-
-@app.post("/chat")
-async def chat(chat_request: ChatRequest):
-    rag_pipeline = RAGPipeline()
-    try:
-        answer = rag_pipeline.rag(chat_request.question)
-        print(answer)
-        logger.info(answer)
-        return {"answer": answer}
-    except Exception as e:
-        return {"error": str(e)}
-
-
-if __name__ == "__main__":
-    import uvicorn
-
-    uvicorn.run(app, host="0.0.0.0", port=7860)
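A quick way to exercise the two endpoints above once the app is running on port 7860 (a sketch, not part of the commit; assumes the requests package is installed and a local PDF exists at data/test.pdf):

import requests

# Upload a PDF for ingestion; the multipart field name must be "file".
with open("data/test.pdf", "rb") as f:
    requests.post("http://localhost:7860/upload", files={"file": f})

# Ask a question; the JSON body must match the ChatRequest model.
resp = requests.post(
    "http://localhost:7860/chat",
    json={"question": "What is this document about?"},
)
print(resp.json())  # {"answer": [markdown_answer, [source_name, ...]]} on success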
rag_demo/data/test.pdf
DELETED
Binary file (344 kB)
rag_demo/data/test2.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b3041eb7dd274b02a2f18049891dc3f184dff4151796f225b92cd34d676ba923
-size 1962780
rag_demo/infra/__pycache__/qdrant.cpython-311.pyc
DELETED
Binary file (1.32 kB)
rag_demo/infra/qdrant.py
DELETED
@@ -1,25 +0,0 @@
-from loguru import logger
-from qdrant_client import QdrantClient
-from qdrant_client.http.exceptions import UnexpectedResponse
-
-
-class QdrantDatabaseConnector:
-    _instance: QdrantClient | None = None
-
-    def __new__(cls, *args, **kwargs) -> QdrantClient:
-        if cls._instance is None:
-            try:
-                cls._instance = QdrantClient(":memory:")
-
-                logger.info("Connection to in-memory Qdrant DB successful")
-            except Exception:
-                logger.exception(
-                    "Couldn't connect to Qdrant.",
-                )
-
-                raise
-
-        return cls._instance
-
-
-connection = QdrantDatabaseConnector()
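Because __new__ caches the client on the class, every call to QdrantDatabaseConnector() returns the same in-memory QdrantClient. A minimal sketch of that behavior (assumes qdrant-client is installed):

from rag_demo.infra.qdrant import QdrantDatabaseConnector, connection

a = QdrantDatabaseConnector()
b = QdrantDatabaseConnector()
assert a is b is connection  # one shared client per process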
rag_demo/pipeline.py
DELETED
@@ -1,13 +0,0 @@
-from preprocessing import (
-    convert_pdf_to_text,
-    load_to_vector_db,
-    chunk_and_embed,
-)
-from loguru import logger
-
-
-async def process_pdf(file_path: str):  # async so app.py can await it
-    convert = convert_pdf_to_text([file_path])
-    embedded_chunks = chunk_and_embed([convert])
-    load_to_vector_db(embedded_chunks)
-    return True
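process_pdf chains the three preprocessing steps: parse the PDF, chunk and embed the text, load the vectors. Since app.py awaits it, it is defined as a coroutine; outside the server it can be driven like this (a sketch; run from inside rag_demo/ as app.py is, with a PDF at the given path):

import asyncio

from pipeline import process_pdf

asyncio.run(process_pdf("data/test.pdf"))  # parse -> chunk & embed -> load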
rag_demo/preprocessing/__init__.py
DELETED
@@ -1,5 +0,0 @@
-from .pdf_conversion import convert_pdf_to_text
-from .load_to_vectordb import load_to_vector_db
-from .embed import chunk_and_embed
-
-__all__ = ["convert_pdf_to_text", "load_to_vector_db", "chunk_and_embed"]
rag_demo/preprocessing/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (441 Bytes)
rag_demo/preprocessing/__pycache__/chunking.cpython-311.pyc
DELETED
Binary file (1.25 kB)
rag_demo/preprocessing/__pycache__/embed.cpython-311.pyc
DELETED
Binary file (3.53 kB)
rag_demo/preprocessing/__pycache__/load_to_vectordb.cpython-311.pyc
DELETED
Binary file (2.39 kB)
rag_demo/preprocessing/__pycache__/pdf_conversion.cpython-311.pyc
DELETED
Binary file (1.8 kB)
rag_demo/preprocessing/base/__init__.py
DELETED
@@ -1,12 +0,0 @@
-from .document import Document, CleanedDocument
-from .chunk import Chunk
-from .embedded_chunk import EmbeddedChunk
-from .vectordb import VectorBaseDocument
-
-__all__ = [
-    "Document",
-    "CleanedDocument",
-    "Chunk",
-    "EmbeddedChunk",
-    "VectorBaseDocument",
-]
rag_demo/preprocessing/base/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (528 Bytes)
rag_demo/preprocessing/base/__pycache__/chunk.cpython-311.pyc
DELETED
Binary file (927 Bytes)
rag_demo/preprocessing/base/__pycache__/document.cpython-311.pyc
DELETED
Binary file (1.12 kB)
rag_demo/preprocessing/base/__pycache__/embedded_chunk.cpython-311.pyc
DELETED
Binary file (2.04 kB)
rag_demo/preprocessing/base/__pycache__/vectordb.cpython-311.pyc
DELETED
Binary file (16.7 kB)
rag_demo/preprocessing/base/chunk.py
DELETED
@@ -1,13 +0,0 @@
-from abc import ABC
-from typing import Optional
-
-from pydantic import UUID4, Field
-
-from .vectordb import VectorBaseDocument
-
-
-class Chunk(VectorBaseDocument, ABC):
-    content: str
-    document_id: UUID4
-    chunk_id: UUID4
-    metadata: dict = Field(default_factory=dict)
rag_demo/preprocessing/base/document.py
DELETED
@@ -1,19 +0,0 @@
-from abc import ABC
-from typing import Optional
-
-from pydantic import UUID4, BaseModel
-
-from .vectordb import VectorBaseDocument
-
-
-class CleanedDocument(VectorBaseDocument, ABC):
-    content: str
-    doc_id: UUID4
-    doc_title: str
-    # doc_url: str
-
-
-class Document(BaseModel):
-    text: str
-    document_id: UUID4
-    metadata: dict
rag_demo/preprocessing/base/embedded_chunk.py
DELETED
@@ -1,34 +0,0 @@
-from abc import ABC
-
-from pydantic import UUID4, Field
-
-
-from .vectordb import VectorBaseDocument
-
-
-class EmbeddedChunk(VectorBaseDocument, ABC):
-    content: str
-    embedding: list[float] | None
-    document_id: UUID4
-    chunk_id: UUID4
-    metadata: dict = Field(default_factory=dict)
-    similarity: float | None
-
-    @classmethod
-    def to_context(cls, chunks: list["EmbeddedChunk"]) -> str:
-        context = ""
-        for i, chunk in enumerate(chunks):
-            context += f"""
-            Chunk {i + 1}:
-            Type: {chunk.__class__.__name__}
-            Document ID: {chunk.document_id}
-            Chunk ID: {chunk.chunk_id}
-            Content: {chunk.content}\n
-            """
-
-        return context
-
-    class Config:
-        name = "embedded_documents"
-        category = "Document"
-        use_vector_index = True
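to_context renders retrieved chunks into the plain-text block fed to the answer prompt. A sketch with a hand-built chunk (hypothetical values; EmbeddedChunk declares no abstract methods, so it can be instantiated directly):

from uuid import uuid4

from rag_demo.preprocessing.base import EmbeddedChunk

chunk = EmbeddedChunk(
    content="Revenue grew 12% year over year.",  # hypothetical content
    embedding=None,
    document_id=uuid4(),
    chunk_id=uuid4(),
    similarity=None,
)
print(EmbeddedChunk.to_context([chunk]))  # "Chunk 1: ... Content: ..." block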
rag_demo/preprocessing/base/embeddings.py
DELETED
@@ -1,45 +0,0 @@
-from functools import cached_property
-from pathlib import Path
-from typing import Optional, ClassVar
-from threading import Lock
-
-
-
-
-class SingletonMeta(type):
-    """
-    This is a thread-safe implementation of Singleton.
-    """
-
-    _instances: ClassVar = {}
-
-    _lock: Lock = Lock()
-
-    """
-    We now have a lock object that will be used to synchronize threads during
-    first access to the Singleton.
-    """
-
-    def __call__(cls, *args, **kwargs):
-        """
-        Possible changes to the value of the `__init__` argument do not affect
-        the returned instance.
-        """
-        # Now, imagine that the program has just been launched. Since there's no
-        # Singleton instance yet, multiple threads can simultaneously pass the
-        # previous conditional and reach this point almost at the same time. The
-        # first of them will acquire lock and will proceed further, while the
-        # rest will wait here.
-        with cls._lock:
-            # The first thread to acquire the lock, reaches this conditional,
-            # goes inside and creates the Singleton instance. Once it leaves the
-            # lock block, a thread that might have been waiting for the lock
-            # release may then enter this section. But since the Singleton field
-            # is already initialized, the thread won't create a new object.
-            if cls not in cls._instances:
-                instance = super().__call__(*args, **kwargs)
-                cls._instances[cls] = instance
-
-            return cls._instances[cls]
-
-
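SingletonMeta makes instantiation idempotent per class: the first call constructs the instance under the lock, every later call returns the cached one. A sketch with a hypothetical class:

from rag_demo.preprocessing.base.embeddings import SingletonMeta


class EmbeddingModel(metaclass=SingletonMeta):  # hypothetical example class
    def __init__(self, model_id: str) -> None:
        self.model_id = model_id


first = EmbeddingModel("intfloat/multilingual-e5-large-instruct")
second = EmbeddingModel("ignored")  # constructor args after the first call are ignored
assert first is second and second.model_id == first.model_id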
rag_demo/preprocessing/base/vectordb.py
DELETED
@@ -1,289 +0,0 @@
-import uuid
-from abc import ABC
-from typing import Any, Callable, Dict, Generic, Type, TypeVar
-from uuid import UUID
-
-import numpy as np
-from loguru import logger
-from pydantic import UUID4, BaseModel, Field
-from qdrant_client.http import exceptions
-from qdrant_client.http.models import Distance, VectorParams
-from qdrant_client.models import CollectionInfo, PointStruct, Record
-
-
-from rag_demo import connection
-
-T = TypeVar("T", bound="VectorBaseDocument")
-
-EMBEDDING_SIZE = 1024
-
-
-class VectorBaseDocument(BaseModel, Generic[T], ABC):
-    id: UUID4 = Field(default_factory=uuid.uuid4)
-
-    def __eq__(self, value: object) -> bool:
-        if not isinstance(value, self.__class__):
-            return False
-
-        return self.id == value.id
-
-    def __hash__(self) -> int:
-        return hash(self.id)
-
-    @classmethod
-    def from_record(cls: Type[T], point: Record) -> T:
-        _id = UUID(point.id, version=4)
-        payload = point.payload or {}
-
-        attributes = {
-            "id": _id,
-            **payload,
-        }
-        if cls._has_class_attribute("embedding"):
-            attributes["embedding"] = point.vector or None
-
-        return cls(**attributes)
-
-    def to_point(self: T, **kwargs) -> PointStruct:
-        exclude_unset = kwargs.pop("exclude_unset", False)
-        by_alias = kwargs.pop("by_alias", True)
-
-        payload = self.model_dump(
-            exclude_unset=exclude_unset, by_alias=by_alias, **kwargs
-        )
-
-        _id = str(payload.pop("id"))
-        vector = payload.pop("embedding", {})
-        if vector and isinstance(vector, np.ndarray):
-            vector = vector.tolist()
-
-        return PointStruct(id=_id, vector=vector, payload=payload)
-
-    def model_dump(self: T, **kwargs) -> dict:
-        dict_ = super().model_dump(**kwargs)
-
-        dict_ = self._uuid_to_str(dict_)
-
-        return dict_
-
-    def _uuid_to_str(self, item: Any) -> Any:
-        if isinstance(item, dict):
-            for key, value in item.items():
-                if isinstance(value, UUID):
-                    item[key] = str(value)
-                elif isinstance(value, list):
-                    item[key] = [self._uuid_to_str(v) for v in value]
-                elif isinstance(value, dict):
-                    item[key] = {k: self._uuid_to_str(v) for k, v in value.items()}
-
-        return item
-
-    @classmethod
-    def bulk_insert(cls: Type[T], documents: list["VectorBaseDocument"]) -> bool:
-        try:
-            cls._bulk_insert(documents)
-            logger.info(
-                f"Successfully inserted {len(documents)} documents into {cls.get_collection_name()}"
-            )
-
-        except Exception as e:
-            logger.error(f"Error inserting documents: {e}")
-            logger.info(
-                f"Collection '{cls.get_collection_name()}' does not exist. Trying to create the collection and reinsert the documents."
-            )
-
-            cls.create_collection()
-
-            try:
-                cls._bulk_insert(documents)
-            except Exception as e:
-                logger.error(f"Error inserting documents: {e}")
-                logger.error(
-                    f"Failed to insert documents in '{cls.get_collection_name()}'."
-                )
-
-                return False
-
-        return True
-
-    @classmethod
-    def _bulk_insert(cls: Type[T], documents: list["VectorBaseDocument"]) -> None:
-        points = [doc.to_point() for doc in documents]
-
-        connection.upsert(collection_name=cls.get_collection_name(), points=points)
-
-    @classmethod
-    def bulk_find(
-        cls: Type[T], limit: int = 10, **kwargs
-    ) -> tuple[list[T], UUID | None]:
-        try:
-            documents, next_offset = cls._bulk_find(limit=limit, **kwargs)
-        except exceptions.UnexpectedResponse:
-            logger.error(
-                f"Failed to search documents in '{cls.get_collection_name()}'."
-            )
-
-            documents, next_offset = [], None
-
-        return documents, next_offset
-
-    @classmethod
-    def _bulk_find(
-        cls: Type[T], limit: int = 10, **kwargs
-    ) -> tuple[list[T], UUID | None]:
-        collection_name = cls.get_collection_name()
-
-        offset = kwargs.pop("offset", None)
-        offset = str(offset) if offset else None
-
-        records, next_offset = connection.scroll(
-            collection_name=collection_name,
-            limit=limit,
-            with_payload=kwargs.pop("with_payload", True),
-            with_vectors=kwargs.pop("with_vectors", False),
-            offset=offset,
-            **kwargs,
-        )
-        documents = [cls.from_record(record) for record in records]
-        if next_offset is not None:
-            next_offset = UUID(next_offset, version=4)
-
-        return documents, next_offset
-
-    @classmethod
-    def search(cls: Type[T], query_vector: list, limit: int = 10, **kwargs) -> list[T]:
-        try:
-            documents = cls._search(query_vector=query_vector, limit=limit, **kwargs)
-        except exceptions.UnexpectedResponse:
-            logger.error(
-                f"Failed to search documents in '{cls.get_collection_name()}'."
-            )
-
-            documents = []
-
-        return documents
-
-    @classmethod
-    def _search(cls: Type[T], query_vector: list, limit: int = 10, **kwargs) -> list[T]:
-        collection_name = cls.get_collection_name()
-        records = connection.search(
-            collection_name=collection_name,
-            query_vector=query_vector,
-            limit=limit,
-            with_payload=kwargs.pop("with_payload", True),
-            with_vectors=kwargs.pop("with_vectors", False),
-            **kwargs,
-        )
-        documents = [cls.from_record(record) for record in records]
-
-        return documents
-
-    @classmethod
-    def get_or_create_collection(cls: Type[T]) -> CollectionInfo:
-        collection_name = cls.get_collection_name()
-
-        try:
-            return connection.get_collection(collection_name=collection_name)
-        except exceptions.UnexpectedResponse:
-            use_vector_index = cls.get_use_vector_index()
-
-            collection_created = cls._create_collection(
-                collection_name=collection_name, use_vector_index=use_vector_index
-            )
-            if collection_created is False:
-                raise RuntimeError(
-                    f"Couldn't create collection {collection_name}"
-                ) from None
-
-            return connection.get_collection(collection_name=collection_name)
-
-    @classmethod
-    def create_collection(cls: Type[T]) -> bool:
-        collection_name = cls.get_collection_name()
-        use_vector_index = cls.get_use_vector_index()
-        logger.info(
-            f"Creating collection {collection_name} with use_vector_index={use_vector_index}"
-        )
-        return cls._create_collection(
-            collection_name=collection_name, use_vector_index=use_vector_index
-        )
-
-    @classmethod
-    def _create_collection(
-        cls, collection_name: str, use_vector_index: bool = True
-    ) -> bool:
-        if use_vector_index is True:
-            vectors_config = VectorParams(size=EMBEDDING_SIZE, distance=Distance.COSINE)
-        else:
-            vectors_config = {}
-
-        return connection.create_collection(
-            collection_name=collection_name, vectors_config=vectors_config
-        )
-
-    @classmethod
-    def get_collection_name(cls: Type[T]) -> str:
-        if not hasattr(cls, "Config") or not hasattr(cls.Config, "name"):
-            raise Exception(
-                f"The class {cls} should define a Config class with the 'name' property that reflects the collection's name."
-            )
-
-        return cls.Config.name
-
-    @classmethod
-    def get_use_vector_index(cls: Type[T]) -> bool:
-        if not hasattr(cls, "Config") or not hasattr(cls.Config, "use_vector_index"):
-            return True
-
-        return cls.Config.use_vector_index
-
-    @classmethod
-    def group_by_class(
-        cls: Type["VectorBaseDocument"], documents: list["VectorBaseDocument"]
-    ) -> Dict["VectorBaseDocument", list["VectorBaseDocument"]]:
-        return cls._group_by(documents, selector=lambda doc: doc.__class__)
-
-    @classmethod
-    def _group_by(
-        cls: Type[T], documents: list[T], selector: Callable[[T], Any]
-    ) -> Dict[Any, list[T]]:
-        grouped = {}
-        for doc in documents:
-            key = selector(doc)
-
-            if key not in grouped:
-                grouped[key] = []
-            grouped[key].append(doc)
-
-        return grouped
-
-    @classmethod
-    def collection_name_to_class(
-        cls: Type["VectorBaseDocument"], collection_name: str
-    ) -> type["VectorBaseDocument"]:
-        for subclass in cls.__subclasses__():
-            try:
-                if subclass.get_collection_name() == collection_name:
-                    return subclass
-            except Exception:
-                pass
-
-            try:
-                return subclass.collection_name_to_class(collection_name)
-            except ValueError:
-                continue
-
-        raise ValueError(f"No subclass found for collection name: {collection_name}")
-
-    @classmethod
-    def _has_class_attribute(cls: Type[T], attribute_name: str) -> bool:
-        if attribute_name in cls.__annotations__:
-            return True
-
-        for base in cls.__bases__:
-            if hasattr(base, "_has_class_attribute") and base._has_class_attribute(
-                attribute_name
-            ):
-                return True
-
-        return False
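Concrete models opt into a Qdrant collection by declaring an inner Config class, which get_collection_name and get_use_vector_index read (as EmbeddedChunk does above). A sketch with a hypothetical subclass:

from pydantic import Field

from rag_demo.preprocessing.base.vectordb import VectorBaseDocument


class ExampleNote(VectorBaseDocument):  # hypothetical, for illustration only
    content: str
    metadata: dict = Field(default_factory=dict)

    class Config:
        name = "example_notes"   # Qdrant collection backing this model
        use_vector_index = True  # 1024-dim cosine vectors (EMBEDDING_SIZE)


assert ExampleNote.get_collection_name() == "example_notes"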
rag_demo/preprocessing/chunking.py
DELETED
@@ -1,26 +0,0 @@
-from uuid import uuid4
-
-from langchain.text_splitter import MarkdownTextSplitter
-from .base import Chunk
-from .base import Document
-
-
-def chunk_text(
-    document: Document, chunk_size: int = 500, chunk_overlap: int = 50
-) -> list[Chunk]:
-    text_splitter = MarkdownTextSplitter(
-        chunk_size=chunk_size, chunk_overlap=chunk_overlap
-    )
-    chunks = text_splitter.split_text(document.text)
-    result = []
-    for chunk in chunks:
-        result.append(
-            Chunk(
-                content=chunk,
-                document_id=document.document_id,
-                chunk_id=uuid4(),
-                metadata=document.metadata,
-            )
-        )
-
-    return result
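chunk_text splits on markdown structure and copies the document's id and metadata onto every chunk. A usage sketch (assumes the package's dependencies, including langchain and qdrant-client, are installed):

from uuid import uuid4

from rag_demo.preprocessing.base import Document
from rag_demo.preprocessing.chunking import chunk_text

doc = Document(
    text="# Title\n\nSome body text that will be split into chunks...",
    document_id=uuid4(),
    metadata={"filename": "test.pdf"},
)
chunks = chunk_text(doc, chunk_size=500, chunk_overlap=50)
print(len(chunks), chunks[0].document_id == doc.document_id)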
rag_demo/preprocessing/embed.py
DELETED
@@ -1,57 +0,0 @@
-from typing_extensions import Annotated
-from typing import Generator
-from .base import Chunk
-from .base import EmbeddedChunk
-from .chunking import chunk_text
-from huggingface_hub import InferenceClient
-import os
-from dotenv import load_dotenv
-from uuid import uuid4
-from loguru import logger
-
-load_dotenv()
-
-
-def batch(list_: list, size: int) -> Generator[list, None, None]:
-    yield from (list_[i : i + size] for i in range(0, len(list_), size))
-
-
-def embed_chunks(chunks: list[Chunk]) -> list[EmbeddedChunk]:
-    api = InferenceClient(
-        model="intfloat/multilingual-e5-large-instruct",
-        token=os.getenv("HF_API_TOKEN"),
-    )
-    logger.info(f"Embedding {len(chunks)} chunks")
-    embedded_chunks = []
-    for chunk in chunks:
-        try:
-            embedded_chunks.append(
-                EmbeddedChunk(
-                    id=uuid4(),
-                    content=chunk.content,
-                    embedding=api.feature_extraction(chunk.content),
-                    document_id=chunk.document_id,
-                    chunk_id=chunk.chunk_id,
-                    metadata=chunk.metadata,
-                    similarity=None,
-                )
-            )
-        except Exception as e:
-            logger.error(f"Error embedding chunk: {e}")
-    logger.info(f"{len(embedded_chunks)} chunks embedded successfully")
-
-    return embedded_chunks
-
-
-def chunk_and_embed(
-    cleaned_documents: Annotated[list, "cleaned_documents"],
-) -> Annotated[list, "embedded_documents"]:
-    embedded_chunks = []
-    for document in cleaned_documents:
-        chunks = chunk_text(document)
-
-        for batched_chunks in batch(chunks, 10):
-            batched_embedded_chunks = embed_chunks(batched_chunks)
-            embedded_chunks.extend(batched_embedded_chunks)
-    logger.info(f"{len(embedded_chunks)} chunks embedded successfully")
-    return embedded_chunks
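The batch helper pages a list into fixed-size slices so at most ten chunks go to the embedding endpoint per call. Its behavior in isolation:

from rag_demo.preprocessing.embed import batch

print(list(batch([1, 2, 3, 4, 5], size=2)))  # [[1, 2], [3, 4], [5]]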
rag_demo/preprocessing/load_to_vectordb.py
DELETED
@@ -1,30 +0,0 @@
-from loguru import logger
-from typing_extensions import Annotated
-from typing import Generator
-
-from .base import VectorBaseDocument
-
-
-def batch(list_: list, size: int) -> Generator[list, None, None]:
-    yield from (list_[i : i + size] for i in range(0, len(list_), size))
-
-
-def load_to_vector_db(
-    documents: Annotated[list, "documents"],
-) -> Annotated[bool, "successful"]:
-    logger.info(f"Loading {len(documents)} documents into the vector database.")
-
-    grouped_documents = VectorBaseDocument.group_by_class(documents)
-    for document_class, documents in grouped_documents.items():
-        logger.info(f"Loading documents into {document_class.get_collection_name()}")
-        for documents_batch in batch(documents, size=4):
-            try:
-                document_class.bulk_insert(documents_batch)
-            except Exception as e:
-                logger.error(
-                    f"Failed to insert documents into {document_class.get_collection_name()}: {e}"
-                )
-
-                return False
-
-    return True
rag_demo/preprocessing/pdf_conversion.py
DELETED
@@ -1,33 +0,0 @@
-from llama_parse import LlamaParse
-from llama_index.core import SimpleDirectoryReader
-from uuid import uuid4
-from .base import Document
-from loguru import logger
-import os
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-# set up parser
-parser = LlamaParse(
-    api_key=os.getenv("LLAMA_PARSE_API_KEY"),  # read from the environment; never commit a hardcoded key
-    result_type="markdown",  # "markdown" and "text" are available
-)
-
-
-def convert_pdf_to_text(filepaths: list[str]) -> Document:
-    file_extractor = {".pdf": parser}
-    # use SimpleDirectoryReader to parse our file
-
-    documents = SimpleDirectoryReader(
-        input_files=filepaths, file_extractor=file_extractor
-    ).load_data()
-
-    logger.info(f"Converted {len(documents)} documents")
-
-    return Document(
-        document_id=uuid4(),
-        text=" ".join(document.text for document in documents),
-        metadata={"filename": filepaths[0].split("/")[-1]},
-    )
rag_demo/rag/__pycache__/prompt_templates.cpython-311.pyc
DELETED
Binary file (2.75 kB)
rag_demo/rag/__pycache__/query_expansion.cpython-311.pyc
DELETED
Binary file (2.4 kB)
rag_demo/rag/__pycache__/reranker.cpython-311.pyc
DELETED
Binary file (1.96 kB)
rag_demo/rag/__pycache__/retriever.cpython-311.pyc
DELETED
Binary file (8.21 kB)
rag_demo/rag/base/__init__.py
DELETED
@@ -1,3 +0,0 @@
-from .template_factory import PromptTemplateFactory
-
-__all__ = ["PromptTemplateFactory"]
rag_demo/rag/base/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (283 Bytes)
rag_demo/rag/base/__pycache__/query.cpython-311.pyc
DELETED
Binary file (2.08 kB)
rag_demo/rag/base/__pycache__/template_factory.cpython-311.pyc
DELETED
Binary file (1.64 kB)
rag_demo/rag/base/base.py
DELETED
@@ -1,22 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Any
-
-from langchain.prompts import PromptTemplate
-from pydantic import BaseModel
-
-from rag_demo.rag.base.query import Query
-
-
-class PromptTemplateFactory(ABC, BaseModel):
-    @abstractmethod
-    def create_template(self) -> PromptTemplate:
-        pass
-
-
-class RAGStep(ABC):
-    def __init__(self, mock: bool = False) -> None:
-        self._mock = mock
-
-    @abstractmethod
-    def generate(self, query: Query, *args, **kwargs) -> Any:
-        pass
rag_demo/rag/base/query.py
DELETED
@@ -1,29 +0,0 @@
-from pydantic import UUID4, Field
-
-from rag_demo.preprocessing.base import VectorBaseDocument
-
-
-class Query(VectorBaseDocument):
-    content: str
-    metadata: dict = Field(default_factory=dict)
-
-    class Config:
-        category = "query"
-
-    @classmethod
-    def from_str(cls, query: str) -> "Query":
-        return Query(content=query.strip("\n "))
-
-    def replace_content(self, new_content: str) -> "Query":
-        return Query(
-            id=self.id,
-            content=new_content,
-            metadata=self.metadata,
-        )
-
-
-class EmbeddedQuery(Query):
-    embedding: list[float]
-
-    class Config:
-        category = "query"
rag_demo/rag/base/template_factory.py
DELETED
@@ -1,22 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Any
-
-from langchain.prompts import PromptTemplate
-from pydantic import BaseModel
-
-from .query import Query
-
-
-class PromptTemplateFactory(ABC, BaseModel):
-    @abstractmethod
-    def create_template(self) -> PromptTemplate:
-        pass
-
-
-class RAGStep(ABC):
-    def __init__(self, mock: bool = False) -> None:
-        self._mock = mock
-
-    @abstractmethod
-    def generate(self, query: Query, *args, **kwargs) -> Any:
-        pass
rag_demo/rag/prompt_templates.py
DELETED
@@ -1,38 +0,0 @@
-from langchain.prompts import PromptTemplate
-
-from .base import PromptTemplateFactory
-
-
-class QueryExpansionTemplate(PromptTemplateFactory):
-    prompt: str = """You are an AI language model assistant. Your task is to generate {expand_to_n}
-    different versions of the given user question to retrieve relevant documents from a vector
-    database. By generating multiple perspectives on the user question, your goal is to help
-    the user overcome some of the limitations of the distance-based similarity search.
-    Provide these alternative questions separated by '{separator}'.
-    Original question: {question}"""
-
-    @property
-    def separator(self) -> str:
-        return "#next-question#"
-
-    def create_template(self, expand_to_n: int) -> PromptTemplate:
-        return PromptTemplate(
-            template=self.prompt,
-            input_variables=["question"],
-            partial_variables={
-                "separator": self.separator,
-                "expand_to_n": expand_to_n,
-            },
-        )
-
-
-class AnswerGenerationTemplate(PromptTemplateFactory):
-    prompt: str = """You are an AI language model assistant. Your task is to generate an answer to the given user question based on the provided context.
-    Context: {context}
-    Question: {question}
-
-    Give your answer in markdown format.
-    Give only your answer, do not include any other text like 'Certainly! Here is the answer:' or 'The answer is:' or anything similar."""
-
-    def create_template(self, context: str, question: str) -> str:
-        return self.prompt.format(context=context, question=question)
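QueryExpansionTemplate.create_template partially binds separator and expand_to_n, leaving only question for the caller. A sketch:

from rag_demo.rag.prompt_templates import QueryExpansionTemplate

factory = QueryExpansionTemplate()
prompt = factory.create_template(expand_to_n=3)
print(prompt.format(question="What is covered in the uploaded PDF?"))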
rag_demo/rag/query_expansion.py
DELETED
@@ -1,39 +0,0 @@
-import os
-from typing import Any
-
-from huggingface_hub import InferenceClient
-
-from .base.query import Query
-from .base.template_factory import RAGStep
-from .prompt_templates import QueryExpansionTemplate
-
-
-class QueryExpansion(RAGStep):
-    def generate(self, query: Query, expand_to_n: int) -> Any:
-        api = InferenceClient(
-            model="Qwen/Qwen2.5-72B-Instruct",
-            token=os.getenv("HF_API_TOKEN"),
-        )
-        query_expansion_template = QueryExpansionTemplate()
-        prompt = query_expansion_template.create_template(expand_to_n - 1)
-        response = api.chat_completion(
-            [
-                {
-                    "role": "user",
-                    "content": prompt.template.format(
-                        question=query.content,
-                        expand_to_n=expand_to_n,
-                        separator=query_expansion_template.separator,
-                    ),
-                }
-            ]
-        )
-        result = response.choices[0].message.content
-        queries_content = result.split(query_expansion_template.separator)
-        queries = [query]
-        queries += [
-            query.replace_content(stripped_content)
-            for content in queries_content
-            if (stripped_content := content.strip())
-        ]
-        return queries
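The expansion step asks the chat model for expand_to_n - 1 rewrites joined by '#next-question#', then keeps the original query at position 0. A sketch (assumes HF_API_TOKEN is set; this makes a live API call):

from rag_demo.rag.base.query import Query
from rag_demo.rag.query_expansion import QueryExpansion

queries = QueryExpansion().generate(
    Query.from_str("What are the key findings?"), expand_to_n=3
)
print([q.content for q in queries])  # original question + 2 rewrites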
rag_demo/rag/reranker.py
DELETED
@@ -1,24 +0,0 @@
-import os
-
-from huggingface_hub import InferenceClient
-
-from .base.query import Query
-from .base.template_factory import RAGStep
-from ..preprocessing.base import EmbeddedChunk
-
-
-class Reranker(RAGStep):
-    def generate(
-        self, query: Query, chunks: list[EmbeddedChunk], keep_top_k: int
-    ) -> list[EmbeddedChunk]:
-        api = InferenceClient(
-            model="intfloat/multilingual-e5-large-instruct",
-            token=os.getenv("HF_API_TOKEN"),
-        )
-        similarity = api.sentence_similarity(
-            query.content, [chunk.content for chunk in chunks]
-        )
-        for chunk, sim in zip(chunks, similarity):
-            chunk.similarity = sim
-
-        return sorted(chunks, key=lambda x: x.similarity, reverse=True)[:keep_top_k]
rag_demo/rag/retriever.py
DELETED
@@ -1,133 +0,0 @@
-import concurrent.futures
-import os
-
-from loguru import logger
-from qdrant_client.models import FieldCondition, Filter, MatchValue
-from huggingface_hub import InferenceClient
-
-from ..preprocessing.base import (
-    EmbeddedChunk,
-)
-from .base.query import EmbeddedQuery, Query
-
-from .query_expansion import QueryExpansion
-from .reranker import Reranker
-from .prompt_templates import AnswerGenerationTemplate
-
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-def flatten(nested_list: list) -> list:
-    """Flatten a list of lists into a single list."""
-
-    return [item for sublist in nested_list for item in sublist]
-
-
-class RAGPipeline:
-    def __init__(self, mock: bool = False) -> None:
-        self._query_expander = QueryExpansion(mock=mock)
-        self._reranker = Reranker(mock=mock)
-
-    def search(
-        self,
-        query: str,
-        k: int = 3,
-        expand_to_n_queries: int = 3,
-    ) -> list:
-        query_model = Query.from_str(query)
-
-        n_generated_queries = self._query_expander.generate(
-            query_model, expand_to_n=expand_to_n_queries
-        )
-        logger.info(
-            f"Successfully generated {len(n_generated_queries)} search queries.",
-        )
-
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            search_tasks = [
-                executor.submit(self._search, _query_model, k)
-                for _query_model in n_generated_queries
-            ]
-
-            n_k_documents = [
-                task.result() for task in concurrent.futures.as_completed(search_tasks)
-            ]
-            n_k_documents = flatten(n_k_documents)
-            n_k_documents = list(set(n_k_documents))
-
-        logger.info(f"{len(n_k_documents)} documents retrieved successfully")
-
-        if len(n_k_documents) > 0:
-            k_documents = self.rerank(query, chunks=n_k_documents, keep_top_k=k)
-        else:
-            k_documents = []
-
-        return k_documents
-
-    def _search(self, query: Query, k: int = 3) -> list[EmbeddedChunk]:
-        assert k >= 3, "k should be >= 3"
-
-        def _search_data(
-            data_category_odm: type[EmbeddedChunk], embedded_query: EmbeddedQuery
-        ) -> list[EmbeddedChunk]:
-            return data_category_odm.search(
-                query_vector=embedded_query.embedding,
-                limit=k,
-            )
-
-        api = InferenceClient(
-            model="intfloat/multilingual-e5-large-instruct",
-            token=os.getenv("HF_API_TOKEN"),
-        )
-        embedded_query: EmbeddedQuery = EmbeddedQuery(
-            embedding=api.feature_extraction(query.content),
-            id=query.id,
-            content=query.content,
-        )
-
-        retrieved_chunks = _search_data(EmbeddedChunk, embedded_query)
-        logger.info(f"{len(retrieved_chunks)} documents retrieved successfully")
-
-        return retrieved_chunks
-
-    def rerank(
-        self, query: str | Query, chunks: list[EmbeddedChunk], keep_top_k: int
-    ) -> list[EmbeddedChunk]:
-        if isinstance(query, str):
-            query = Query.from_str(query)
-
-        reranked_documents = self._reranker.generate(
-            query=query, chunks=chunks, keep_top_k=keep_top_k
-        )
-
-        logger.info(f"{len(reranked_documents)} documents reranked successfully.")
-
-        return reranked_documents
-
-    def generate_answer(self, query: str, reranked_chunks: list[EmbeddedChunk]) -> str:
-        context = ""
-        for chunk in reranked_chunks:
-            context += "\n Document: "
-            context += chunk.content
-        api = InferenceClient(
-            model="meta-llama/Llama-3.1-8B-Instruct",
-            token=os.getenv("HF_API_TOKEN"),
-        )
-        answer_generation_template = AnswerGenerationTemplate()
-        prompt = answer_generation_template.create_template(context, query)
-        logger.info(prompt)
-        response = api.chat_completion(
-            [{"role": "user", "content": prompt}],
-            max_tokens=8192,
-        )
-        return response.choices[0].message.content
-
-    def rag(self, query: str) -> tuple[str, list[str]]:
-        docs = self.search(query, k=10)
-        reranked_docs = self.rerank(query, docs, keep_top_k=10)
-        return (
-            self.generate_answer(query, reranked_docs),
-            [doc.metadata["filename"].split(".pdf")[0] for doc in reranked_docs],
-        )
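End to end, rag expands the query, searches each variant in a thread pool, de-duplicates, reranks, and generates a markdown answer plus source filenames. A usage sketch (assumes HF_API_TOKEN is set and documents were already ingested):

from rag_demo.rag.retriever import RAGPipeline

pipeline = RAGPipeline()
answer, sources = pipeline.rag("What does the uploaded report conclude?")
print(answer)   # markdown answer
print(sources)  # PDF filenames the answer was drawn from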
rag_demo/settings.py
DELETED
@@ -1,40 +0,0 @@
-from loguru import logger
-from pydantic_settings import BaseSettings, SettingsConfigDict
-
-
-class Settings(BaseSettings):
-    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
-
-    # Huggingface API
-    HF_API_KEY: str | None = None
-
-    # LlamaParse API
-    LLAMA_PARSE_API_KEY: str | None = None
-
-    # Qdrant vector database
-    USE_QDRANT_CLOUD: bool = False
-    QDRANT_DATABASE_HOST: str = "localhost"
-    QDRANT_DATABASE_PORT: int = 6333
-    QDRANT_CLOUD_URL: str = ""
-    QDRANT_APIKEY: str | None = None
-
-    # RAG
-    TEXT_EMBEDDING_MODEL_ID: str = "sentence-transformers/all-MiniLM-L6-v2"
-    RERANKING_CROSS_ENCODER_MODEL_ID: str = "cross-encoder/ms-marco-MiniLM-L-4-v2"
-    RAG_MODEL_DEVICE: str = "cpu"
-
-    @classmethod
-    def load_settings(cls) -> "Settings":
-        """
-        Initializes the settings from the .env file and default values.
-
-        Returns:
-            Settings: The initialized settings object.
-        """
-
-        settings = Settings()
-
-        return settings
-
-
-settings = Settings.load_settings()
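Settings reads matching variables from .env; in pydantic-settings, values already in the process environment take precedence over the env file. A sketch of overriding one field:

import os

os.environ["QDRANT_DATABASE_PORT"] = "7000"  # wins over .env and the default

from rag_demo.settings import Settings

print(Settings().QDRANT_DATABASE_PORT)  # 7000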
rag_demo/static/Matriv-white.png
DELETED
Binary file (6.5 kB)
rag_demo/templates/chat.html
DELETED
@@ -1,333 +0,0 @@
-<!DOCTYPE html>
-<html>
-
-<head>
-    <title>RAG Chatbot</title>
-    <style>
-        :root {
-            --primary-color: #a0a0a0;
-            --background-color: #1a1a1a;
-            --card-background: #2d2d2d;
-            --text-color: #e0e0e0;
-            --border-radius: 6px;
-            --shadow: 0 4px 6px rgba(0, 0, 0, 0.3);
-            --input-background: #363636;
-            --input-border: #404040;
-        }
-
-        body {
-            font-family: 'Segoe UI', Arial, sans-serif;
-            max-width: 1200px;
-            margin: 0 auto;
-            padding: 20px;
-            background-color: var(--background-color);
-            color: var(--text-color);
-        }
-
-        .card {
-            background: var(--card-background);
-            border-radius: var(--border-radius);
-            box-shadow: var(--shadow);
-            padding: 2rem;
-            margin: 2rem 0;
-        }
-
-        .chat-container {
-            background: var(--card-background);
-            border-radius: var(--border-radius);
-            padding: 1.5rem;
-            height: 700px;
-            overflow-y: auto;
-            margin-bottom: 1.5rem;
-            box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.05);
-            border: 1px solid var(--input-border);
-        }
-
-        .message {
-            margin-bottom: 1rem;
-            padding: 1rem;
-            border-radius: 4px;
-            max-width: 70%;
-            animation: fadeIn 0.3s ease;
-        }
-
-        @keyframes fadeIn {
-            from {
-                opacity: 0;
-                transform: translateY(10px);
-            }
-
-            to {
-                opacity: 1;
-                transform: translateY(0);
-            }
-        }
-
-        .user-message {
-            background-color: #808080;
-            margin-left: auto;
-            color: #ffffff;
-            box-shadow: 0 2px 4px rgba(128, 128, 128, 0.2);
-        }
-
-        .bot-message {
-            background-color: #363636;
-            margin-right: auto;
-            color: #e0e0e0;
-            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
-        }
-
-        .input-container {
-            display: flex;
-            gap: 12px;
-            padding: 1rem;
-            background: var(--card-background);
-            border-radius: var(--border-radius);
-            box-shadow: var(--shadow);
-        }
-
-        .nav {
-            background: var(--card-background);
-            padding: 1rem;
-            border-radius: var(--border-radius);
-            box-shadow: var(--shadow);
-            margin-bottom: 1rem;
-        }
-
-        .nav a {
-            margin-right: 20px;
-            text-decoration: none;
-            color: var(--primary-color);
-            font-weight: 500;
-            padding: 0.5rem 1rem;
-            border-radius: 4px;
-            transition: all 0.3s ease;
-        }
-
-        .nav a:hover {
-            background: #363636;
-        }
-
-        #messageInput {
-            flex-grow: 1;
-            padding: 12px;
-            border: 2px solid var(--input-border);
-            border-radius: 4px;
-            font-size: 1rem;
-            transition: all 0.3s ease;
-            background: var(--input-background);
-            color: var(--text-color);
-        }
-
-        #messageInput:focus {
-            outline: none;
-            border-color: var(--primary-color);
-            box-shadow: 0 0 0 3px rgba(114, 137, 218, 0.1);
-        }
-
-        button {
-            background: var(--primary-color);
-            color: white;
-            border: none;
-            padding: 12px 24px;
-            border-radius: 4px;
-            cursor: pointer;
-            font-size: 1rem;
-            transition: all 0.3s ease;
-        }
-
-        button:hover {
-            background: #909090;
-            transform: translateY(-2px);
-        }
-
-        h1 {
-            color: var(--primary-color);
-            text-align: center;
-            margin-bottom: 1.5rem;
-        }
-
-        /* Scrollbar styling */
-        .chat-container::-webkit-scrollbar {
-            width: 8px;
-        }
-
-        .chat-container::-webkit-scrollbar-track {
-            background: #363636;
-        }
-
-        .chat-container::-webkit-scrollbar-thumb {
-            background: #4a4a4a;
-        }
-
-        .chat-container::-webkit-scrollbar-thumb:hover {
-            background: #5a5a5a;
-        }
-
-        /* Add these new styles */
-        .main-container {
-            display: flex;
-            gap: 20px;
-            height: calc(100vh - 100px);
-            /* Adjust for nav and padding */
-        }
-
-        .chat-card {
-            flex: 3;
-            background: var(--card-background);
-            border-radius: var(--border-radius);
-            box-shadow: var(--shadow);
-            padding: 2rem;
-            margin: 1rem 0;
-            display: flex;
-            flex-direction: column;
-            height: fit-content;
-        }
-
-        .sources-card {
-            flex: 1;
-            background: var(--card-background);
-            border-radius: var(--border-radius);
-            box-shadow: var(--shadow);
-            padding: 2rem;
-            margin: 1rem 0;
-            min-width: 250px;
-            display: flex;
-            flex-direction: column;
-            height: auto;
-        }
-
-        .source-item {
-            padding: 10px;
-            margin-bottom: 10px;
-            background: var(--input-background);
-            border-radius: var(--border-radius);
-            font-size: 0.9rem;
-            border: 1px solid var(--input-border);
-        }
-
-        .sources-title {
-            color: var(--text-color);
-            font-size: 1.2rem;
-            margin-bottom: 1rem;
-            padding-bottom: 0.5rem;
-            border-bottom: 1px solid var(--input-border);
-        }
-
-        #sourcesContainer {
-            flex: 1;
-            overflow-y: auto;
-        }
-
-        .logo-container {
-            display: flex;
-            justify-content: center;
-            align-items: center;
-            margin-bottom: 1rem;
-        }
-    </style>
-</head>
-
-<body>
-    <div class="nav">
-        <a href="/">Upload</a>
-        <a href="/chat">Chat</a>
-    </div>
-    <div class="main-container">
-        <div class="chat-card">
-            <div class="logo-container">
-                <img src="./static/Matriv-white.png" alt="Matriv Logo" style="width: 100px; height: auto;">
-            </div>
-            <div class="chat-container" id="chatContainer">
-            </div>
-            <div class="input-container">
-                <input type="text" id="messageInput" placeholder="Type your message...">
-                <button onclick="sendMessage()">Send</button>
-            </div>
-        </div>
-        <div class="sources-card">
-            <h2 class="sources-title">Sources</h2>
-            <div id="sourcesContainer"></div>
-        </div>
-    </div>
-
-    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
-    <script>
-        const chatContainer = document.getElementById('chatContainer');
-        const messageInput = document.getElementById('messageInput');
-        const sourcesContainer = document.getElementById('sourcesContainer');
-
-        function addMessage(message, isUser) {
-            const messageDiv = document.createElement('div');
-            messageDiv.className = `message ${isUser ? 'user-message' : 'bot-message'}`;
-            messageDiv.textContent = message;
-            chatContainer.appendChild(messageDiv);
-            chatContainer.scrollTop = chatContainer.scrollHeight;
-        }
-
-        function updateSources(sources) {
-            sourcesContainer.innerHTML = '';
-            if (sources && sources.length > 0) {
-                sources.forEach(source => {
-                    const sourceDiv = document.createElement('div');
-                    sourceDiv.className = 'source-item';
-                    sourceDiv.textContent = source;
-                    sourcesContainer.appendChild(sourceDiv);
-                });
-            }
-        }
-
-        async function sendMessage() {
-            const message = messageInput.value.trim();
-            if (!message) return;
-
-            addMessage(message, true);
-            messageInput.value = '';
-
-            try {
-                const response = await fetch('/chat', {
-                    method: 'POST',
-                    headers: {
-                        'Content-Type': 'application/json',
-                    },
-                    body: JSON.stringify({ question: message }),
-                });
-
-                const data = await response.json();
-
-                if (data.error) {
-                    addMessage(data.error, false);
-                    return;
-                }
-
-                // Create a temporary div to render markdown
-                const tempDiv = document.createElement('div');
-                tempDiv.innerHTML = marked.parse(data.answer[0]);
-
-                // Create message div with markdown content
-                const messageDiv = document.createElement('div');
-                messageDiv.className = 'message bot-message';
-                messageDiv.innerHTML = tempDiv.innerHTML;
-
-                chatContainer.appendChild(messageDiv);
-                chatContainer.scrollTop = chatContainer.scrollHeight;
-
-                // Update sources if they exist in the response
-                if (data.answer[1]) {
-                    updateSources(data.answer[1]);
-                }
-            } catch (error) {
-                console.error('Error:', error);
-                addMessage('Sorry, there was an error processing your message.', false);
-            }
-        }
-
-        messageInput.addEventListener('keypress', function (e) {
-            if (e.key === 'Enter') {
-                sendMessage();
-            }
-        });
-    </script>
-</body>
-
-</html>
rag_demo/templates/upload.html
DELETED
@@ -1,193 +0,0 @@
-<!DOCTYPE html>
-<html>
-
-<head>
-    <title>PDF Upload</title>
-    <style>
-        :root {
-            --primary-color: #a0a0a0;
-            --background-color: #1a1a1a;
-            --card-background: #2d2d2d;
-            --text-color: #e0e0e0;
-            --border-radius: 12px;
-            --shadow: 0 4px 6px rgba(0, 0, 0, 0.3);
-        }
-
-        body {
-            font-family: 'Segoe UI', Arial, sans-serif;
-            max-width: 900px;
-            margin: 0 auto;
-            padding: 20px;
-            background-color: var(--background-color);
-            color: var(--text-color);
-        }
-
-        .card {
-            background: var(--card-background);
-            border-radius: var(--border-radius);
-            box-shadow: var(--shadow);
-            padding: 2rem;
-            margin: 2rem 0;
-        }
-
-        .upload-form {
-            border: 2px dashed #404040;
-            padding: 2rem;
-            text-align: center;
-            margin: 1.5rem 0;
-            border-radius: var(--border-radius);
-            background: #363636;
-            transition: all 0.3s ease;
-        }
-
-        .upload-form:hover {
-            border-color: var(--primary-color);
-            background: #404040;
-        }
-
-        .nav {
-            background: var(--card-background);
-            padding: 1rem;
-            border-radius: var(--border-radius);
-            box-shadow: var(--shadow);
-            margin-bottom: 2rem;
-        }
-
-        .nav a {
-            margin-right: 20px;
-            text-decoration: none;
-            color: var(--primary-color);
-            font-weight: 500;
-            padding: 0.5rem 1rem;
-            border-radius: 6px;
-            transition: all 0.3s ease;
-        }
-
-        .nav a:hover {
-            background: #363636;
-        }
-
-        h1 {
-            color: var(--primary-color);
-            text-align: center;
-            margin-bottom: 1.5rem;
-        }
-
-        input[type="file"] {
-            display: none;
-        }
-
-        .file-upload-label {
-            display: inline-block;
-            padding: 12px 24px;
-            background: var(--primary-color);
-            color: white;
-            border-radius: 6px;
-            cursor: pointer;
-            transition: all 0.3s ease;
-        }
-
-        .file-upload-label:hover {
-            background: #909090;
-        }
-
-        .selected-file {
-            margin-top: 1rem;
-            color: #b0b0b0;
-        }
-
-        button {
-            background: var(--primary-color);
-            color: white;
-            border: none;
-            padding: 12px 24px;
-            border-radius: 6px;
-            cursor: pointer;
-            font-size: 1rem;
-            transition: all 0.3s ease;
-            margin-top: 1rem;
-        }
-
-        button:hover {
-            background: #909090;
-            transform: translateY(-2px);
-        }
-
-        .status-message {
-            margin-top: 1rem;
-            padding: 1rem;
-            border-radius: 6px;
-            text-align: center;
-        }
-
-        .success {
-            background: #2e4a3d;
-            color: #7ee2b8;
-        }
-
-        .error {
-            background: #4a2e2e;
-            color: #e27e7e;
-        }
-
-        .loading-placeholder {
-            display: none;
-            margin-top: 1rem;
-            color: #b0b0b0;
-            animation: pulse 1.5s infinite;
-        }
-
-        @keyframes pulse {
-            0% {
-                opacity: 0.6;
-            }
-
-            50% {
-                opacity: 1;
-            }
-
-            100% {
-                opacity: 0.6;
-            }
-        }
-    </style>
-</head>
-
-<body>
-    <div class="nav">
-        <a href="/">Upload</a>
-        <a href="/chat">Chat</a>
-    </div>
-    <div class="card">
-        <h1>Upload Documents</h1>
-        <div class="upload-form">
-            <form action="/upload" method="post" enctype="multipart/form-data" id="uploadForm">
-                <label for="file-upload" class="file-upload-label">
-                    Choose PDF Files
-                </label>
-                <input id="file-upload" type="file" name="file" accept=".pdf" multiple onchange="updateFileName(this)">
-                <div id="selectedFile" class="selected-file"></div>
-                <div id="loadingPlaceholder" class="loading-placeholder">Processing file...</div>
-                <button type="submit" onclick="showLoading()">Upload</button>
-            </form>
-        </div>
-    </div>
-
-    <script>
-        function updateFileName(input) {
-            const fileNames = Array.from(input.files)
-                .map(file => file.name)
-                .join(', ');
-            document.getElementById('selectedFile').textContent = fileNames || 'No file selected';
-        }
-
-        function showLoading() {
-            if (document.getElementById('file-upload').files.length > 0) {
-                document.getElementById('selectedFile').style.display = 'none';
-                document.getElementById('loadingPlaceholder').style.display = 'block';
-            }
-        }
-    </script>
-</body>
-
-</html>