Spaces:

lazarr19
/

prompt-engine

Sleeping

App Files Files Community

Lazar Radojevic committed on Jul 27, 2024

Commit

268c7f9

1 Parent(s): b9115ea

copy from other repo

Browse files

Files changed (11) hide show

Dockerfile +23 -0
__init__.py +0 -0
poe/common-tasks.toml +52 -0
poetry.lock +0 -0
pyproject.toml +27 -0
run.py +56 -0
src/__init__.py +0 -0
src/prompt_loader.py +25 -0
src/search_engine.py +50 -0
src/similarity_scorer.py +34 -0
src/vectorizer.py +23 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,23 @@

# Use the official Python image from the Docker Hub
FROM python:3.10-slim
# Set the working directory in the container
WORKDIR /app
# Install Poetry (skip the pip cache so it is not baked into the image layer)
RUN pip install --no-cache-dir poetry
# Copy only the pyproject.toml and poetry.lock files to install dependencies first,
# so the dependency layer is reused when only application code changes
COPY pyproject.toml poetry.lock ./
# Install the main dependency group into the system interpreter (no virtualenv)
RUN poetry config virtualenvs.create false && poetry install --only=main
# Copy the rest of the application code to the working directory
COPY . .
# Expose the port FastAPI will run on
EXPOSE 7860
# Command to run the FastAPI application.
# Auto-reload is a development aid (it starts a file watcher and restarts the
# server on source changes), so it is not enabled in the container image.
CMD ["poetry", "run", "uvicorn", "run:app", "--host", "0.0.0.0", "--port", "7860"]

__init__.py ADDED Viewed

File without changes

poe/common-tasks.toml ADDED Viewed

	@@ -0,0 +1,52 @@

+# This file defines common tasks that most python projects can benefit from
+[tool.poe.tasks.format-isort]
+help = "Format code with isort"
+cmd  = "isort ."
+[tool.poe.tasks.format-black]
+help = "Format code with black"
+cmd  = "black ."
+[tool.poe.tasks.format]
+help = "Run code formatting tools"
+sequence  = ["format-isort", "format-black"]
+[tool.poe.tasks.style-black]
+help = "Validate black code style"
+cmd  = "black . --check --diff"
+[tool.poe.tasks.style-isort]
+help = "Validate isort code style"
+cmd  = "isort . --check --diff"
+[tool.poe.tasks.style]
+help = "Validate code style"
+sequence = ["style-isort", "style-black"]
+[tool.poe.tasks.types]
+help = "Run the type checker"
+cmd  = "mypy . --ignore-missing-imports --check-untyped-defs --install-types --non-interactive"
+[tool.poe.tasks.lint]
+help = "Evaluate ruff rules"
+cmd  = "ruff check ."
+[tool.poe.tasks.test]
+help = "Run unit tests"
+cmd  = "pytest -p no:cacheprovider"
+[tool.poe.tasks.clean]
+help = "Remove automatically generated files"
+cmd  = """
+  rm -rf dist
+         .mypy_cache
+         .pytest_cache
+         .ruff_cache
+         ./**/__pycache__/
+         ./**/*.pyc
+"""
+[tool.poe.tasks.check]
+help     = "Run all checks on the code base"
+sequence = ["style", "types", "lint", "clean"]

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,27 @@

+[tool.poetry]
+name = "smart-cat-assignment-backend"
+version = "0.0.1"
+description = "SmartCat Assignment"
+authors     = ["Lazar Radojevic <[email protected]>"]
+readme = "README.md"
+[tool.poetry.dependencies]
+python = "^3.10"
+mypy = "^1.8.0"
+ruff = "^0.3.2"
+datasets = "^2.20.0"
+sentence-transformers = "^3.0.1"
+numpy = "1.26.4"
+fastapi = "^0.111.1"
+uvicorn = "^0.30.3"
+[tool.poetry.group.dev.dependencies]
+black = "^24.1.1"
+poethepoet = "^0.24.4"
+isort = "^5.13.2"
+[tool.isort]
+profile = "black"
+[tool.poe]
+include = "./poe/common-tasks.toml"

run.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import List
+from src.search_engine import PromptSearchEngine
+from src.prompt_loader import PromptLoader
# Sampling configuration: the seed handed to the prompt loader and the number
# of prompts drawn into the searchable corpus.
SEED = 42
DATA_SIZE = 100

# Build the corpus and its search engine once, at import time, so that every
# request reuses the same engine instance.
prompts = PromptLoader(seed=SEED).load_data(size=DATA_SIZE)
engine = PromptSearchEngine(prompts)

# Application object served by uvicorn.
app = FastAPI()
# Request and Response Models


# Body of a POST /most_similar request: the raw query text and how many
# matches to return (five when the client does not specify).
class QueryRequest(BaseModel):
    query: str
    n: int = 5


# A single search hit: the similarity score and the matching corpus prompt.
class SimilarPrompt(BaseModel):
    score: float
    prompt: str


# Response envelope holding the list of hits.
class QueryResponse(BaseModel):
    similar_prompts: List[SimilarPrompt]
# API endpoint
@app.post("/most_similar", response_model=QueryResponse)
async def get_most_similar(query_request: QueryRequest):
    """Return the corpus prompts most similar to the submitted query.

    Args:
        query_request: The query text and the number of matches requested.

    Returns:
        A QueryResponse whose entries pair each similarity score with its
        prompt, in the order produced by the search engine.

    Raises:
        HTTPException: With status 500 and the error text as detail when the
            search or the response construction fails.
    """
    try:
        matches = engine.most_similar(
            query=query_request.query, n=query_request.n
        )
        return QueryResponse(
            similar_prompts=[
                # Scores may arrive as numpy scalars; hand pydantic a plain float.
                SimilarPrompt(score=float(score), prompt=prompt)
                for score, prompt in matches
            ]
        )
    except Exception as e:
        # Chain the original exception so the real traceback stays in the logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
# Run the server with: uvicorn run:app --reload
if __name__ == "__main__":
    import uvicorn

    # The import string must name this module (run.py); "main:app" would make
    # uvicorn look for a non-existent main.py and fail to load the application.
    uvicorn.run("run:app", host="0.0.0.0", port=8000, reload=True)

src/__init__.py ADDED Viewed

File without changes

src/prompt_loader.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from typing import Optional, List
+from datasets import load_dataset
+import random
class PromptLoader:
    """Load the prompt dataset lazily and draw seeded random samples from it."""

    def __init__(self, seed: int = 42) -> None:
        """Create a loader whose sampling is driven by a private seeded RNG.

        Args:
            seed: Seed for the random generator used when sampling prompts.
        """
        self.randomizer = random.Random(seed)
        # Filled on first use by _load_data(); None means "not loaded yet".
        self.data: Optional[List[str]] = None

    def _load_data(self) -> None:
        """Fetch the "prompt" column of the dataset's "train" split."""
        self.data = load_dataset("daspartho/stable-diffusion-prompts")["train"][
            "prompt"
        ]

    def load_data(self, size: Optional[int] = None) -> List[str]:
        """Return all prompts, or a random sample of ``size`` prompts.

        Args:
            size: Number of prompts to sample without replacement. ``None``
                returns the full dataset; ``0`` returns an empty list.

        Returns:
            The full prompt list, or a new list of ``size`` sampled prompts.

        Raises:
            ValueError: If ``size`` is negative or exceeds the number of
                available prompts.
        """
        # Compare against None so an empty-but-loaded dataset is not re-fetched.
        if self.data is None:
            self._load_data()
        assert self.data is not None  # narrowing for type checkers

        # Only None means "everything"; a plain truthiness test would wrongly
        # treat size=0 as a request for the whole dataset.
        if size is None:
            return self.data
        if size < 0:
            raise ValueError("Sample size must be non-negative!")
        if size > len(self.data):
            raise ValueError("Not enough samples available!")
        return self.randomizer.sample(self.data, size)

src/search_engine.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from typing import List, Sequence, Tuple
+from sentence_transformers import SentenceTransformer
+from src.vectorizer import Vectorizer
+from src.similarity_scorer import SimilarityScorer
class PromptSearchEngine:
    """Similarity search over a fixed corpus of prompts.

    The corpus is vectorized once at construction time; each query is
    vectorized with the same vectorizer and ranked by cosine similarity.
    """

    def __init__(self, prompts: Sequence[str]) -> None:
        """Initialize the search engine by vectorizing the prompt corpus.

        Args:
            prompts: The sequence of raw prompts from the dataset.
        """
        self.vectorizer = Vectorizer(SentenceTransformer("all-MiniLM-L6-v2"))
        self.scorer = SimilarityScorer()
        self.prompts = prompts
        self.embeddings = self.vectorizer.transform(prompts)

    def most_similar(self, query: str, n: int = 5) -> List[Tuple[float, str]]:
        """Return the top ``n`` most similar prompts from the corpus.

        Args:
            query: The raw query prompt input from the user.
            n: The number of similar prompts to return. Values larger than
                the corpus are capped at the corpus size; ``0`` yields an
                empty list.

        Returns:
            ``(score, prompt)`` pairs sorted by descending similarity. Scores
            are plain Python floats and prompts are returned verbatim.

        Raises:
            ValueError: If ``n`` is negative.
        """
        if n < 0:
            raise ValueError("n must be a non-negative integer.")

        query_embedding = self.vectorizer.transform(query)
        similarities = self.scorer.cosine_similarity(query_embedding, self.embeddings)
        # Flatten so a column-vector result still ranks as one score per prompt.
        similarities = similarities.reshape(-1)

        # Reverse first, then truncate: slicing with [-n:] would return the
        # entire corpus for n == 0, because -0 == 0.
        ranked = similarities.argsort()[::-1][:n]
        return [(float(similarities[i]), self.prompts[i]) for i in ranked]

src/similarity_scorer.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import numpy as np
class SimilarityScorer:
    """Cosine-similarity scoring of one query vector against a corpus matrix."""

    def cosine_similarity(
        self,
        query_vector: np.ndarray,
        corpus_vectors: np.ndarray,
    ) -> np.ndarray:
        """Calculate cosine similarity between a query and every corpus vector.

        Args:
            query_vector: Vectorized prompt query of shape (D,) or (1, D).
            corpus_vectors: Vectorized prompt corpus of shape (N, D). A single
                vector of shape (D,) is treated as a corpus of one.

        Returns:
            The vector of shape (N,) with values in range [-1, 1], where 1 is
            max similarity, i.e. the two vectors point in the same direction.

        Raises:
            ValueError: If the query is the zero vector, if the corpus contains
                a zero vector, or if the dimensions do not match.
        """
        # Flatten the query so that (D,) and (1, D) inputs both produce an
        # (N,) result; a 2-D query would otherwise yield shape (N, 1).
        query = np.asarray(query_vector).reshape(-1)
        corpus = np.atleast_2d(np.asarray(corpus_vectors))

        # Normalize the query vector
        query_norm = np.linalg.norm(query)
        if query_norm == 0:
            raise ValueError("The query vector cannot be zero.")
        unit_query = query / query_norm

        # Normalize the corpus vectors row by row
        corpus_norms = np.linalg.norm(corpus, axis=1)
        if np.any(corpus_norms == 0):
            raise ValueError("The corpus contains zero vectors.")
        unit_corpus = corpus / corpus_norms[:, np.newaxis]

        # Dot product of unit vectors is the cosine of the angle between them.
        similarities = np.dot(unit_corpus, unit_query)
        # Rounding can push values marginally outside the documented range.
        return np.clip(similarities, -1.0, 1.0)

src/vectorizer.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from typing import Sequence
+import numpy as np
class Vectorizer:
    """Thin adapter that turns prompts into vectors via an embedding model."""

    def __init__(self, model) -> None:
        """Store the pre-trained embedding model used by ``transform``.

        Args:
            model: The pre-trained embedding model; it must provide an
                ``encode`` method that accepts the prompts.
        """
        self.model = model

    def transform(self, prompts: Sequence[str]) -> np.ndarray:
        """Encode the given prompts with the wrapped model.

        Args:
            prompts: The sequence of raw corpus prompts.

        Returns:
            Whatever the model's ``encode`` produces for the prompts
            (annotated as a numpy array).
        """
        encoder = self.model
        embeddings = encoder.encode(prompts)
        return embeddings