Spaces:

lpetrl
/

Test-API

Sleeping

App Files Files Community

lpetrl committed on Mar 21, 2024

Commit

94e8fb8

verified ·

1 Parent(s): 264a09d

feat(API): Implemented basic functionality.

Browse files

Files changed (21) hide show

Dockerfile +15 -0
data/ukrainian_nouns.txt +19 -0
database/MiniLM-L12-v.lance/_latest.manifest +0 -0
database/MiniLM-L12-v.lance/_transactions/0-5e898e5f-189c-473e-892d-8b5947b6a369.txn +1 -0
database/MiniLM-L12-v.lance/_transactions/1-9077628d-c42c-4413-8bd1-cc31ea726bce.txn +0 -0
database/MiniLM-L12-v.lance/_versions/1.manifest +0 -0
database/MiniLM-L12-v.lance/_versions/2.manifest +0 -0
database/MiniLM-L12-v.lance/data/a1b860b1-c7a3-4314-9c51-38ad78b5de8b.lance +0 -0
requirements.txt +6 -0
src/.DS_Store +0 -0
src/__pycache__/api_models.cpython-311.pyc +0 -0
src/__pycache__/app.cpython-311.pyc +0 -0
src/__pycache__/handlers.cpython-311.pyc +0 -0
src/__pycache__/setting.cpython-311.pyc +0 -0
src/__pycache__/vector_db.cpython-311.pyc +0 -0
src/api_models.py +38 -0
src/app.py +13 -0
src/create-embedding.py +25 -0
src/handlers.py +81 -0
src/setting.py +28 -0
src/vector_db.py +42 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,15 @@

+# Base image: the official CPython 3.11 image.
+FROM python:3.11
+# Copy only the dependency list first, ahead of the application sources.
+COPY requirements.txt ./requirements.txt
+# Upgrade pip, install the pinned dependencies, then purge pip's cache.
+RUN python -m pip install -U pip && \
+    python -m pip install -r requirements.txt && \
+    python -m pip cache purge
+# Vocabulary file, prebuilt LanceDB table and the application package.
+COPY ./data /app/data
+COPY ./database /app/database
+COPY ./src /app/src
+# The app uses relative paths ("data/...", "database"); they resolve against /app.
+WORKDIR /app
+# Serve the `app` object from src/app.py with uvicorn on all interfaces, port 7860.
+CMD ["uvicorn", "src.app:app", "--host", "0.0.0.0", "--port", "7860"]

data/ukrainian_nouns.txt ADDED Viewed

	@@ -0,0 +1,19 @@

+    лікар
+    програміст
+    пілот
+    літак
+    висота
+    тиск
+    барометр
+    вітер
+    медицина
+    комп'ютер
+    рука
+    око
+    ніс
+    книга
+    папір
+    олівець
+    Франція
+    Париж
+    Германія

database/MiniLM-L12-v.lance/_latest.manifest ADDED Viewed

Binary file (497 Bytes). View file

database/MiniLM-L12-v.lance/_transactions/0-5e898e5f-189c-473e-892d-8b5947b6a369.txn ADDED Viewed

	@@ -0,0 +1 @@


1	+ $5e898e5f-189c-473e-892d-8b5947b6a369�Uword ��string084vector ��fixed_size_list:float:38408

database/MiniLM-L12-v.lance/_transactions/1-9077628d-c42c-4413-8bd1-cc31ea726bce.txn ADDED Viewed

Binary file (97 Bytes). View file

database/MiniLM-L12-v.lance/_versions/1.manifest ADDED Viewed

Binary file (443 Bytes). View file

database/MiniLM-L12-v.lance/_versions/2.manifest ADDED Viewed

Binary file (497 Bytes). View file

database/MiniLM-L12-v.lance/data/a1b860b1-c7a3-4314-9c51-38ad78b5de8b.lance ADDED Viewed

Binary file (30.2 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+duckdb==0.10.1
+fastapi==0.110.0
+pandas==2.2.1
+lancedb==0.6.4
+sentence-transformers==2.5.1
+uvicorn==0.29.0

src/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

src/__pycache__/api_models.cpython-311.pyc ADDED Viewed

Binary file (2.32 kB). View file

src/__pycache__/app.cpython-311.pyc ADDED Viewed

Binary file (600 Bytes). View file

src/__pycache__/handlers.cpython-311.pyc ADDED Viewed

Binary file (3.7 kB). View file

src/__pycache__/setting.cpython-311.pyc ADDED Viewed

Binary file (1.33 kB). View file

src/__pycache__/vector_db.cpython-311.pyc ADDED Viewed

Binary file (2.91 kB). View file

src/api_models.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from pydantic import BaseModel, Field
+# Generic response envelope: a text message, a payload dict and a numeric code.
+# NOTE(review): no endpoint in src/handlers.py references this model — confirm
+# whether it is still needed.
+class ResponseModel(BaseModel):
+    message: str
+    data: dict
+    code: int
+# Body returned by the "get_guess_word" endpoint.
+class ResponseGuessWord(BaseModel):
+    # Word picked at random from the available vocabulary.
+    word: str
+# Input of the "semantic_calculation" endpoint. The handler receives it through
+# ``Depends(RequestSemanticCalculation)`` on a GET route.
+class RequestSemanticCalculation(BaseModel):
+    # Target word of the game round.
+    supposed_word: str = Field(
+        description="The word that the user is trying to guess",
+        example="ніс"
+    )
+    # The user's current attempt.
+    guessed_word: str = Field(
+        description="The word that the user guessed",
+        example="око"
+    )
+# Similarity details for one guess; built from the dict returned by
+# ``VectorDatabaseHandler.__call__``.
+class SemanticCalculation(BaseModel):
+    # Distance between the guessed word and the supposed word.
+    score: float
+    # Number of stored words that are closer to the guess than the supposed word.
+    rating: int
+    # 100 minus the share (in percent) of stored words counted in ``rating``.
+    percentage: float
+    # Nearest stored word other than the guess (the supposed word when rating is 0).
+    closest_word: str
+# Body returned by the "semantic_calculation" endpoint.
+class ResponseSemanticCalculation(BaseModel):
+    # False when the supposed word is not part of the available vocabulary.
+    word_exist: bool
+    # Similarity details; the handler sends None when ``word_exist`` is False.
+    metadata: SemanticCalculation | None
+# Plain text message; used for the status endpoint and for documented 500 errors.
+class ResponseMessage(BaseModel):
+    message: str

src/app.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from fastapi import FastAPI
+from src.handlers import router
+def get_application() -> FastAPI:
+    application = FastAPI()
+    application.include_router(router)
+    return application
+app = get_application()

src/create-embedding.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import os
+import pandas as pd
+import lancedb
+from lancedb.embeddings import with_embeddings
+from sentence_transformers import SentenceTransformer
+from setting import CFG, AVAILABLE_WORDS
+df = pd.DataFrame(AVAILABLE_WORDS, columns=['word'])
+model = SentenceTransformer(CFG.model.name)
+data = with_embeddings(
+    func=lambda texts: model.encode(texts),
+    data=df, column="word", show_progress=True
+)
+if not os.path.exists(CFG.db.lance_db_folder_path):
+    os.makedirs(CFG.db.lance_db_folder_path)
+db = lancedb.connect(CFG.db.lance_db_folder_path)
+table = db.create_table(CFG.db.table_name, data)
+print("Table created")

src/handlers.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import random
+from fastapi import APIRouter, Depends
+from fastapi.responses import JSONResponse
+from src.api_models import (
+    ResponseGuessWord, ResponseSemanticCalculation,
+    RequestSemanticCalculation, ResponseMessage,
+    SemanticCalculation
+)
+from src.setting import AVAILABLE_WORDS, CFG
+from src.vector_db import VectorDatabaseHandler
+# Router that collects every endpoint declared in this module.
+router = APIRouter()
+# Extra response documentation shared by all endpoints: a 500 answer is
+# described as carrying a ``ResponseMessage`` body.
+DEFAULT_RESPONSES = {
+    500: {"description": "Internal Server Error", "model": ResponseMessage},
+}
+@router.get(
+    "/v1/service/status",
+    response_model=ResponseMessage,
+    responses={**DEFAULT_RESPONSES},
+    description="Description: The endpoint is used to check the service status.",
+    tags=["Service Status"]
+)
+async def status() -> ResponseMessage:
+    """Health endpoint."""
+    return ResponseMessage(message="Success.")
+@router.get(
+    "/v1/service/get_guess_word",
+    response_model=ResponseGuessWord,
+    responses={**DEFAULT_RESPONSES},
+    description="Description: The endpoint is used to get a random word from the list of available words.",
+    tags=["Get Word"]
+)
+async def get_guess_word() -> ResponseGuessWord:
+    try:
+        guess_word = random.choices(AVAILABLE_WORDS, k=1)[0]
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"message": str(e)})
+    return ResponseGuessWord(word=guess_word)
+@router.get(
+    "/v1/service/semantic_calculation",
+    response_model=ResponseSemanticCalculation,
+    responses={**DEFAULT_RESPONSES},
+    description="Description: The endpoint is used to calculate the semantic similarity between the guessed word \
+    and the supposed word.",
+    tags=["Semantic Analysis"]
+)
+async def semantic_calculation(
+    request: RequestSemanticCalculation = Depends(RequestSemanticCalculation)
+) -> ResponseGuessWord:
+    supposed_word = request.supposed_word
+    guessed_word = request.guessed_word
+    if supposed_word not in AVAILABLE_WORDS:
+        return ResponseSemanticCalculation(
+            word_exist=False,
+            metadata=None
+        )
+    vector_db = VectorDatabaseHandler(
+        db_path=CFG.db.folder_path,
+        table_name=CFG.db.table_name,
+        metrics_cfg=CFG.db.metrics
+    )
+    try:
+        result = vector_db(guessed_word, supposed_word)
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"message": str(e)})
+    return ResponseSemanticCalculation(
+        word_exist=True,
+        metadata=SemanticCalculation(**result)
+    )

src/setting.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from types import SimpleNamespace
+metrics_cfg = SimpleNamespace(
+    metric="cosine",
+    threshold=0.5,
+)
+db_cfg = SimpleNamespace(
+    db_name="lancedb",
+    table_name="MiniLM-L12-v",
+    folder_path="database",
+    metrics=metrics_cfg
+)
+model_cfg = SimpleNamespace(
+    language="ukr",
+    name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    task="sentence-transformers",
+)
+CFG = SimpleNamespace(
+    vocab_path="data/ukrainian_nouns.txt",
+    model=model_cfg,
+    db=db_cfg,
+)
+with open(CFG.vocab_path, "r") as file:
+    AVAILABLE_WORDS = [line.strip() for line in file.readlines()]

src/vector_db.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from types import SimpleNamespace
+import duckdb
+import lancedb
+from src.setting import AVAILABLE_WORDS
+class VectorDatabaseHandler:
+    QUERY_TEMPLATE = "SELECT word, vector FROM {table_name} WHERE word = '{user_word}'"
+    def __init__(self, db_path: str, table_name: str, metrics_cfg: dict):
+        db = lancedb.connect(db_path)
+        self.metrics_cfg = metrics_cfg
+        self.embeddings_tbl = db.open_table(table_name)
+    def __call__(self, guessed_word: str, supposed_word: str) -> dict:
+        arrow_table = self.embeddings_tbl.to_arrow()
+        word_embedding = self.get_word_vector(guessed_word, "arrow_table")
+        df_emb = self.embeddings_tbl.search(word_embedding) \
+            .metric(self.metrics_cfg.metric) \
+            .limit(len(AVAILABLE_WORDS)) \
+            .to_df()
+        supposed_word_row = df_emb[df_emb['word'] == supposed_word].iloc[0]
+        cosine_distance = supposed_word_row['_distance']
+        words_between_count = len(df_emb[df_emb['_distance'] < cosine_distance])
+        closest_word = df_emb[df_emb['word'] != guessed_word].iloc[0]['word'] if words_between_count else supposed_word
+        return {
+            "score": cosine_distance,
+            "rating": words_between_count,
+            "percentage": 100 - words_between_count / len(df_emb) * 100,
+            "closest_word": closest_word
+        }
+    def get_word_vector(self, word: str, table_name: str):
+        vector = duckdb.query(
+            self.QUERY_TEMPLATE.format(table_name=table_name, user_word=word)
+        ).to_df()["vector"].values[0]
+        return vector