feat(API): Implemented basic functionality.
Browse files- Dockerfile +15 -0
- data/ukrainian_nouns.txt +19 -0
- database/MiniLM-L12-v.lance/_latest.manifest +0 -0
- database/MiniLM-L12-v.lance/_transactions/0-5e898e5f-189c-473e-892d-8b5947b6a369.txn +1 -0
- database/MiniLM-L12-v.lance/_transactions/1-9077628d-c42c-4413-8bd1-cc31ea726bce.txn +0 -0
- database/MiniLM-L12-v.lance/_versions/1.manifest +0 -0
- database/MiniLM-L12-v.lance/_versions/2.manifest +0 -0
- database/MiniLM-L12-v.lance/data/a1b860b1-c7a3-4314-9c51-38ad78b5de8b.lance +0 -0
- requirements.txt +6 -0
- src/.DS_Store +0 -0
- src/__pycache__/api_models.cpython-311.pyc +0 -0
- src/__pycache__/app.cpython-311.pyc +0 -0
- src/__pycache__/handlers.cpython-311.pyc +0 -0
- src/__pycache__/setting.cpython-311.pyc +0 -0
- src/__pycache__/vector_db.cpython-311.pyc +0 -0
- src/api_models.py +38 -0
- src/app.py +13 -0
- src/create-embedding.py +25 -0
- src/handlers.py +81 -0
- src/setting.py +28 -0
- src/vector_db.py +42 -0
Dockerfile
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Container image for the word-guessing API, built on the Python 3.11 base image.
FROM python:3.11

# The dependency manifest is copied and installed before any application files,
# so changes under data/, database/ or src/ do not invalidate this layer.
COPY requirements.txt ./requirements.txt

RUN python -m pip install -U pip \
    && python -m pip install -r requirements.txt \
    && python -m pip cache purge

# From here on, relative destinations resolve under /app.
WORKDIR /app

COPY ./data ./data
COPY ./database ./database
COPY ./src ./src

# "src.app:app" is the FastAPI instance created at the bottom of src/app.py.
CMD ["uvicorn", "src.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
data/ukrainian_nouns.txt
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
лікар
|
2 |
+
програміст
|
3 |
+
пілот
|
4 |
+
літак
|
5 |
+
висота
|
6 |
+
тиск
|
7 |
+
барометр
|
8 |
+
вітер
|
9 |
+
медицина
|
10 |
+
комп'ютер
|
11 |
+
рука
|
12 |
+
око
|
13 |
+
ніс
|
14 |
+
книга
|
15 |
+
папір
|
16 |
+
олівець
|
17 |
+
Франція
|
18 |
+
Париж
|
19 |
+
Германія
|
database/MiniLM-L12-v.lance/_latest.manifest
ADDED
Binary file (497 Bytes). View file
|
|
database/MiniLM-L12-v.lance/_transactions/0-5e898e5f-189c-473e-892d-8b5947b6a369.txn
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
$5e898e5f-189c-473e-892d-8b5947b6a369�Uword ���������*string084vector ���������*fixed_size_list:float:38408
|
database/MiniLM-L12-v.lance/_transactions/1-9077628d-c42c-4413-8bd1-cc31ea726bce.txn
ADDED
Binary file (97 Bytes). View file
|
|
database/MiniLM-L12-v.lance/_versions/1.manifest
ADDED
Binary file (443 Bytes). View file
|
|
database/MiniLM-L12-v.lance/_versions/2.manifest
ADDED
Binary file (497 Bytes). View file
|
|
database/MiniLM-L12-v.lance/data/a1b860b1-c7a3-4314-9c51-38ad78b5de8b.lance
ADDED
Binary file (30.2 kB). View file
|
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
duckdb==0.10.1
|
2 |
+
fastapi==0.110.0
|
3 |
+
pandas==2.2.1
|
4 |
+
lancedb==0.6.4
|
5 |
+
sentence-transformers==2.5.1
|
6 |
+
uvicorn==0.29.0
|
src/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
src/__pycache__/api_models.cpython-311.pyc
ADDED
Binary file (2.32 kB). View file
|
|
src/__pycache__/app.cpython-311.pyc
ADDED
Binary file (600 Bytes). View file
|
|
src/__pycache__/handlers.cpython-311.pyc
ADDED
Binary file (3.7 kB). View file
|
|
src/__pycache__/setting.cpython-311.pyc
ADDED
Binary file (1.33 kB). View file
|
|
src/__pycache__/vector_db.cpython-311.pyc
ADDED
Binary file (2.91 kB). View file
|
|
src/api_models.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel, Field
|
2 |
+
|
3 |
+
|
4 |
+
class ResponseModel(BaseModel):
    # Generic response envelope: a text message, a payload dict and an integer code.
    # Documented with comments rather than a docstring so the generated schema
    # description stays exactly as it was.
    message: str
    data: dict
    code: int  # NOTE(review): presumably an HTTP-like status code - confirm with callers
|
8 |
+
|
9 |
+
|
10 |
+
class ResponseGuessWord(BaseModel):
    # Body returned by the "get_guess_word" endpoint: the single word that was picked.
    word: str
|
12 |
+
|
13 |
+
|
14 |
+
class RequestSemanticCalculation(BaseModel):
    # Input of the semantic-calculation endpoint: the target word and the user's guess.
    #
    # `examples=[...]` replaces the former `example=...` keyword: arbitrary extra
    # keyword arguments to `Field` are deprecated in pydantic v2 and emit a
    # deprecation warning, whereas `examples` is a supported parameter.
    supposed_word: str = Field(
        description="The word that the user is trying to guess",
        examples=["ніс"],
    )
    guessed_word: str = Field(
        description="The word that the user guessed",
        examples=["око"],
    )
|
23 |
+
|
24 |
+
|
25 |
+
class SemanticCalculation(BaseModel):
    # Result of comparing a guess with the target word; the handler builds it from
    # the dict returned by VectorDatabaseHandler.__call__.
    score: float  # distance between the guess and the target word
    rating: int  # number of vocabulary entries closer to the guess than the target
    percentage: float  # 100 minus the share of entries counted in `rating`
    closest_word: str  # nearest vocabulary word other than the guess itself
|
30 |
+
|
31 |
+
|
32 |
+
class ResponseSemanticCalculation(BaseModel):
    # Body returned by the semantic-calculation endpoint.
    word_exist: bool  # False when the vocabulary check in the handler fails
    metadata: SemanticCalculation | None  # None whenever word_exist is False
|
35 |
+
|
36 |
+
|
37 |
+
class ResponseMessage(BaseModel):
    # Minimal body with one text field; used by the status endpoint and as the
    # documented shape of 500 responses.
    message: str
|
src/app.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI
|
2 |
+
|
3 |
+
from src.handlers import router
|
4 |
+
|
5 |
+
|
6 |
+
def get_application() -> FastAPI:
    """Build the FastAPI instance and attach the service router to it."""
    api = FastAPI()
    api.include_router(router)
    return api


# Module-level ASGI application; the Dockerfile starts uvicorn with "src.app:app".
app = get_application()
|
src/create-embedding.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
import lancedb
|
5 |
+
from lancedb.embeddings import with_embeddings
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
+
|
8 |
+
from setting import CFG, AVAILABLE_WORDS
|
9 |
+
|
10 |
+
|
11 |
+
# One-off script: embed every vocabulary word with the configured
# sentence-transformers model and store the result as a LanceDB table.

# One row per vocabulary word; the "word" column is the text that gets embedded.
df = pd.DataFrame(AVAILABLE_WORDS, columns=['word'])

model = SentenceTransformer(CFG.model.name)

# Adds the embedding produced by `model.encode` for each value of the "word" column.
data = with_embeddings(
    func=model.encode,
    data=df, column="word", show_progress=True
)

# The settings module defines the database location as `CFG.db.folder_path`;
# the previously used `CFG.db.lance_db_folder_path` does not exist there and
# raised AttributeError. `exist_ok=True` replaces the exists()/makedirs() pair.
os.makedirs(CFG.db.folder_path, exist_ok=True)

db = lancedb.connect(CFG.db.folder_path)
table = db.create_table(CFG.db.table_name, data)
print("Table created")
|
src/handlers.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
|
3 |
+
from fastapi import APIRouter, Depends
|
4 |
+
from fastapi.responses import JSONResponse
|
5 |
+
|
6 |
+
from src.api_models import (
|
7 |
+
ResponseGuessWord, ResponseSemanticCalculation,
|
8 |
+
RequestSemanticCalculation, ResponseMessage,
|
9 |
+
SemanticCalculation
|
10 |
+
)
|
11 |
+
from src.setting import AVAILABLE_WORDS, CFG
|
12 |
+
from src.vector_db import VectorDatabaseHandler
|
13 |
+
|
14 |
+
# Router that collects every endpoint of this module; src/app.py includes it
# in the application.
router = APIRouter()

# Extra response documentation shared by all endpoints: a 500 answer carries a
# ResponseMessage-shaped body.
DEFAULT_RESPONSES = {
    500: dict(description="Internal Server Error", model=ResponseMessage),
}
|
19 |
+
|
20 |
+
|
21 |
+
@router.get(
    "/v1/service/status",
    response_model=ResponseMessage,
    responses=dict(DEFAULT_RESPONSES),
    description="Description: The endpoint is used to check the service status.",
    tags=["Service Status"],
)
async def status() -> ResponseMessage:
    """Answer with a fixed success message; performs no checks of its own."""
    reply = ResponseMessage(message="Success.")
    return reply
|
31 |
+
|
32 |
+
|
33 |
+
@router.get(
    "/v1/service/get_guess_word",
    response_model=ResponseGuessWord,
    responses={**DEFAULT_RESPONSES},
    description="Description: The endpoint is used to get a random word from the list of available words.",
    tags=["Get Word"]
)
async def get_guess_word() -> ResponseGuessWord | JSONResponse:
    """Pick one word uniformly at random from the loaded vocabulary.

    Returns:
        ResponseGuessWord with the chosen word, or a 500 JSONResponse with a
        "message" field when the vocabulary is empty.
    """
    # An empty vocabulary is the only way the random pick can fail, so it is
    # handled explicitly instead of catching every exception and echoing its
    # text back to the client.
    if not AVAILABLE_WORDS:
        return JSONResponse(
            status_code=500,
            content={"message": "No words are available to choose from."},
        )
    return ResponseGuessWord(word=random.choice(AVAILABLE_WORDS))
|
46 |
+
|
47 |
+
|
48 |
+
@router.get(
    "/v1/service/semantic_calculation",
    response_model=ResponseSemanticCalculation,
    responses={**DEFAULT_RESPONSES},
    # Implicit string concatenation instead of a backslash continuation inside
    # the literal, which embedded the source indentation into the description.
    description=(
        "Description: The endpoint is used to calculate the semantic similarity between the guessed word "
        "and the supposed word."
    ),
    tags=["Semantic Analysis"]
)
async def semantic_calculation(
    request: RequestSemanticCalculation = Depends(RequestSemanticCalculation)
) -> ResponseSemanticCalculation | JSONResponse:
    """Compare the user's guess with the target word using the embedding table.

    Args:
        request: Carries `supposed_word` (the target) and `guessed_word`.

    Returns:
        ResponseSemanticCalculation with `word_exist=False` and no metadata when
        either word is missing from the vocabulary; otherwise `word_exist=True`
        with the computed metadata. A 500 JSONResponse with a "message" field is
        returned when the vector database cannot be opened or queried.
    """
    supposed_word = request.supposed_word
    guessed_word = request.guessed_word

    # Both words are looked up in the embedding table further down, so both must
    # be known. Previously only the target was checked and an unknown guess
    # surfaced as a 500 with an index-error message.
    # NOTE(review): assumes the table holds exactly the words of the vocabulary file.
    if supposed_word not in AVAILABLE_WORDS or guessed_word not in AVAILABLE_WORDS:
        return ResponseSemanticCalculation(
            word_exist=False,
            metadata=None
        )

    try:
        # Opening the database sits inside the try block so that a missing or
        # unreadable table is reported in the documented 500 body as well.
        vector_db = VectorDatabaseHandler(
            db_path=CFG.db.folder_path,
            table_name=CFG.db.table_name,
            metrics_cfg=CFG.db.metrics
        )
        result = vector_db(guessed_word, supposed_word)
    except Exception as e:
        return JSONResponse(status_code=500, content={"message": str(e)})

    return ResponseSemanticCalculation(
        word_exist=True,
        metadata=SemanticCalculation(**result)
    )
|
src/setting.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from types import SimpleNamespace
|
2 |
+
|
3 |
+
# Similarity-search settings; `metric` is what the vector search is configured with.
# NOTE(review): `threshold` is not read by any code visible alongside this module.
metrics_cfg = SimpleNamespace(metric="cosine", threshold=0.5)

# Location and name of the LanceDB table, plus the search settings above.
db_cfg = SimpleNamespace(
    db_name="lancedb", table_name="MiniLM-L12-v", folder_path="database", metrics=metrics_cfg,
)

# Embedding model description; `name` is the identifier handed to SentenceTransformer.
model_cfg = SimpleNamespace(
    language="ukr",
    name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    task="sentence-transformers",
)

# Root configuration object imported by the rest of the application.
CFG = SimpleNamespace(vocab_path="data/ukrainian_nouns.txt", model=model_cfg, db=db_cfg)
|
26 |
+
|
27 |
+
# Load the vocabulary once at import time, one word per line.
# The encoding is given explicitly because the file holds Cyrillic text and the
# platform default is not UTF-8 everywhere. Blank lines are dropped so that an
# empty string can never be offered or accepted as a word.
with open(CFG.vocab_path, "r", encoding="utf-8") as file:
    AVAILABLE_WORDS = [word for line in file if (word := line.strip())]
|
src/vector_db.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import duckdb
|
2 |
+
import lancedb
|
3 |
+
|
4 |
+
from src.setting import AVAILABLE_WORDS
|
5 |
+
|
6 |
+
|
7 |
+
class VectorDatabaseHandler:
    """Rank a target word among the neighbours of a guessed word.

    Opens one LanceDB table of word embeddings and, when called, searches it
    with the stored vector of the guessed word.
    """

    # The word is bound through the `?` placeholder at execution time and is
    # never spliced into the SQL text. Vocabulary entries can contain an
    # apostrophe, which previously broke the interpolated string literal and
    # left the query open to injection.
    QUERY_TEMPLATE = "SELECT word, vector FROM {table_name} WHERE word = ?"

    def __init__(self, db_path: str, table_name: str, metrics_cfg):
        """Connect to the database folder and open the embeddings table.

        Args:
            db_path: Folder passed to `lancedb.connect`.
            table_name: Name of the table to open.
            metrics_cfg: Object exposing a `metric` attribute (it is read with
                attribute access, so a plain dict does not work).
        """
        db = lancedb.connect(db_path)

        self.metrics_cfg = metrics_cfg
        self.embeddings_tbl = db.open_table(table_name)

    def __call__(self, guessed_word: str, supposed_word: str) -> dict:
        """Search the table with the guess's vector and summarise the ranking.

        Args:
            guessed_word: Word whose stored vector is used as the search query.
            supposed_word: Word whose position among the results is reported.

        Returns:
            Dict with the keys "score", "rating", "percentage" and "closest_word".

        Raises:
            IndexError: If either word is absent from the table.
        """
        # Deliberately kept as a local even though it is not referenced by name
        # below: the SQL refers to the table as "arrow_table", which DuckDB
        # resolves through its replacement scan of Python variables.
        arrow_table = self.embeddings_tbl.to_arrow()  # noqa: F841
        word_embedding = self.get_word_vector(guessed_word, "arrow_table")

        # The limit is the vocabulary size so that every stored word is ranked.
        df_emb = (
            self.embeddings_tbl.search(word_embedding)
            .metric(self.metrics_cfg.metric)
            .limit(len(AVAILABLE_WORDS))
            .to_df()
        )
        return self._summarise(df_emb, guessed_word, supposed_word)

    @staticmethod
    def _summarise(df_emb, guessed_word: str, supposed_word: str) -> dict:
        """Turn search results into the score/rating/percentage/closest-word dict.

        Args:
            df_emb: pandas DataFrame with "word" and "_distance" columns, ordered
                by ascending distance.
            guessed_word: The word that was searched for.
            supposed_word: The word to locate in the results.

        Raises:
            IndexError: If `supposed_word` is not among the results.
        """
        target_rows = df_emb[df_emb["word"] == supposed_word]
        if target_rows.empty:
            raise IndexError(f"Word {supposed_word!r} is not present in the search results.")
        distance = target_rows.iloc[0]["_distance"]

        # Rows strictly closer to the guess than the target. For a guess that
        # differs from the target this normally includes the guess itself.
        words_between_count = int((df_emb["_distance"] < distance).sum())

        if words_between_count:
            # Nearest row that is not the guess itself.
            closest_word = df_emb[df_emb["word"] != guessed_word].iloc[0]["word"]
        else:
            closest_word = supposed_word

        return {
            "score": float(distance),
            "rating": words_between_count,
            "percentage": 100 - words_between_count / len(df_emb) * 100,
            "closest_word": closest_word
        }

    def get_word_vector(self, word: str, table_name: str):
        """Return the stored embedding of `word`.

        Args:
            word: Word to look up; passed to DuckDB as a bound parameter.
            table_name: Name DuckDB should resolve the table by. It is inserted
                into the SQL text, so it must be a trusted identifier.

        Raises:
            IndexError: If the table has no row for `word`.
        """
        rows = duckdb.execute(
            self.QUERY_TEMPLATE.format(table_name=table_name), [word]
        ).df()
        if rows.empty:
            raise IndexError(f"Word {word!r} is not present in table {table_name!r}.")
        return rows["vector"].values[0]
|