|
import asyncio
import os

# These variables must be exported BEFORE the third-party imports below.
# Libraries that read cache locations or queue limits when they are first
# imported (the Hugging Face stack resolves its cache directory at import
# time) would otherwise never see the values assigned here.
os.environ["HF_HOME"] = "/app"
os.environ["TRANSFORMERS_CACHE"] = "/app"
os.environ["INFINITY_QUEUE_SIZE"] = "512000"

import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
from infinity_emb import AsyncEmbeddingEngine, EngineArgs  # noqa: E402
from usearch.index import Index, Matches  # noqa: E402
|
|
|
# Module-wide embedding engine; it is entered and exited by the embedding
# helper each time a batch of texts is embedded.
engine = AsyncEmbeddingEngine.from_args(
    EngineArgs(
        batch_size=8,
        model_name_or_path="michaelfeil/jina-embeddings-v2-base-code",
    )
)
|
|
|
|
|
async def embed_texts(texts: list[str]) -> np.ndarray:
    """Embed ``texts`` with the module-level engine.

    The engine is entered as an async context manager for the duration of
    the call, so it is active only while this batch is being embedded.

    Args:
        texts: Strings to embed.

    Returns:
        A NumPy array built from the first element of the engine's
        ``embed`` result.
    """
    async with engine:
        result = await engine.embed(texts)
        # Only the first element of the result is used here; the remaining
        # elements are discarded.
        vectors = result[0]
    return np.array(vectors)
|
|
|
def embed_texts_sync(texts: list[str]) -> np.ndarray:
    """Blocking wrapper around :func:`embed_texts`.

    A fresh event loop is created for every call, used to run the coroutine
    to completion, and then closed so that repeated calls do not accumulate
    unclosed loops and their underlying resources.

    Args:
        texts: Strings to embed.

    Returns:
        The array produced by ``embed_texts(texts)``.
    """
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(embed_texts(texts))
    finally:
        # The loop is private to this call, so release it even when the
        # coroutine raises.
        loop.close()
|
|
|
# Module-level search state, populated by build_index():
#   index      - the vector index (None until built)
#   docs_index - list of document strings addressed by integer key
index = docs_index = None
|
|
|
|
|
def build_index(demo_mode=True):
    """(Re)create the global vector index, optionally seeding demo documents.

    The index dimensionality is taken from the last axis of the embedding of
    a short probe string, so it follows whatever the embedding helper
    returns.

    Args:
        demo_mode: When true, ``docs_index`` is replaced by four sample
            snippets and their embeddings are added to the index under the
            keys ``0..len(docs_index) - 1``. When false, only a new empty
            index is created and ``docs_index`` is left untouched.
    """
    global index, docs_index

    # Embed a throwaway string purely to learn the vector width.
    probe = embed_texts_sync(["Hi"])
    index = Index(
        ndim=probe.shape[-1],
        metric="cos",
        dtype="f16",
        connectivity=16,
        expansion_add=128,
        expansion_search=64,
    )

    if not demo_mode:
        return

    docs_index = [
        "torch.add(*demo)",
        "torch.mul(*demo)",
        "torch.div(*demo)",
        "torch.sub(*demo)",
    ]
    vectors = embed_texts_sync(docs_index)
    # Keys are list positions, so a match key indexes straight into docs_index.
    keys = np.arange(len(docs_index))
    index.add(keys, vectors)
|
|
|
|
|
|
|
# Build the index as an import-time side effect when none exists yet, so that
# queries can be answered as soon as the module has been loaded.
if index is None:
    build_index()
|
|
|
|
|
def answer_query(query: str, top_k: int = 10) -> list[str]:
    """Return the indexed documents that best match ``query``.

    The query is embedded with the same helper used to build the index, the
    index is searched, and each match key is used as a position in
    ``docs_index``.

    Args:
        query: Text to search for.
        top_k: Maximum number of matches requested from the index. Defaults
            to 10, the value that was previously hard-coded.

    Returns:
        The matching document strings, in the order yielded by the search.
    """
    # A single-element batch is embedded; the resulting array is handed to
    # the index unchanged.
    query_vector = embed_texts_sync([query])
    matches = index.search(query_vector, top_k)
    return [docs_index[match.key] for match in matches]
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: look up a snippet close to one of the demo documents.
    results = answer_query("torch.mul(*demo2)")
    print(results)
|
|