Spaces:

mishig
/

embeddings-similarity

Sleeping

mishig HF Staff committed on Sep 11, 2023

Commit

3d7b683

1 Parent(s): ff6851f

use correct preprocessing

Files changed (1) hide show

app.py CHANGED Viewed

@@ -62,6 +62,12 @@ def create_hnsw_index(embeddings_np, space='ip', ef_construction=100, M=16):
     index.add_items(embeddings_np, ids)
     return index
 app = FastAPI()
 class EmbeddingsSimilarityReq(BaseModel):
@@ -74,7 +80,8 @@ async def find_similar_paragraphsitem(req: EmbeddingsSimilarityReq):
     print("Len of batches", len(req.paragraphs))
     print("creating embeddings", current_timestamp())
-    embeddings_np = get_embeddings([req.query]+req.paragraphs)
     query_embedding, chunks_embeddings = embeddings_np[0], embeddings_np[1:]
     print("creating index", current_timestamp())

     index.add_items(embeddings_np, ids)
     return index
+def preprocess_texts(query, paragraphs):
+    query = f'query: {query}'
+    paragraphs = [f'passage: {p}' for p in paragraphs]
+    return [query]+paragraphs
 app = FastAPI()
 class EmbeddingsSimilarityReq(BaseModel):
     print("Len of batches", len(req.paragraphs))
     print("creating embeddings", current_timestamp())
+    inputs = preprocess_texts(req.query, req.paragraphs)
+    embeddings_np = get_embeddings(inputs)
     query_embedding, chunks_embeddings = embeddings_np[0], embeddings_np[1:]
     print("creating index", current_timestamp())