Upload 3 files
Upload app.py, Dockerfile, requirements.txt
- Dockerfile +28 -0
- app (1).py +151 -0
- requirements.txt +7 -0
Dockerfile
ADDED
@@ -0,0 +1,28 @@
# Use an official Python runtime as a parent image
FROM python:3.10-slim

# Set the working directory in the container
WORKDIR /code

# Set the Hugging Face cache directory to a writable location
ENV HF_HOME=/tmp/.cache/huggingface

# Copy the requirements file into the container at /code
COPY ./requirements.txt /code/requirements.txt

# Install any needed packages specified in requirements.txt
# --no-cache-dir: Disables the cache to keep the image size smaller
# --upgrade: Upgrades the listed packages to the newest allowed versions
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the rest of the application's code into the container at /code
COPY . /code

# Expose the port the app will run on. Hugging Face Spaces uses 7860.
EXPOSE 7860

# Command to run the app when the container launches.
# Uvicorn is the server that runs our FastAPI app.
# --host 0.0.0.0 makes the app accessible from outside the container.
# --port 7860 matches the exposed port.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
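For local testing outside of Spaces, the image can be built and run with docker build -t vectorizer . followed by docker run -p 7860:7860 vectorizer (the vectorizer tag is only a placeholder). A minimal client sketch against the locally mapped port could then look like the following; the requests dependency, the localhost URL, and the example query are assumptions for illustration, not part of the Space itself:

import requests

# Hypothetical local smoke test against the container started with
# `docker run -p 7860:7860 ...`. URL and query text are placeholders.
resp = requests.post(
    "http://localhost:7860/vectorize",
    json={"query_text": "how do I configure hybrid search?"},
    timeout=60,
)
resp.raise_for_status()
payload = resp.json()

print(len(payload["dense_vector"]))             # 384 dimensions for all-MiniLM-L6-v2
print(payload["sparse_vector"]["indices"][:5])  # SPLADE token ids with non-zero weight
print(payload["sparse_vector"]["values"][:5])   # their corresponding weights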
app (1).py
ADDED
@@ -0,0 +1,151 @@
# Author : Justin
# Program : Vectorizer for Hybrid Search
# Instructions : Check README.md
import torch
from fastapi import FastAPI
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForMaskedLM
from qdrant_client import models
import logging
import json

# --- Setup Logging ---
# Configure logging to be more descriptive
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# --- Configuration ---
# Local models for vector generation
DENSE_MODEL_ID = 'sentence-transformers/all-MiniLM-L6-v2'
# Use the corresponding QUERY encoder for SPLADE, which is optimized for search queries
SPLADE_QUERY_MODEL_ID = 'naver/efficient-splade-VI-BT-large-query'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Global Variables for Models ---
# These will be loaded once when the application starts
dense_model = None
splade_tokenizer = None
splade_model = None

# --- FastAPI Application ---
app = FastAPI(
    title="Hybrid Vector Generation API",
    description="An API to generate dense and sparse vectors for a given text query.",
    version="1.2.0"
)

# --- Pydantic Models for API ---
class QueryRequest(BaseModel):
    """Request model for the API, expecting a single text query."""
    query_text: str

class SparseVectorResponse(BaseModel):
    """Response model for the sparse vector."""
    indices: list[int]
    values: list[float]

class VectorResponse(BaseModel):
    """Final JSON response model containing both vectors."""
    dense_vector: list[float]
    sparse_vector: SparseVectorResponse


@app.on_event("startup")
async def load_models():
    """
    Asynchronous event to load ML models on application startup.
    This ensures models are loaded only once.
    """
    global dense_model, splade_tokenizer, splade_model
    logger.info("Server is starting up... Time to load the ML models.")
    logger.info(f"I'll be using the '{DEVICE}' for processing.")
    try:
        dense_model = SentenceTransformer(DENSE_MODEL_ID, device=DEVICE)
        splade_tokenizer = AutoTokenizer.from_pretrained(SPLADE_QUERY_MODEL_ID)
        splade_model = AutoModelForMaskedLM.from_pretrained(SPLADE_QUERY_MODEL_ID).to(DEVICE)
        logger.info("YAaay! All models have been loaded successfully.")
    except Exception as e:
        logger.critical(f"Oh no, a critical error occurred while loading models: {e}", exc_info=True)
        # In a real-world scenario, you might want the app to fail startup if models don't load
        raise e

def compute_splade_vector(text: str) -> models.SparseVector:
    """
    Computes a SPLADE sparse vector from a given text query.

    Args:
        text: The input text string.
    Returns:
        A Qdrant SparseVector object.
    """
    tokens = splade_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    tokens = {key: val.to(DEVICE) for key, val in tokens.items()}  # Move tensors to the correct device

    with torch.no_grad():
        output = splade_model(**tokens)

    logits, attention_mask = output.logits, tokens['attention_mask']
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    max_val, _ = torch.max(weighted_log, dim=1)
    vec = max_val.squeeze()

    indices = vec.nonzero().squeeze().cpu().tolist()
    values = vec[indices].cpu().tolist()

    # Ensure indices and values are always lists, even for a single-element tensor
    if not isinstance(indices, list):
        indices = [indices]
        values = [values]

    return models.SparseVector(indices=indices, values=values)


@app.post("/vectorize", response_model=VectorResponse)
async def vectorize_query(request: QueryRequest):
    """
    API endpoint to generate and return dense and sparse vectors for a text query.

    Args:
        request: A QueryRequest object containing the 'query_text'.

    Returns:
        A JSON response containing the dense and sparse vectors.
    """
    # --- n8n Logging ---
    logger.info("=========================================================")
    logger.info("A new request just arrived! Let's see what we've got.")
    logger.info(f"The incoming search query from n8n is: '{request.query_text}'")

    # 1. Generate Dense Vector
    logger.info("First, generating the dense vector for semantic meaning...")
    dense_query_vector = dense_model.encode(request.query_text).tolist()
    logger.info("Done with the dense vector. It has %d dimensions.", len(dense_query_vector))
    logger.info("Here's a small sample of the dense vector: %s...", str(dense_query_vector[:4]))

    # 2. Generate Sparse Vector
    logger.info("Next up, creating the sparse vector for keyword matching...")
    sparse_query_vector = compute_splade_vector(request.query_text)
    logger.info("Sparse vector is ready. It contains %d important terms.", len(sparse_query_vector.indices))
    logger.info("Here's a sample of the sparse vector indices: %s...", str(sparse_query_vector.indices[:4]))

    # 3. Construct and return the response
    logger.info("Everything looks good. Packaging up the vectors to send back.")
    logger.info("-----------------------------------------------------------------")

    final_response = VectorResponse(
        dense_vector=dense_query_vector,
        sparse_vector=SparseVectorResponse(
            indices=sparse_query_vector.indices,
            values=sparse_query_vector.values
        )
    )
    return final_response

@app.get("/", include_in_schema=False)
async def root():
    return {"message": "Vector Generation API is running. -- VERSION 2 --"}
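The dense and sparse vectors returned by /vectorize are meant to feed a downstream hybrid search step (here, an n8n workflow querying Qdrant). As a rough sketch of that step, assuming a Qdrant collection with named vectors "dense" and "sparse" already exists, the response could be plugged into Qdrant's Query API along these lines; the collection name, vector names, Qdrant URL, and hard-coded stand-in payload are assumptions for illustration, not part of this Space:

from qdrant_client import QdrantClient, models

# `payload` stands in for the JSON returned by /vectorize (see the client sketch above);
# a tiny hard-coded example is used so this snippet runs on its own.
payload = {
    "dense_vector": [0.0] * 384,
    "sparse_vector": {"indices": [101, 2129], "values": [0.8, 0.3]},
}

client = QdrantClient(url="http://localhost:6333")  # assumed local Qdrant instance

results = client.query_points(
    collection_name="documents",  # assumed collection name
    prefetch=[
        # Keyword-style candidates from the SPLADE sparse vector
        models.Prefetch(
            query=models.SparseVector(
                indices=payload["sparse_vector"]["indices"],
                values=payload["sparse_vector"]["values"],
            ),
            using="sparse",
            limit=20,
        ),
        # Semantic candidates from the dense vector
        models.Prefetch(query=payload["dense_vector"], using="dense", limit=20),
    ],
    # Fuse both candidate lists with Reciprocal Rank Fusion
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    limit=10,
)

print(results.points)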
requirements.txt
ADDED
@@ -0,0 +1,7 @@
fastapi
uvicorn
sentence-transformers==4.1.0
transformers==4.52.4
torch==2.7.1
qdrant-client==1.14.2
accelerate