service-internal committed on
Commit b5b6a8e · verified · 1 parent: 4f809e4

Upload 3 files


Upload app.py, Dockerfile, requirements.txt

Files changed (3)
  1. Dockerfile +28 -0
  2. app (1).py +151 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,28 @@
# Use an official Python runtime as a parent image
FROM python:3.10-slim

# Set the working directory in the container
WORKDIR /code

# Set the Hugging Face cache directory to a writable location
ENV HF_HOME=/tmp/.cache/huggingface

# Copy the requirements file into the container at /code
COPY ./requirements.txt /code/requirements.txt

# Install the packages specified in requirements.txt
# --no-cache-dir: disables pip's cache to keep the image smaller
# --upgrade: upgrades the listed packages if they are already present
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the rest of the application's code into the container at /code
COPY . /code

# Expose the port the app will run on. Hugging Face Spaces uses 7860.
EXPOSE 7860

# Command to run the app when the container launches.
# Uvicorn is the ASGI server that runs the FastAPI app.
# --host 0.0.0.0 makes the app accessible from outside the container.
# --port 7860 matches the exposed port.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app (1).py ADDED
@@ -0,0 +1,151 @@
# Author : Justin
# Program : Vectorizer for Hybrid Search
# Instructions : Check README.md
import torch
from fastapi import FastAPI
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForMaskedLM
from qdrant_client import models
import logging
import json

# --- Setup Logging ---
# Configure logging to be more descriptive
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# --- Configuration ---
# Local models for vector generation
DENSE_MODEL_ID = 'sentence-transformers/all-MiniLM-L6-v2'
# Use the corresponding QUERY encoder for SPLADE, which is optimized for search queries
SPLADE_QUERY_MODEL_ID = 'naver/efficient-splade-VI-BT-large-query'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Global Variables for Models ---
# These are loaded once when the application starts
dense_model = None
splade_tokenizer = None
splade_model = None

# --- FastAPI Application ---
app = FastAPI(
    title="Hybrid Vector Generation API",
    description="An API to generate dense and sparse vectors for a given text query.",
    version="1.2.0"
)

# --- Pydantic Models for API ---
class QueryRequest(BaseModel):
    """Request model for the API, expecting a single text query."""
    query_text: str

class SparseVectorResponse(BaseModel):
    """Response model for the sparse vector."""
    indices: list[int]
    values: list[float]

class VectorResponse(BaseModel):
    """Final JSON response model containing both vectors."""
    dense_vector: list[float]
    sparse_vector: SparseVectorResponse


@app.on_event("startup")
async def load_models():
    """
    Asynchronous event to load ML models on application startup.
    This ensures the models are loaded only once.
    """
    global dense_model, splade_tokenizer, splade_model
    logger.info("Server is starting up; loading the ML models.")
    logger.info(f"Using the '{DEVICE}' device for processing.")
    try:
        dense_model = SentenceTransformer(DENSE_MODEL_ID, device=DEVICE)
        splade_tokenizer = AutoTokenizer.from_pretrained(SPLADE_QUERY_MODEL_ID)
        splade_model = AutoModelForMaskedLM.from_pretrained(SPLADE_QUERY_MODEL_ID).to(DEVICE)
        logger.info("All models have been loaded successfully.")
    except Exception as e:
        logger.critical(f"A critical error occurred while loading models: {e}", exc_info=True)
        # Re-raise so the app fails startup when the models cannot be loaded
        raise

def compute_splade_vector(text: str) -> models.SparseVector:
    """
    Computes a SPLADE sparse vector from a given text query.

    Args:
        text: The input text string.

    Returns:
        A Qdrant SparseVector object.
    """
    tokens = splade_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    tokens = {key: val.to(DEVICE) for key, val in tokens.items()}  # Move tensors to the model's device

    with torch.no_grad():
        output = splade_model(**tokens)

    logits, attention_mask = output.logits, tokens['attention_mask']
    # log(1 + ReLU) yields saturating, non-negative term weights
    relu_log = torch.log(1 + torch.relu(logits))
    # Zero out padded positions, then max-pool over the sequence so each
    # vocabulary term keeps its strongest activation
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    max_val, _ = torch.max(weighted_log, dim=1)
    vec = max_val.squeeze()

    indices = vec.nonzero().squeeze().cpu().tolist()
    values = vec[indices].cpu().tolist()

    # Ensure indices and values are always lists, even for a single-element tensor
    if not isinstance(indices, list):
        indices = [indices]
        values = [values]

    return models.SparseVector(indices=indices, values=values)


@app.post("/vectorize", response_model=VectorResponse)
async def vectorize_query(request: QueryRequest):
    """
    API endpoint to generate and return dense and sparse vectors for a text query.

    Args:
        request: A QueryRequest object containing the 'query_text'.

    Returns:
        A JSON response containing the dense and sparse vectors.
    """
    # --- n8n Logging ---
    logger.info("=========================================================")
    logger.info("A new request just arrived.")
    logger.info(f"The incoming search query from n8n is: '{request.query_text}'")

    # 1. Generate the dense vector (semantic meaning)
    dense_query_vector = dense_model.encode(request.query_text).tolist()
    logger.info("Dense vector ready: %d dimensions.", len(dense_query_vector))
    logger.info("Dense vector sample: %s...", str(dense_query_vector[:4]))

    # 2. Generate the sparse vector (keyword matching)
    sparse_query_vector = compute_splade_vector(request.query_text)
    logger.info("Sparse vector ready: %d non-zero terms.", len(sparse_query_vector.indices))
    logger.info("Sparse vector indices sample: %s...", str(sparse_query_vector.indices[:4]))

    # 3. Construct and return the response
    logger.info("Packaging both vectors into the response.")
    logger.info("-----------------------------------------------------------------")

    final_response = VectorResponse(
        dense_vector=dense_query_vector,
        sparse_vector=SparseVectorResponse(
            indices=sparse_query_vector.indices,
            values=sparse_query_vector.values
        )
    )
    return final_response

@app.get("/", include_in_schema=False)
async def root():
    return {"message": "Vector Generation API is running. -- VERSION 2 --"}
requirements.txt ADDED
@@ -0,0 +1,7 @@
fastapi
uvicorn
sentence-transformers==4.1.0
transformers==4.52.4
torch==2.7.1
qdrant-client==1.14.2
accelerate
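Since four of the seven dependencies are pinned, a quick import check confirms the stack resolves together inside the built image. A minimal sketch (the printed versions depend on the environment):

# Sanity-check that the pinned dependencies import together.
import torch, transformers, sentence_transformers, qdrant_client

print("torch", torch.__version__)
print("transformers", transformers.__version__)
print("sentence-transformers", sentence_transformers.__version__)
print("qdrant-client", qdrant_client.__version__)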