Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -74,28 +74,7 @@ async def index(files: List[UploadFile] = File(...)):
|
|
74 |
|
75 |
return {"message": f"Uploaded and converted {len(images)} pages"}
|
76 |
|
77 |
-
|
78 |
-
async def search(query: str, k: int):
|
79 |
-
qs = []
|
80 |
-
with torch.no_grad():
|
81 |
-
batch_query = process_queries(processor, [query], mock_image)
|
82 |
-
batch_query = {k: v.to(device) for k, v in batch_query.items()}
|
83 |
-
embeddings_query = model(**batch_query)
|
84 |
-
qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
|
85 |
-
|
86 |
-
retriever_evaluator = CustomEvaluator(is_multi_vector=True)
|
87 |
-
scores = retriever_evaluator.evaluate(qs, ds)
|
88 |
-
|
89 |
-
top_k_indices = scores.argsort(axis=1)[0][-k:][::-1]
|
90 |
-
|
91 |
-
results = []
|
92 |
-
for idx in top_k_indices:
|
93 |
-
img_byte_arr = BytesIO()
|
94 |
-
images[idx].save(img_byte_arr, format='PNG')
|
95 |
-
img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
|
96 |
-
results.append({"image": img_base64, "page": f"Page {idx}"})
|
97 |
-
|
98 |
-
# Generate PDF
|
99 |
pdf_buffer = BytesIO()
|
100 |
c = canvas.Canvas(pdf_buffer, pagesize=letter)
|
101 |
width, height = letter
|
@@ -118,10 +97,78 @@ async def search(query: str, k: int):
|
|
118 |
|
119 |
c.save()
|
120 |
pdf_buffer.seek(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
|
122 |
# Use StreamingResponse to handle in-memory file
|
123 |
response = StreamingResponse(pdf_buffer, media_type='application/pdf')
|
124 |
-
response.headers['Content-Disposition'] = 'attachment; filename="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
return response
|
127 |
|
|
|
74 |
|
75 |
return {"message": f"Uploaded and converted {len(images)} pages"}
|
76 |
|
77 |
+
def generate_pdf(results):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
pdf_buffer = BytesIO()
|
79 |
c = canvas.Canvas(pdf_buffer, pagesize=letter)
|
80 |
width, height = letter
|
|
|
97 |
|
98 |
c.save()
|
99 |
pdf_buffer.seek(0)
|
100 |
+
return pdf_buffer
|
101 |
+
|
102 |
+
@app.get("/search")
async def search(query: str, k: int = 1):
    """Return a PDF containing the ``k`` indexed pages that best match ``query``.

    The text query is embedded with the module-level ``model``/``processor``,
    scored against the module-level document embeddings ``ds``, and the
    best-scoring entries of the module-level ``images`` list are rendered
    into a PDF by ``generate_pdf``.

    Args:
        query: Free-text search query.
        k: Number of pages to return, best match first. Must be >= 1. If it
            exceeds the number of indexed pages, all pages are returned.

    Returns:
        A ``StreamingResponse`` carrying the PDF as an attachment named
        ``results.pdf``.

    Raises:
        HTTPException: 400 if ``k`` is not positive; 404 if nothing has been
            indexed yet.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list, which is not visible from here.
    from fastapi import HTTPException

    # Without this check, k == 0 makes the ``[-k:]`` slice below return
    # *every* page, and a negative k drops the best matches instead.
    if k < 1:
        raise HTTPException(status_code=400, detail="k must be a positive integer")
    if not images:
        raise HTTPException(status_code=404, detail="No pages have been indexed yet")

    with torch.no_grad():
        batch_query = process_queries(processor, [query], mock_image)
        # Distinct names here: the original reused ``k`` as the comprehension
        # variable, which visually shadowed the ``k`` parameter.
        batch_query = {name: tensor.to(device) for name, tensor in batch_query.items()}
        embeddings_query = model(**batch_query)
        qs = list(torch.unbind(embeddings_query.to("cpu")))

    retriever_evaluator = CustomEvaluator(is_multi_vector=True)
    scores = retriever_evaluator.evaluate(qs, ds)

    # Row 0 holds the scores of the single query against every indexed page;
    # argsort is ascending, so take the last k entries and reverse them.
    top_k_indices = scores.argsort(axis=1)[0][-k:][::-1]

    results = []
    for idx in top_k_indices:
        img_byte_arr = BytesIO()
        images[idx].save(img_byte_arr, format='PNG')
        img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
        results.append({"image": img_base64, "page": f"Page {idx}"})

    pdf_buffer = generate_pdf(results)

    # Use StreamingResponse to serve the in-memory PDF without a temp file.
    response = StreamingResponse(pdf_buffer, media_type='application/pdf')
    response.headers['Content-Disposition'] = 'attachment; filename="results.pdf"'

    return response
|
130 |
+
|
131 |
+
# A file upload needs a request body, which GET clients generally do not
# send, so the endpoint is also exposed via POST. The GET registration is
# kept so any existing caller continues to work.
@app.post("/search_by_cv")
@app.get("/search_by_cv")
async def search_by_cv(file: UploadFile = File(...), k: int = 10):
    """Return a PDF of the ``k`` indexed pages most similar to an uploaded PDF.

    Every page of the uploaded PDF is converted to an image and embedded with
    the module-level ``model``/``processor``. The page embeddings are scored
    against the module-level document embeddings ``ds``; the best-scoring
    entries of the module-level ``images`` list are rendered into a PDF by
    ``generate_pdf``.

    Args:
        file: Uploaded PDF (e.g. a CV) used as the query.
        k: Number of pages to return, best match first. Must be >= 1. If it
            exceeds the number of indexed pages, all pages are returned.

    Returns:
        A ``StreamingResponse`` carrying the PDF as an attachment named
        ``results.pdf``.

    Raises:
        HTTPException: 400 if ``k`` is not positive or the upload yields no
            pages; 404 if nothing has been indexed yet.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list, which is not visible from here.
    from fastapi import HTTPException

    # Without this check, k == 0 makes the ``[-k:]`` slice below return
    # *every* page, and a negative k drops the best matches instead.
    if k < 1:
        raise HTTPException(status_code=400, detail="k must be a positive integer")
    if not images:
        raise HTTPException(status_code=404, detail="No pages have been indexed yet")

    # Read the uploaded PDF and rasterize each page to an image.
    content = await file.read()
    pdf_image_list = convert_from_bytes(content)
    if not pdf_image_list:
        raise HTTPException(status_code=400, detail="Uploaded PDF contains no pages")

    # Embed the uploaded pages in small batches.
    dataloader = DataLoader(
        pdf_image_list,
        batch_size=4,
        shuffle=False,
        collate_fn=lambda x: process_images(processor, x),
    )
    qs = []
    with torch.no_grad():
        for batch_query in dataloader:
            # Distinct names here: the original reused ``k`` as the
            # comprehension variable, visually shadowing the ``k`` parameter.
            batch_query = {name: tensor.to(device) for name, tensor in batch_query.items()}
            embeddings_query = model(**batch_query)
            qs.extend(torch.unbind(embeddings_query.to("cpu")))

    # Compare the uploaded pages' embeddings with those already indexed.
    retriever_evaluator = CustomEvaluator(is_multi_vector=True)
    scores = retriever_evaluator.evaluate(qs, ds)

    # ``scores`` is indexed as (uploaded page, indexed page). The original
    # ranked on row 0 only, so pages 2+ of the upload were embedded but then
    # ignored. Summing over the uploaded pages lets the whole document
    # contribute; for a single-page upload the ranking is unchanged.
    # argsort is ascending, so take the last k entries and reverse them.
    combined_scores = scores.sum(axis=0)
    top_k_indices = combined_scores.argsort()[-k:][::-1]

    # Encode the matching pages as base64 PNGs for the PDF generator.
    results = []
    for idx in top_k_indices:
        img_byte_arr = BytesIO()
        images[idx].save(img_byte_arr, format='PNG')
        img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
        results.append({"image": img_base64, "page": f"Page {idx}"})

    # Build the results PDF.
    pdf_buffer = generate_pdf(results)

    # Use StreamingResponse to return the generated in-memory PDF.
    response = StreamingResponse(pdf_buffer, media_type='application/pdf')
    response.headers['Content-Disposition'] = 'attachment; filename="results.pdf"'

    return response
|
174 |
|