Spaces:

sohamnk
/

lost-and-found-ai-pipeline

Running

App Files Files Community

sohamnk committed 30 days ago

Commit

754ba00

verified ·

1 Parent(s): ee57fa4

Update pipeline/routes.py

Browse files

Files changed (1) hide show

pipeline/routes.py +90 -55

pipeline/routes.py CHANGED Viewed

@@ -47,6 +47,10 @@ def process_item():
         traceback.print_exc()
         return jsonify({"error": str(e)}), 500
 @app.route('/compare', methods=['POST'])
 def compare_items():
     print("\n" + "="*50)
@@ -61,65 +65,96 @@ def compare_items():
         if not all([query_item, search_list]):
             return jsonify({"error": "queryItem and searchList are required."}), 400
         query_text_emb = np.array(query_item['text_embedding'])
-        results = []
-        print(f"--- Comparing 1 query item against {len(search_list)} items ---")
         for item in search_list:
-            item_id = item.get('_id')
-            print(f"\n [Checking] Item ID: {item_id}")
-            try:
-                text_emb_found = np.array(item['text_embedding'])
-                text_score = logic.cosine_similarity(query_text_emb, text_emb_found)
-                print(f" - Text Score: {text_score:.4f}")
-                has_query_image = 'shape_features' in query_item and query_item['shape_features']
-                has_item_image = 'shape_features' in item and item['shape_features']
-                if has_query_image and has_item_image:
-                    print(" - Both items have images. Performing visual comparison.")
-                    from pipeline import FEATURE_WEIGHTS # Import constant
-                    query_shape = np.array(query_item['shape_features'])
-                    query_color = np.array(query_item['color_features']).astype("float32")
-                    query_texture = np.array(query_item['texture_features']).astype("float32")
-                    found_shape = np.array(item['shape_features'])
-                    found_color = np.array(item['color_features']).astype("float32")
-                    found_texture = np.array(item['texture_features']).astype("float32")
-                    shape_dist = cv2.matchShapes(query_shape, found_shape, cv2.CONTOURS_MATCH_I1, 0.0)
-                    shape_score = 1.0 / (1.0 + shape_dist)
-                    color_score = cv2.compareHist(query_color, found_color, cv2.HISTCMP_CORREL)
-                    texture_score = cv2.compareHist(query_texture, found_texture, cv2.HISTCMP_CORREL)
-                    raw_image_score = (FEATURE_WEIGHTS["shape"] * shape_score +
-                                       FEATURE_WEIGHTS["color"] * color_score +
-                                       FEATURE_WEIGHTS["texture"] * texture_score)
-                    print(f"Raw Image Score: {raw_image_score:.4f}")
-                    image_score = logic.stretch_image_score(raw_image_score)
-                    final_score = 0.4 * image_score + 0.6 * text_score
-                    print(f" - Image Score: {image_score:.4f} | Final Score: {final_score:.4f}")
-                else:
-                    print(" - One or both items missing image. Using text score only.")
-                    final_score = text_score
-                from pipeline import FINAL_SCORE_THRESHOLD # Import constant
-                if final_score >= FINAL_SCORE_THRESHOLD:
-                    print(f" - ✅ ACCEPTED (Score >= {FINAL_SCORE_THRESHOLD})")
-                    results.append({
-                        "_id": item_id,
-                        "score": round(final_score, 4),
-                        "objectName": item.get("objectName"),
-                        "objectDescription": item.get("objectDescription"),
-                        "objectImage": item.get("objectImage"),
-                    })
-                else:
-                    print(f" - ❌ REJECTED (Score < {FINAL_SCORE_THRESHOLD})")
-            except Exception as e:
-                print(f" [Skipping] Item {item_id} due to processing error: {e}")
-                continue
-        results.sort(key=lambda x: x["score"], reverse=True)
-        print(f"\n✅ Search complete. Found {len(results)} potential matches.")
         print("="*50)
-        return jsonify({"matches": results}), 200
     except Exception as e:
         print(f"❌ Error in /compare: {e}")

         traceback.print_exc()
         return jsonify({"error": str(e)}), 500
+@app.route('/compare', methods=['POST'])
+# Add a new constant at the top of the file
+TOP_N_CANDIDATES = 20 # The number of items to re-rank
 @app.route('/compare', methods=['POST'])
 def compare_items():
     print("\n" + "="*50)
         if not all([query_item, search_list]):
             return jsonify({"error": "queryItem and searchList are required."}), 400
+        # === STAGE 1: FAST RETRIEVAL (using Bi-Encoder) ===
+        print(f"--- Stage 1: Retrieving top candidates from {len(search_list)} items... ---")
+        initial_candidates = []
         query_text_emb = np.array(query_item['text_embedding'])
         for item in search_list:
+            text_emb_found = np.array(item['text_embedding'])
+            text_score = logic.cosine_similarity(query_text_emb, text_emb_found)
+            # For now, just use the text_score as the initial score
+            # We will calculate the full score later for the top candidates
+            initial_candidates.append({"item": item, "initial_score": text_score})
+        # Sort by the initial score and keep the best ones
+        initial_candidates.sort(key=lambda x: x["initial_score"], reverse=True)
+        top_candidates = initial_candidates[:TOP_N_CANDIDATES]
+        print(f"--- Found {len(top_candidates)} candidates for re-ranking. ---")
+        # === STAGE 2: ACCURATE RE-RANKING (using Cross-Encoder) ===
+        if not top_candidates:
+            print("✅ No potential matches found in Stage 1.")
+            return jsonify({"matches": []}), 200
+        print(f"\n--- Stage 2: Re-ranking top {len(top_candidates)} candidates... ---")
+        query_description = query_item['objectDescription']
+        # Create pairs of [query, candidate_description] for the cross-encoder
+        rerank_pairs = [(query_description, cand['item']['objectDescription']) for cand in top_candidates]
+        # Get new, highly accurate scores from the cross-encoder
+        cross_encoder_scores = models['cross_encoder'].predict(rerank_pairs)
+        # Now, build the final results with the new scores
+        final_results = []
+        for i, candidate_data in enumerate(top_candidates):
+            item = candidate_data['item']
+            cross_score = cross_encoder_scores[i] # Get the new text score
+            print(f"\n [Re-Ranking] Item ID: {item.get('_id')}")
+            print(f" - Cross-Encoder Score: {cross_score:.4f}")
+            # Now we calculate the final image and combined score, just like before
+            has_query_image = 'shape_features' in query_item and query_item['shape_features']
+            has_item_image = 'shape_features' in item and item['shape_features']
+            if has_query_image and has_item_image:
+                # (This image scoring logic is the same as your old code)
+                from pipeline import FEATURE_WEIGHTS
+                query_shape = np.array(query_item['shape_features'])
+                query_color = np.array(query_item['color_features']).astype("float32")
+                query_texture = np.array(query_item['texture_features']).astype("float32")
+                found_shape = np.array(item['shape_features'])
+                found_color = np.array(item['color_features']).astype("float32")
+                found_texture = np.array(item['texture_features']).astype("float32")
+                shape_dist = cv2.matchShapes(query_shape, found_shape, cv2.CONTOURS_MATCH_I1, 0.0)
+                shape_score = 1.0 / (1.0 + shape_dist)
+                color_score = cv2.compareHist(query_color, found_color, cv2.HISTCMP_CORREL)
+                texture_score = cv2.compareHist(query_texture, found_texture, cv2.HISTCMP_CORREL)
+                raw_image_score = (FEATURE_WEIGHTS["shape"] * shape_score +
+                                   FEATURE_WEIGHTS["color"] * color_score +
+                                   FEATURE_WEIGHTS["texture"] * texture_score)
+                image_score = logic.stretch_image_score(raw_image_score)
+                # Use the new cross_score for the text part
+                final_score = 0.4 * image_score + 0.6 * cross_score
+                print(f" - Image Score: {image_score:.4f} | Final Re-ranked Score: {final_score:.4f}")
+            else:
+                final_score = cross_score # If no image, the final score is the cross-encoder score
+            from pipeline import FINAL_SCORE_THRESHOLD
+            if final_score >= FINAL_SCORE_THRESHOLD:
+                print(f" - ✅ ACCEPTED (Score >= {FINAL_SCORE_THRESHOLD})")
+                final_results.append({
+                    "_id": item.get('_id'),
+                    "score": round(final_score, 4),
+                    "objectName": item.get("objectName"),
+                    "objectDescription": item.get("objectDescription"),
+                    "objectImage": item.get("objectImage"),
+                })
+            else:
+                print(f" - ❌ REJECTED (Score < {FINAL_SCORE_THRESHOLD})")
+        final_results.sort(key=lambda x: x["score"], reverse=True)
+        print(f"\n✅ Search complete. Found {len(final_results)} final matches after re-ranking.")
         print("="*50)
+        return jsonify({"matches": final_results}), 200
+    except Exception as e:
+        print(f"❌ Error in /compare: {e}")
+        traceback.print_exc()
+        return jsonify({"error": str(e)}), 500
     except Exception as e:
         print(f"❌ Error in /compare: {e}")