# NOTE(review): Hugging Face Spaces build banner captured with the source
# ("Spaces: Runtime error") — kept as a comment; not part of the program.
| import json | |
| import torch | |
| import numpy as np | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# Load once at import time.
# map_location="cpu" so a checkpoint saved on a GPU machine still loads on
# CPU-only hosts (the default would try to restore onto the original device).
WEIGHTS = torch.load("tensor.pt", map_location="cpu").detach().numpy()  # shape: (V, D)

# Explicit encoding: vocab files are UTF-8; don't depend on the locale default.
with open("vocab.json", "r", encoding="utf-8") as f:
    TOKEN2IDX = json.load(f)  # token (str) -> index (int or numeric str from JSON)

# Build reverse map: idx (as int) -> token (str).
# int(i) because JSON object values may arrive as strings.
IDX2TOKEN = {int(i): w for w, i in TOKEN2IDX.items()}
def chat(question: str) -> str:
    """Embedding Q&A stub.

    Tokenizes *question* by whitespace, looks up each known token's
    embedding, averages them, and returns the vocabulary token whose
    embedding is nearest (by cosine similarity) to that average.

    Args:
        question: Free-text user query.

    Returns:
        A formatted string naming the nearest vocabulary token, or a
        fallback message when no token is recognized.
    """
    # Simple whitespace tokenizer; you can improve this later.
    tokens = question.lower().split()

    # Map to indices, dropping unknowns. The int(...) cast mirrors the one
    # used to build IDX2TOKEN: vocab.json may store indices as strings, and
    # NumPy fancy indexing requires integers (strings would raise).
    idxs = [int(TOKEN2IDX[t]) for t in tokens if t in TOKEN2IDX]
    if not idxs:
        return "🤔 I don't recognize any of those words."

    # Average embedding vector, shape (D,).
    q_embed = np.mean(WEIGHTS[idxs], axis=0)

    # Cosine similarity against all vocab embeddings, computed directly in
    # NumPy (the sklearn helper is overkill for a single query vector).
    # Guard against division by zero for an all-zero row or query vector.
    denom = np.linalg.norm(WEIGHTS, axis=1) * np.linalg.norm(q_embed)
    denom = np.where(denom == 0.0, 1.0, denom)
    sims = (WEIGHTS @ q_embed) / denom

    best = int(np.argmax(sims))
    best_word = IDX2TOKEN.get(best, "<unknown>")
    return f"🗣️ Nearest concept: **{best_word}**"