ai-lab / cbow_logic.py
ClemSummer's picture
moved both cbow and qwen to cache HF dataset
e497915
# cbow_logic.py
import gensim
import os
import argparse
from typing import List, Tuple
import shlex
class MeaningCalculator:
    """Evaluate word-arithmetic expressions against a gensim KeyedVectors model.

    An expression is a whitespace-separated sequence of terms and the
    operators ``+`` and ``-``, e.g. ``king - man + woman``.  Multi-word
    terms may be quoted shell-style: ``"new york" - city + capital``.
    """

    # Sentinel row returned (wrapped in a list) whenever the expression
    # cannot be evaluated; callers detect failures by this value.
    _INPUT_ERROR: Tuple[str, float] = ("InputError", 0.0)

    def __init__(self, model_path: str = "/models/cbow/cbow_model.kv"):
        """Load the keyed vectors stored at ``model_path``.

        Args:
            model_path: Filesystem path of the saved KeyedVectors file.

        Raises:
            FileNotFoundError: If nothing exists at ``model_path``.
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model not found at: {model_path}")
        # mmap='r' is forwarded to gensim's loader (memory-mapped, read-only
        # access to the stored arrays per the gensim documentation).
        self.model = gensim.models.KeyedVectors.load(model_path, mmap='r')

    def evaluate_expression(self, expression: str, topn: int = 10) -> List[Tuple[str, float]]:
        """Evaluate an expression like '"new york" - city + capital'.

        Terms following ``+`` (or appearing before any operator) are passed
        to the model as positive terms; terms following ``-`` are passed as
        negative terms.  An operator stays in effect until the next operator.

        Args:
            expression: The expression text; quoted terms are kept together.
            topn: Number of results requested from the model.

        Returns:
            Whatever ``self.model.most_similar`` returns for the parsed terms,
            or ``[("InputError", 0.0)]`` when the expression cannot be parsed
            (e.g. unbalanced quotes), contains no terms, or the model raises
            ``KeyError`` for one of the terms.
        """
        try:
            tokens = shlex.split(expression)  # Handles quoted terms properly
        except ValueError:
            # shlex raises ValueError on malformed input such as an
            # unclosed quote; report it like any other bad input.
            return [self._INPUT_ERROR]

        positive: List[str] = []
        negative: List[str] = []
        bucket = positive  # terms before any operator count as positive
        for token in tokens:
            if token == "+":
                bucket = positive
            elif token == "-":
                bucket = negative
            else:
                bucket.append(token)

        if not positive and not negative:
            # Empty, blank, or operator-only expression: nothing to query.
            return [self._INPUT_ERROR]

        try:
            return self.model.most_similar(positive=positive, negative=negative, topn=topn)
        except KeyError:
            return [self._INPUT_ERROR]
from gensim.models import KeyedVectors
if __name__ == "__main__":
    # Command-line entry point: parse the expression and model path,
    # evaluate the expression, then print each (word, score) row.
    cli = argparse.ArgumentParser(
        description="Evaluate word vector expressions using CBOW."
    )
    cli.add_argument(
        "expression",
        type=str,
        help="Expression like 'king - man + woman'",
    )
    cli.add_argument(
        "--model_path",
        type=str,
        default="./models/cbow_model.kv",
        help="Path to CBOW model",
    )
    args = cli.parse_args()

    # Evaluate before printing the header so that a failure to load the
    # model or evaluate the expression produces no partial report.
    ranked = MeaningCalculator(model_path=args.model_path).evaluate_expression(
        args.expression
    )

    print(f"\nExpression: {args.expression}\nTop Results:")
    for word, score in ranked:
        print(f" {word:<15} {score:.4f}")