import pickle

from config import (
    MODEL_NAME,
    SETENCE_EMBEDDING_FILE,
    SETENCE_SIMILARITY_FILE,
    SAMPLE_DATA_FILE,
    SUBJECT_DATA_FILE,
)
from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper
from data_lib.subject_data import SubjectData
from data_lib.sample_name_data import SampleNameData
from clustering_lib.sentence_clustering_lib import SentenceClusteringLib
from data_lib.base_data import COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME


class SentenceTransformerService:
    """Loads the sentence-transformer model and supporting data exactly once and holds them in memory."""

    def __init__(self):
        self.sentenceTransformerHelper = None
        self.dic_standard_subject = None
        self.sample_name_sentence_embeddings = None
        self.sample_name_sentence_similarities = None
        self.sampleData = None
        self.sentence_clustering_lib = None
        self.name_groups = None

    def load_model_data(self):
        """Load model and data only once at startup."""
        if self.sentenceTransformerHelper is not None:
            print("Model already loaded. Skipping reload.")
            return  # Don't reload if the model is already loaded

        print("Loading models and data...")

        # Load sentence transformer model
        self.sentenceTransformerHelper = SentenceTransformerHelper(
            convert_to_zenkaku_flag=True, replace_words=None, keywords=None
        )
        self.sentenceTransformerHelper.load_model_by_name(MODEL_NAME)

        # Load standard subject dictionary
        self.dic_standard_subject = SubjectData.create_standard_subject_dic_from_file(SUBJECT_DATA_FILE)

        # Load pre-computed embeddings and similarities
        with open(SETENCE_EMBEDDING_FILE, "rb") as f:
            self.sample_name_sentence_embeddings = pickle.load(f)
        with open(SETENCE_SIMILARITY_FILE, "rb") as f:
            self.sample_name_sentence_similarities = pickle.load(f)

        # Load and process sample data
        self.sampleData = SampleNameData()
        self.sampleData.load_data_from_csv(SAMPLE_DATA_FILE)
        self.sampleData.process_data()

        # Create sentence clusters from the pre-computed embeddings
        self.sentence_clustering_lib = SentenceClusteringLib(self.sample_name_sentence_embeddings)
        best_name_eps = 0.07
        self.name_groups, _ = self.sentence_clustering_lib.create_sentence_cluster(best_name_eps)

        # Attach cluster labels to the sample data and build its search tree
        self.sampleData._create_key_column(
            COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME
        )
        self.sampleData.set_name_sentence_labels(self.name_groups)
        self.sampleData.build_search_tree()

        print("Models and data loaded successfully")


# Global instance (singleton)
sentence_transformer_service = SentenceTransformerService()
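

# Minimal usage sketch (an assumption about how this module is consumed, not
# existing project code): warm the singleton once at application startup, then
# reuse the preloaded artifacts from any module that imports
# `sentence_transformer_service`.
if __name__ == "__main__":
    # First call loads the model, embeddings, and sample data from disk.
    sentence_transformer_service.load_model_data()
    # A repeated call is a no-op because the model is already in memory.
    sentence_transformer_service.load_model_data()
    # The preloaded artifacts are now available on the shared instance.
    print(
        "Embeddings loaded:",
        sentence_transformer_service.sample_name_sentence_embeddings is not None,
    )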