JohnKouf committed
Commit a20a987 · verified · 1 Parent(s): ebe6af8

Update Similarity.py

Files changed (1)
  1. Similarity.py +15 -3
Similarity.py CHANGED
@@ -1,11 +1,21 @@
 import nltk
 nltk.data.path.append("./nltk_data")
 from sentence_transformers import SentenceTransformer, util
+
 class Similarity:
     def __init__(self):
-        self.model = SentenceTransformer("lighteternal/stsb-xlm-r-greek-transfer")
-        # Make sure nltk punkt tokenizer is downloaded
-        #nltk.download('punkt')
+        self.model = None
+        # Download punkt tokenizer once, suppress if already present
+        try:
+            nltk.data.find('tokenizers/punkt')
+        except LookupError:
+            nltk.download('punkt', download_dir='./nltk_data')
+
+    def load_model(self):
+        if self.model is None:
+            print("Loading SentenceTransformer model...")
+            self.model = SentenceTransformer("lighteternal/stsb-xlm-r-greek-transfer")
+            print("Model loaded.")
 
     def chunk_text(self, text, chunk_size=1400, overlap_size=200):
         sentences = nltk.sent_tokenize(text)
@@ -23,6 +33,8 @@ class Similarity:
         return chunks
 
     def get_sim_text(self, text, claim_embedding, min_threshold=0.4, chunk_size=1500):
+        self.load_model()
+
         if not text:
             return []
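
For reference, a minimal usage sketch of the lazy-loading behavior introduced by this commit. The class and method names are taken from the diff; the sample Greek strings and the idea of encoding the claim via the class's own model are assumptions, not part of this commit.

# Hypothetical usage sketch (not part of this commit).
from Similarity import Similarity

sim = Similarity()   # punkt is checked/downloaded here; no model weights are loaded yet
sim.load_model()     # explicit load; subsequent calls are no-ops since self.model is set

# Assumption: encode the claim with the same model the class now lazy-loads.
claim = "Η κλιματική αλλαγή επηρεάζει την Ελλάδα."
claim_embedding = sim.model.encode(claim, convert_to_tensor=True)

# get_sim_text() also calls load_model() internally, so the explicit call
# above is optional; per the diff it returns [] for empty input.
article_text = "Κείμενο άρθρου προς τεμαχισμό και σύγκριση με τον ισχυρισμό."
similar_chunks = sim.get_sim_text(article_text, claim_embedding, min_threshold=0.4)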