JohnKouf committed on
Commit
91e9e23
·
verified ·
1 Parent(s): 0115859

Update Similarity.py

Browse files
Files changed (1) hide show
  1. Similarity.py +3 -47
Similarity.py CHANGED
@@ -1,15 +1,8 @@
1
- import nltk
2
- nltk.data.path.append("./nltk_data")
3
- from sentence_transformers import SentenceTransformer, util
4
 
5
  class Similarity:
6
  def __init__(self):
7
  self.model = None
8
- # Download punkt tokenizer once, suppress if already present
9
- try:
10
- nltk.data.find('tokenizers/punkt')
11
- except LookupError:
12
- nltk.download('punkt', download_dir='./nltk_data')
13
 
14
  def load_model(self):
15
  if self.model is None:
@@ -17,43 +10,6 @@ class Similarity:
17
  self.model = SentenceTransformer("lighteternal/stsb-xlm-r-greek-transfer")
18
  print("Model loaded.")
19
 
20
- def chunk_text(self, text, chunk_size=1400, overlap_size=200):
21
- sentences = nltk.sent_tokenize(text)
22
- chunks = []
23
- current_chunk = ""
24
- for sentence in sentences:
25
- if len(current_chunk) + len(sentence) <= chunk_size:
26
- current_chunk += " " + sentence if current_chunk else sentence
27
- else:
28
- chunks.append(current_chunk)
29
- # Start the next chunk with overlap
30
- current_chunk = sentence[:overlap_size] + sentence[overlap_size:]
31
- if current_chunk:
32
- chunks.append(current_chunk)
33
- return chunks
34
-
35
- def get_sim_text(self, text, claim_embedding, min_threshold=0.4, chunk_size=1500):
36
  self.load_model()
37
-
38
- if not text:
39
- return []
40
-
41
- filtered_results = []
42
- chunks = self.chunk_text(text, chunk_size)
43
- if not chunks:
44
- return []
45
-
46
- chunk_embeddings = self.model.encode(
47
- chunks, convert_to_tensor=True, show_progress_bar=False
48
- )
49
- chunk_similarities = util.cos_sim(claim_embedding, chunk_embeddings)
50
-
51
- for chunk, similarity in zip(chunks, chunk_similarities[0]):
52
- if similarity >= min_threshold:
53
- print(chunk)
54
- print()
55
- print(similarity)
56
- print("--------------------------------------------------")
57
- filtered_results.append(chunk)
58
-
59
- return filtered_results
 
1
+ from sentence_transformers import SentenceTransformer
 
 
2
 
3
  class Similarity:
4
  def __init__(self):
5
  self.model = None
 
 
 
 
 
6
 
7
  def load_model(self):
8
  if self.model is None:
 
10
  self.model = SentenceTransformer("lighteternal/stsb-xlm-r-greek-transfer")
11
  print("Model loaded.")
12
 
13
+ def embed_text(self, text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  self.load_model()
15
+ return self.model.encode(text, convert_to_tensor=True)