Spaces:
Runtime error
Runtime error
Update Similarity.py
Browse files - Similarity.py +3 -47
Similarity.py
CHANGED
@@ -1,15 +1,8 @@
|
|
1 |
-
import nltk
|
2 |
-
nltk.data.path.append("./nltk_data")
|
3 |
-
from sentence_transformers import SentenceTransformer, util
|
4 |
|
5 |
class Similarity:
|
6 |
def __init__(self):
|
7 |
self.model = None
|
8 |
-
# Download punkt tokenizer once, suppress if already present
|
9 |
-
try:
|
10 |
-
nltk.data.find('tokenizers/punkt')
|
11 |
-
except LookupError:
|
12 |
-
nltk.download('punkt', download_dir='./nltk_data')
|
13 |
|
14 |
def load_model(self):
|
15 |
if self.model is None:
|
@@ -17,43 +10,6 @@ class Similarity:
|
|
17 |
self.model = SentenceTransformer("lighteternal/stsb-xlm-r-greek-transfer")
|
18 |
print("Model loaded.")
|
19 |
|
20 |
-
def
|
21 |
-
sentences = nltk.sent_tokenize(text)
|
22 |
-
chunks = []
|
23 |
-
current_chunk = ""
|
24 |
-
for sentence in sentences:
|
25 |
-
if len(current_chunk) + len(sentence) <= chunk_size:
|
26 |
-
current_chunk += " " + sentence if current_chunk else sentence
|
27 |
-
else:
|
28 |
-
chunks.append(current_chunk)
|
29 |
-
# Start the next chunk with overlap
|
30 |
-
current_chunk = sentence[:overlap_size] + sentence[overlap_size:]
|
31 |
-
if current_chunk:
|
32 |
-
chunks.append(current_chunk)
|
33 |
-
return chunks
|
34 |
-
|
35 |
-
def get_sim_text(self, text, claim_embedding, min_threshold=0.4, chunk_size=1500):
|
36 |
self.load_model()
|
37 |
-
|
38 |
-
if not text:
|
39 |
-
return []
|
40 |
-
|
41 |
-
filtered_results = []
|
42 |
-
chunks = self.chunk_text(text, chunk_size)
|
43 |
-
if not chunks:
|
44 |
-
return []
|
45 |
-
|
46 |
-
chunk_embeddings = self.model.encode(
|
47 |
-
chunks, convert_to_tensor=True, show_progress_bar=False
|
48 |
-
)
|
49 |
-
chunk_similarities = util.cos_sim(claim_embedding, chunk_embeddings)
|
50 |
-
|
51 |
-
for chunk, similarity in zip(chunks, chunk_similarities[0]):
|
52 |
-
if similarity >= min_threshold:
|
53 |
-
print(chunk)
|
54 |
-
print()
|
55 |
-
print(similarity)
|
56 |
-
print("--------------------------------------------------")
|
57 |
-
filtered_results.append(chunk)
|
58 |
-
|
59 |
-
return filtered_results
|
|
|
1 |
+
from sentence_transformers import SentenceTransformer
|
|
|
|
|
2 |
|
3 |
class Similarity:
|
4 |
def __init__(self):
|
5 |
self.model = None
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
def load_model(self):
|
8 |
if self.model is None:
|
|
|
10 |
self.model = SentenceTransformer("lighteternal/stsb-xlm-r-greek-transfer")
|
11 |
print("Model loaded.")
|
12 |
|
13 |
+
def embed_text(self, text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
self.load_model()
|
15 |
+
return self.model.encode(text, convert_to_tensor=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|