Merge branch 'main' of https://huggingface.co/spaces/Molbap/transformers-modular-refactor
modular_graph_and_candidates.py CHANGED
@@ -94,59 +94,21 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
             out[(m1, m2)] = s
     return out
 
-#@spaces.GPU
-def old_embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
-    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
-    model.max_seq_length = 8192  # truncate overly long modeling files
-    texts = {}
-
-    for name in tqdm(missing, desc="Reading modeling files"):
-        code = ""
-        for py in (models_root / name).rglob("modeling_*.py"):
-            try:
-                code += _strip_source(py.read_text(encoding="utf-8")) + "\n"
-            except Exception:
-                continue
-        texts[name] = code.strip() or " "
-
-    names = list(texts)
-    all_embeddings = []
-
-    print("Encoding embeddings...")
-    batch_size = 2
-    for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
-        batch = [texts[n] for n in names[i:i+batch_size]]
-        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
-        all_embeddings.append(emb)
-
-    embeddings = np.vstack(all_embeddings)  # [N, D]
-
-    print("Computing pairwise similarities...")
-    sims = embeddings @ embeddings.T
-
-    out = {}
-    for i in range(len(names)):
-        for j in range(i + 1, len(names)):
-            s = sims[i, j]
-            if s >= thr:
-                out[(names[i], names[j])] = float(s)
-    return out
 
-
+@spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
     model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
 
-    # Hard-cap by backend max positions (prevents IndexError in self.wpe)
     try:
         cfg = model[0].auto_model.config
         pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
     except Exception:
-        pos_limit = 1024
+        pos_limit = 1024
 
-    seq_len = min(pos_limit, 2048)
-    model.max_seq_length = seq_len
-    model[0].max_seq_length = seq_len
-    model[0].tokenizer.model_max_length = seq_len
+    seq_len = min(pos_limit, 2048)
+    model.max_seq_length = seq_len
+    model[0].max_seq_length = seq_len
+    model[0].tokenizer.model_max_length = seq_len
 
     texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
@@ -168,8 +130,6 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)
 
-    # Cosine similarity requires normalized vectors; SentenceTransformers doesn't always return them normalized
-    import numpy as np
     embeddings = np.vstack(all_embeddings).astype(np.float32)
     norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
     embeddings = embeddings / norms
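
Why the sequence-length cap in the first hunk matters: positions beyond the backbone's learned position-embedding table index out of range (the IndexError in self.wpe that the deleted comment referred to), so the encoder is clamped to min(pos_limit, 2048) tokens, with 1024 as a conservative fallback when the config exposes no limit. A minimal sketch of that clamping as a standalone helper; the helper name and the toy config class are hypothetical, not part of the repo:

def capped_seq_len(cfg, ceiling: int = 2048, fallback: int = 1024) -> int:
    # GPT-style configs expose n_positions; most others expose max_position_embeddings.
    limit = getattr(cfg, "n_positions", None) or getattr(cfg, "max_position_embeddings", None)
    return min(int(limit), ceiling) if limit else fallback

class ToyConfig:  # stand-in for model[0].auto_model.config
    n_positions = 1024

print(capped_seq_len(ToyConfig()))  # -> 1024, so max_seq_length stays within the embedding table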
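
The row-wise normalization kept at the end of the second hunk is what makes the later embeddings @ embeddings.T product a genuine cosine-similarity matrix; as the deleted comment noted, SentenceTransformers output is not guaranteed to be unit-norm. A toy NumPy sketch with made-up vectors, just to illustrate the effect:

import numpy as np

# Three fake "modeling file" embeddings: row 1 points the same way as row 0
# but with twice the magnitude, row 2 is orthogonal to both.
embeddings = np.array([
    [1.0, 2.0, 0.0, 0.5],
    [2.0, 4.0, 0.0, 1.0],
    [0.0, 0.0, 3.0, 0.0],
], dtype=np.float32)

norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12  # epsilon avoids division by zero
unit = embeddings / norms

sims = unit @ unit.T          # cosine similarities once every row is unit-length
print(np.round(sims, 3))      # rows 0 and 1 -> 1.0; either of them vs row 2 -> 0.0

Without this step the raw dot products scale with vector magnitude (10.5 for rows 0 and 1 here), so comparing them against a fixed thr threshold would not be meaningful.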