attempt checkpointing

modular_graph_and_candidates.py  CHANGED  (+52 -22)
@@ -95,7 +95,6 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
                 out[(m1, m2)] = s
     return out
 
-
 @spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
     model = SentenceTransformer("codesage/codesage-large-v2", device="cuda", trust_remote_code=True)
@@ -113,11 +112,10 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
 
     texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
-        # Skip models that cause GPU task aborts
         if any(skip in name.lower() for skip in ["mobilebert", "lxmert"]):
             print(f"Skipping {name} (causes GPU abort)")
             continue
-
+
         code = ""
         for py in (models_root / name).rglob("modeling_*.py"):
             try:
@@ -130,29 +128,54 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     all_embeddings = []
 
     print(f"Encoding embeddings for {len(names)} models...")
-    batch_size = 4  # …
-
-    for i in …
+    batch_size = 4  # keep your default
+
+    # ── checkpoint / resume ────────────────────────────────────────────────────
+    ckpt_path = models_root / "__emb_ckpt.npz"
+    start_idx = 0
+    emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
+
+    if ckpt_path.exists():
+        try:
+            ckpt = np.load(ckpt_path, allow_pickle=True)
+            ckpt_names = list(ckpt["names"])
+            if names[:len(ckpt_names)] == ckpt_names:
+                loaded = ckpt["embeddings"].astype(np.float32)
+                all_embeddings.append(loaded)
+                start_idx = len(ckpt_names)
+                print(f"Resuming from checkpoint at {start_idx}/{len(names)}")
+        except Exception as e:
+            print(f"⚠️  Failed to load checkpoint: {type(e).__name__}: {e}")
+    # ───────────────────────────────────────────────────────────────────────────
+
+    for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
         batch_names = names[i:i+batch_size]
         batch_texts = [texts[name] for name in batch_names]
-
+
         try:
             print(f"Processing batch: {batch_names}")
             emb = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
-            all_embeddings.append(emb)
-            print(f"✅ Completed batch of {len(batch_names)} models")
-
-            # Clear GPU cache every 3 batches to prevent memory accumulation
-            if i % (3 * batch_size) == 0 and torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()  # Force GPU sync
-                print(f"🧹 Cleared GPU cache after batch {i//batch_size + 1}")
-
         except Exception as e:
             print(f"⚠️  GPU worker error for batch {batch_names}: {type(e).__name__}: {e}")
-            …
-
-
+            emb = np.zeros((len(batch_names), emb_dim), dtype=np.float32)
+
+        all_embeddings.append(emb)
+
+        # save checkpoint after each batch
+        try:
+            cur = np.vstack(all_embeddings).astype(np.float32)
+            np.savez(
+                ckpt_path,
+                embeddings=cur,
+                names=np.array(names[:i+len(batch_names)], dtype=object),
+            )
+        except Exception as e:
+            print(f"⚠️  Failed to write checkpoint: {type(e).__name__}: {e}")
+
+        if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+            print(f"🧹 Cleared GPU cache after batch {(i - start_idx)//batch_size + 1}")
 
     embeddings = np.vstack(all_embeddings).astype(np.float32)
     norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
@@ -162,19 +185,26 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     sims_mat = embeddings @ embeddings.T
 
     out = {}
-    matrix_size = embeddings.shape[0]
-    processed_names = names[:matrix_size]
-
+    matrix_size = embeddings.shape[0]
+    processed_names = names[:matrix_size]
     for i in range(matrix_size):
         for j in range(i + 1, matrix_size):
             s = float(sims_mat[i, j])
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
+
+    # best-effort cleanup
+    try:
+        ckpt_path.unlink()
+    except Exception:
+        pass
+
     return out
 
 
 
 
+
 # ────────────────────────────────────────────────────────────────────────────────
 # 2)  Scan *modular_*.py* files to build an import-dependency graph
 #     → only **modeling_*** imports are considered (skip configuration / processing)