Molbap HF Staff committed on
Commit
c7100d5
·
1 Parent(s): ae4e744
Files changed (3) hide show
  1. app.py +17 -0
  2. build_cache.py +146 -7
  3. modular_graph_and_candidates.py +39 -9
app.py CHANGED
@@ -52,6 +52,17 @@ def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimo
52
 
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
55
  def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
56
  return _fetch_from_cache_repo("graph", sim_method, threshold, multimodal)
57
 
@@ -96,6 +107,12 @@ with gr.Blocks(css=CUSTOM_CSS) as demo:
96
  timeline_json_out = gr.File(label="Download timeline.json")
97
 
98
  timeline_btn.click(lambda repo, thresh, multi: run_timeline(repo, thresh, multi, "jaccard"), [timeline_repo_in, timeline_thresh, timeline_multi_cb], [timeline_html_out, timeline_json_out])
 
 
 
 
 
 
99
 
100
  if __name__ == "__main__":
101
  demo.launch(allowed_paths=["static"])
 
52
 
53
 
54
 
55
def run_loc(sim_method: str, multimodal: bool):
    """Return an ``<iframe>`` snippet embedding the cached LOC-growth page.

    Downloads ``latest.json`` from the ``CACHE_REPO`` dataset repo, reads its
    ``"sha"`` entry, and uses it together with *sim_method* and *multimodal*
    to build the cache key of the pre-rendered ``loc/<key>.html`` file. That
    file's contents are passed through ``_escape_srcdoc`` and placed in the
    iframe's ``srcdoc`` attribute.
    """
    manifest_path = Path(
        hf_hub_download(repo_id=CACHE_REPO, filename="latest.json", repo_type="dataset")
    )
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Key layout: "<sha>/<sim_method>-m<0|1>".
    cache_key = f"{manifest['sha']}/{sim_method}-m{int(multimodal)}"

    page_path = Path(
        hf_hub_download(repo_id=CACHE_REPO, filename=f"loc/{cache_key}.html", repo_type="dataset")
    )
    srcdoc = _escape_srcdoc(page_path.read_text(encoding="utf-8"))
    return f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{srcdoc}"></iframe>'
64
+
65
+
66
  def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
67
  return _fetch_from_cache_repo("graph", sim_method, threshold, multimodal)
68
 
 
107
  timeline_json_out = gr.File(label="Download timeline.json")
108
 
109
  timeline_btn.click(lambda repo, thresh, multi: run_timeline(repo, thresh, multi, "jaccard"), [timeline_repo_in, timeline_thresh, timeline_multi_cb], [timeline_html_out, timeline_json_out])
110
+ with gr.Tab("LOC Growth"):
111
+ sim_radio2 = gr.Radio(["jaccard","embedding"], value="jaccard", label="Similarity metric")
112
+ multi_cb2 = gr.Checkbox(label="Only multimodal models")
113
+ go_loc = gr.Button("Show LOC growth")
114
+ loc_html = gr.HTML(show_label=False)
115
+ go_loc.click(run_loc, [sim_radio2, multi_cb2], loc_html)
116
 
117
  if __name__ == "__main__":
118
  demo.launch(allowed_paths=["static"])
build_cache.py CHANGED
@@ -2,21 +2,152 @@
2
  import os
3
  import io
4
  import json
 
5
  import subprocess
6
  import tempfile
7
  from pathlib import Path
8
  from datetime import datetime, timezone
 
9
  from huggingface_hub import HfApi
 
10
  from modular_graph_and_candidates import (
11
- build_graph_json, generate_html,
12
- build_timeline_json, generate_timeline_html
 
 
13
  )
14
 
15
- REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
16
  CACHE_REPO = "Molbap/hf_cached_embeds_log"
17
- MIN_THRESH = 0.1 # Minimum threshold for caching similarities
18
- MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1","true","True","YES","yes"}
19
  SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def main():
22
  tmp = Path(tempfile.mkdtemp())
@@ -24,6 +155,10 @@ def main():
24
  sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=tmp / "repo", text=True).strip()
25
  repo_path = tmp / "repo"
26
 
 
 
 
 
27
  graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
28
  timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
29
  graph_html = generate_html(graph)
@@ -38,10 +173,12 @@ def main():
38
  "updated_utc": datetime.now(timezone.utc).isoformat(),
39
  "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
40
  "paths": {
41
- "graph_json": f"graph/{key}.json",
42
- "graph_html": f"graph/{key}.html",
43
  "timeline_json": f"timeline/{key}.json",
44
  "timeline_html": f"timeline/{key}.html",
 
 
45
  },
46
  }
47
 
@@ -58,6 +195,8 @@ def main():
58
  put(f"graph/{key}.html", graph_html)
59
  put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":")))
60
  put(f"timeline/{key}.html", timeline_html)
 
 
61
  put("latest.json", json.dumps(latest, separators=(",", ":")))
62
 
63
  if __name__ == "__main__":
 
2
  import os
3
  import io
4
  import json
5
+ import tarfile
6
  import subprocess
7
  import tempfile
8
  from pathlib import Path
9
  from datetime import datetime, timezone
10
+
11
  from huggingface_hub import HfApi
12
+
13
  from modular_graph_and_candidates import (
14
+ build_graph_json,
15
+ generate_html,
16
+ build_timeline_json,
17
+ generate_timeline_html,
18
  )
19
 
20
+ REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
21
  CACHE_REPO = "Molbap/hf_cached_embeds_log"
22
+ MIN_THRESH = 0.1
23
+ MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1", "true", "True", "YES", "yes"}
24
  SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
25
+ MODULAR_CUTOFF_ISO = "2024-05-31"
26
+
27
def _run(cwd: Path, *args: str) -> str:
    """Run ``git <args>`` inside *cwd* and return its standard output as text.

    Args:
        cwd: Directory in which the git command is executed.
        *args: Arguments passed to ``git`` (sub-command first).

    Returns:
        The captured stdout of the command.

    Raises:
        RuntimeError: If git exits with a non-zero status. The message names
            the failing command and exit code and carries up to 400
            characters of git's output.
        subprocess.TimeoutExpired: If the command runs longer than 1200 s.
    """
    proc = subprocess.run(
        ["git", *args], cwd=cwd, text=True, capture_output=True, timeout=1200
    )
    if proc.returncode != 0:
        # git usually reports on stderr; fall back to stdout so the error is
        # never an empty string.
        detail = (proc.stderr or proc.stdout or "").strip()[:400] or "no output"
        raise RuntimeError(
            f"git {' '.join(args)} failed (exit {proc.returncode}): {detail}"
        )
    return proc.stdout
32
+
33
+ def _count_lines(text: str) -> int:
34
+ return text.count("\n") + (1 if text and not text.endswith("\n") else 0)
35
+
36
def _zero_loc_point(sha: str, date_iso: str) -> dict:
    """Return the all-zero placeholder point used when a commit cannot be archived."""
    return {
        "sha": sha,
        "date": date_iso,
        "loc_modeling_all": 0,
        "loc_modular": 0,
        "loc_modeling_included": 0,
        "effective_loc": 0,
        "n_models_with_modular": 0,
    }


def _tally_model_loc(archive: bytes) -> tuple:
    """Sum line counts of ``modeling_*``/``modular_*`` ``.py`` files per model in a tar archive.

    Returns a ``(modeling_by_model, modular_by_model)`` pair of dicts mapping
    the path component that follows ``models`` to a line count.
    """
    modeling_by_model: dict = {}
    modular_by_model: dict = {}
    with tarfile.open(fileobj=io.BytesIO(archive), mode="r:*") as tar:
        for member in tar.getmembers():
            name = member.name
            if not member.isfile() or not name.endswith(".py") or "/models/" not in name:
                continue
            # "/models/" in name guarantees a "models" component with a successor.
            parts = name.split("/")
            model = parts[parts.index("models") + 1]
            if not model:
                continue
            is_modular = "/modular_" in name
            if not is_modular and "/modeling_" not in name:
                continue
            handle = tar.extractfile(member)
            if handle is None:
                continue
            with handle:
                text = handle.read().decode("utf-8", errors="ignore")
            bucket = modular_by_model if is_modular else modeling_by_model
            bucket[model] = bucket.get(model, 0) + _count_lines(text)
    return modeling_by_model, modular_by_model


def _compute_loc_growth(repo: Path) -> dict:
    """Build a time series of modeling/modular line counts over the repo history.

    The history of ``src/transformers/models`` reachable from ``HEAD`` is
    listed oldest-first. When it has more than 500 commits it is thinned to
    roughly 300 evenly spaced ones, and the newest commit is always kept so
    the series ends at the current state. Each selected commit is exported
    with ``git archive`` and its ``modeling_*``/``modular_*`` Python files are
    line-counted per model.

    "Effective" LOC counts modular files plus the modeling files of models
    that have no modular file.

    Args:
        repo: Path to a git checkout (may be shallow; history is fetched).

    Returns:
        ``{"series": [...], "cutoff": MODULAR_CUTOFF_ISO}`` where each series
        entry has the keys ``sha``, ``date``, ``loc_modeling_all``,
        ``loc_modular``, ``loc_modeling_included``, ``effective_loc`` and
        ``n_models_with_modular``. Commits whose archive fails or times out
        are recorded as all-zero points.

    Raises:
        RuntimeError: If the fallback fetch or the ``git log`` call fails.
    """
    try:
        _run(repo, "fetch", "--unshallow", "--tags", "--prune")
    except (RuntimeError, subprocess.SubprocessError):
        # --unshallow fails on an already-complete clone; a deep fetch covers both cases.
        _run(repo, "fetch", "--depth=100000", "--tags", "--prune")

    pathspec = "src/transformers/models"
    log_output = _run(repo, "log", "--reverse", "--format=%H|%cI", "HEAD", "--", pathspec)
    commits = []
    for line in log_output.splitlines():
        sha, sep, date_iso = line.partition("|")
        if sep:
            commits.append((sha, date_iso))

    total = len(commits)
    if total > 500:
        step = max(1, total // 300)
        sampled = commits[::step]
        # Strided sampling can skip the newest commit; keep it so the series
        # always ends at HEAD's state.
        if sampled[-1] != commits[-1]:
            sampled.append(commits[-1])
        commits = sampled

    series = []
    for sha, date_iso in commits:
        try:
            proc = subprocess.run(
                ["git", "archive", sha, "--", pathspec],
                cwd=repo, capture_output=True, timeout=180,
            )
        except subprocess.TimeoutExpired:
            proc = None
        if proc is None or proc.returncode != 0 or not proc.stdout:
            # Best effort: record a zero point for this commit and carry on.
            series.append(_zero_loc_point(sha, date_iso))
            continue

        modeling_by_model, modular_by_model = _tally_model_loc(proc.stdout)

        modeling_all = sum(modeling_by_model.values())
        modular_loc = sum(modular_by_model.values())
        # Modeling files of models that also ship a modular file are excluded
        # from the "included"/"effective" totals.
        modeling_excluded = sum(modeling_by_model.get(m, 0) for m in modular_by_model)
        modeling_included = modeling_all - modeling_excluded

        series.append({
            "sha": sha,
            "date": date_iso,
            "loc_modeling_all": modeling_all,
            "loc_modular": modular_loc,
            "loc_modeling_included": modeling_included,
            "effective_loc": modeling_included + modular_loc,
            "n_models_with_modular": len(modular_by_model),
        })

    return {"series": series, "cutoff": MODULAR_CUTOFF_ISO}
119
+
120
+ def _loc_html(loc: dict) -> str:
121
+ data = json.dumps(loc["series"], separators=(",", ":"))
122
+ cutoff = loc["cutoff"]
123
+ return f"""<!doctype html><meta charset=utf-8>
124
+ <title>LOC growth</title>
125
+ <div id=chart style="height:60vh;width:90vw;margin:2rem auto;"></div>
126
+ <script src="https://cdn.jsdelivr.net/npm/apexcharts"></script>
127
+ <script>
128
+ const raw={data};
129
+ const xs=raw.map(d=>new Date(d.date).getTime());
130
+ const eff=raw.map(d=>d.effective_loc);
131
+ const mod=raw.map(d=>d.loc_modular);
132
+ const mdl_all=raw.map(d=>d.loc_modeling_all);
133
+ const mdl_inc=raw.map(d=>d.loc_modeling_included);
134
+ const cutoffTs=new Date("{cutoff}T00:00:00Z").getTime();
135
+ const opts={{
136
+ chart:{{type:"line",height:"100%"}},
137
+ series:[
138
+ {{name:"Effective LOC",data:xs.map((t,i)=>[t,eff[i]])}},
139
+ {{name:"Modular LOC",data:xs.map((t,i)=>[t,mod[i]])}},
140
+ {{name:"Modeling LOC (all)",data:xs.map((t,i)=>[t,mdl_all[i]])}},
141
+ {{name:"Modeling LOC (included)",data:xs.map((t,i)=>[t,mdl_inc[i]])}}
142
+ ],
143
+ xaxis:{{type:"datetime"}},
144
+ yaxis:{{labels:{{formatter:v=>Math.round(v)}}}},
145
+ stroke:{{width:2}},
146
+ tooltip:{{shared:true,x:{{format:"yyyy-MM-dd"}}}},
147
+ annotations:{{xaxis:[{{x:cutoffTs,borderColor:"#e11d48",label:{{text:"2024-05-31 modular",style:{{color:"#fff",background:"#e11d48"}}}}}}]}}
148
+ }};
149
+ new ApexCharts(document.getElementById("chart"),opts).render();
150
+ </script>"""
151
 
152
  def main():
153
  tmp = Path(tempfile.mkdtemp())
 
155
  sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=tmp / "repo", text=True).strip()
156
  repo_path = tmp / "repo"
157
 
158
+ loc_growth = _compute_loc_growth(repo_path)
159
+ loc_json_str = json.dumps(loc_growth, separators=(",", ":"))
160
+ loc_html_str = _loc_html(loc_growth)
161
+
162
  graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
163
  timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
164
  graph_html = generate_html(graph)
 
173
  "updated_utc": datetime.now(timezone.utc).isoformat(),
174
  "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
175
  "paths": {
176
+ "graph_json": f"graph/{key}.json",
177
+ "graph_html": f"graph/{key}.html",
178
  "timeline_json": f"timeline/{key}.json",
179
  "timeline_html": f"timeline/{key}.html",
180
+ "loc_json": f"loc/{key}.json",
181
+ "loc_html": f"loc/{key}.html",
182
  },
183
  }
184
 
 
195
  put(f"graph/{key}.html", graph_html)
196
  put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":")))
197
  put(f"timeline/{key}.html", timeline_html)
198
+ put(f"loc/{key}.json", loc_json_str)
199
+ put(f"loc/{key}.html", loc_html_str)
200
  put("latest.json", json.dumps(latest, separators=(",", ":")))
201
 
202
  if __name__ == "__main__":
modular_graph_and_candidates.py CHANGED
@@ -681,7 +681,7 @@ svg{ width:100vw; height:100vh; }
681
  fill:var(--muted);
682
  pointer-events:none;
683
  text-anchor:middle;
684
- font-size:10px;
685
  paint-order:stroke fill;
686
  stroke:var(--bg);
687
  stroke-width:2px;
@@ -695,7 +695,7 @@ svg{ width:100vw; height:100vh; }
695
  position:fixed; top:18px; left:18px;
696
  background:rgba(255,255,255,.92);
697
  padding:18px 28px; border-radius:10px; border:1.5px solid #bbb;
698
- font-size:18px; box-shadow:0 2px 8px rgba(0,0,0,.08);
699
  }
700
  @media (prefers-color-scheme: dark){
701
  #legend{ background:rgba(20,22,25,.92); color:#e8e8e8; border-color:#444; }
@@ -740,7 +740,7 @@ node.filter(d => d.cls !== 'base').append('circle').attr('r', d => 20*d.sz);
740
  node.append('text')
741
  .attr('class','node-label')
742
  .attr('dy','-2.4em')
743
- .style('font-size', d => d.cls === 'base' ? '50px' : '45px')
744
  .style('font-weight', d => d.cls === 'base' ? 'bold' : 'normal')
745
  .text(d => d.id);
746
 
@@ -837,7 +837,7 @@ svg{ width:100vw; height:100vh; }
837
  pointer-events:none;
838
  text-anchor:middle;
839
  font-weight:600;
840
- font-size:25px;
841
  paint-order:stroke fill;
842
  stroke:var(--outline);
843
  stroke-width:3px;
@@ -894,25 +894,39 @@ svg{ width:100vw; height:100vh; }
894
 
895
  .timeline-label {
896
  fill: var(--muted);
897
- font-size: 20px;
898
  font-weight: 600;
899
  text-anchor: middle;
900
  }
901
 
902
  .timeline-month-label {
903
  fill: var(--muted);
904
- font-size: 16px;
905
  font-weight: 400;
906
  text-anchor: middle;
907
  opacity: 0.7;
908
  }
909
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
910
  /* Enhanced controls panel */
911
  #controls{
912
  position:fixed; top:20px; left:20px;
913
  background:rgba(255,255,255,.95);
914
  padding:20px 26px; border-radius:12px; border:1.5px solid #e0e0e0;
915
- font-size:14px; box-shadow:0 4px 16px rgba(0,0,0,.12);
916
  z-index: 100;
917
  backdrop-filter: blur(8px);
918
  max-width: 280px;
@@ -1021,6 +1035,22 @@ if (timeExtent[0] && timeExtent[1]) {
1021
  .attr('x', d => timeScale(d))
1022
  .attr('y', timelineY + 45)
1023
  .text(d => d.toLocaleDateString('en', { month: 'short' }));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1024
  }
1025
 
1026
  function handleZoom(event) {
@@ -1082,7 +1112,7 @@ if (timeScale) {
1082
  }
1083
  // Place undated models at the end
1084
  return timeScale.range()[1] + 100;
1085
- }).strength(0.95));
1086
  }
1087
 
1088
  // Organized Y positioning using lanes instead of random spread
@@ -1161,7 +1191,7 @@ TIMELINE_HTML = """
1161
  <div style='font-weight:600; margin-bottom:8px;'>Chronological Timeline</div>
1162
  🟡 base<br>🔵 modular<br>🔴 candidate<br>
1163
  <label><input type="checkbox" id="toggleRed" checked> Show candidates</label>
1164
- <div style='margin-top:10px; font-size:11px; color:var(--muted);'>
1165
  Models positioned by creation date<br>
1166
  Scroll & zoom to explore timeline
1167
  </div>
 
681
  fill:var(--muted);
682
  pointer-events:none;
683
  text-anchor:middle;
684
+ font-size:12px;
685
  paint-order:stroke fill;
686
  stroke:var(--bg);
687
  stroke-width:2px;
 
695
  position:fixed; top:18px; left:18px;
696
  background:rgba(255,255,255,.92);
697
  padding:18px 28px; border-radius:10px; border:1.5px solid #bbb;
698
+ font-size:22px; box-shadow:0 2px 8px rgba(0,0,0,.08);
699
  }
700
  @media (prefers-color-scheme: dark){
701
  #legend{ background:rgba(20,22,25,.92); color:#e8e8e8; border-color:#444; }
 
740
  node.append('text')
741
  .attr('class','node-label')
742
  .attr('dy','-2.4em')
743
+ .style('font-size', d => d.cls === 'base' ? '160px' : '120px')
744
  .style('font-weight', d => d.cls === 'base' ? 'bold' : 'normal')
745
  .text(d => d.id);
746
 
 
837
  pointer-events:none;
838
  text-anchor:middle;
839
  font-weight:600;
840
+ font-size:30px;
841
  paint-order:stroke fill;
842
  stroke:var(--outline);
843
  stroke-width:3px;
 
894
 
895
  .timeline-label {
896
  fill: var(--muted);
897
+ font-size: 50px;
898
  font-weight: 600;
899
  text-anchor: middle;
900
  }
901
 
902
  .timeline-month-label {
903
  fill: var(--muted);
904
+ font-size: 40px;
905
  font-weight: 400;
906
  text-anchor: middle;
907
  opacity: 0.7;
908
  }
909
 
910
+ .modular-milestone {
911
+ stroke: #ff6b35;
912
+ stroke-width: 3px;
913
+ stroke-opacity: 0.8;
914
+ stroke-dasharray: 5,5;
915
+ }
916
+
917
+ .modular-milestone-label {
918
+ fill: #ff6b35;
919
+ font-size: 35px;
920
+ font-weight: 600;
921
+ text-anchor: middle;
922
+ }
923
+
924
  /* Enhanced controls panel */
925
  #controls{
926
  position:fixed; top:20px; left:20px;
927
  background:rgba(255,255,255,.95);
928
  padding:20px 26px; border-radius:12px; border:1.5px solid #e0e0e0;
929
+ font-size:17px; box-shadow:0 4px 16px rgba(0,0,0,.12);
930
  z-index: 100;
931
  backdrop-filter: blur(8px);
932
  max-width: 280px;
 
1035
  .attr('x', d => timeScale(d))
1036
  .attr('y', timelineY + 45)
1037
  .text(d => d.toLocaleDateString('en', { month: 'short' }));
1038
+
1039
+ // Modular logic milestone marker - May 31, 2024
1040
+ const modularDate = new Date(2024, 4, 31);
1041
+ timelineG.append('line')
1042
+ .attr('class', 'modular-milestone')
1043
+ .attr('x1', timeScale(modularDate))
1044
+ .attr('y1', MARGIN.top)
1045
+ .attr('x2', timeScale(modularDate))
1046
+ .attr('y2', H - MARGIN.bottom);
1047
+
1048
+ timelineG.append('text')
1049
+ .attr('class', 'modular-milestone-label')
1050
+ .attr('x', timeScale(modularDate))
1051
+ .attr('y', MARGIN.top - 10)
1052
+ .attr('text-anchor', 'middle')
1053
+ .text('Modular Logic Added');
1054
  }
1055
 
1056
  function handleZoom(event) {
 
1112
  }
1113
  // Place undated models at the end
1114
  return timeScale.range()[1] + 100;
1115
+ }).strength(0.75));
1116
  }
1117
 
1118
  // Organized Y positioning using lanes instead of random spread
 
1191
  <div style='font-weight:600; margin-bottom:8px;'>Chronological Timeline</div>
1192
  🟡 base<br>🔵 modular<br>🔴 candidate<br>
1193
  <label><input type="checkbox" id="toggleRed" checked> Show candidates</label>
1194
+ <div style='margin-top:10px; font-size:13px; color:var(--muted);'>
1195
  Models positioned by creation date<br>
1196
  Scroll & zoom to explore timeline
1197
  </div>