Spaces:
Runtime error
Runtime error
Commit
·
4ee0173
1
Parent(s):
5c906aa
update "how to train at 100mbps"
Browse files- app.py +44 -8
- st_helpers.py +2 -7
- static/content_style.css +4 -1
app.py
CHANGED
@@ -19,11 +19,11 @@ make_header()
|
|
19 |
content_text(f"""
|
20 |
There was a time when you could comfortably train state-of-the-art vision and language models at home on your workstation.
|
21 |
The first convolutional neural net to beat ImageNet
|
22 |
-
(
|
23 |
was trained for 5-6 days on two gamer-grade GPUs. In contrast, today's TOP-1 ImageNet model
|
24 |
-
(
|
25 |
takes 20,000 TPU-v3 days. And things are even worse in the NLP world: training
|
26 |
-
|
27 |
with 8x A100 would take decades.""")
|
28 |
|
29 |
content_text(f"""
|
@@ -34,12 +34,49 @@ All it takes is for a bunch of us to come together. In fact, we're doing it righ
|
|
34 |
draw_current_progress()
|
35 |
|
36 |
content_text(f"""
|
37 |
-
We're training a model similar to
|
38 |
that is, a transformer "language model" that generates images from a text description.
|
39 |
-
It is trained on
|
40 |
the world's largest openly available image-text-pair dataset with 400 million samples. Our model is based on
|
41 |
-
the
|
42 |
-
by
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
|
45 |
content_title("How do I join?")
|
@@ -71,5 +108,4 @@ content_text("<b> TODO </b> General Story That Weaves Together Three Tabs Below
|
|
71 |
|
72 |
make_tabs()
|
73 |
|
74 |
-
content_text("<b> TODO UPDATE")
|
75 |
make_footer()
|
|
|
19 |
content_text(f"""
|
20 |
There was a time when you could comfortably train state-of-the-art vision and language models at home on your workstation.
|
21 |
The first convolutional neural net to beat ImageNet
|
22 |
+
({cite("AlexNet", "https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf")})
|
23 |
was trained for 5-6 days on two gamer-grade GPUs. In contrast, today's TOP-1 ImageNet model
|
24 |
+
({cite("CoAtNet", "https://arxiv.org/abs/2106.04803")})
|
25 |
takes 20,000 TPU-v3 days. And things are even worse in the NLP world: training
|
26 |
+
{cite("GPT‑3", "https://arxiv.org/abs/2005.14165")} on a top-tier server
|
27 |
with 8x A100 would take decades.""")
|
28 |
|
29 |
content_text(f"""
|
|
|
34 |
draw_current_progress()
|
35 |
|
36 |
content_text(f"""
|
37 |
+
We're training a model similar to {cite("OpenAI DALL-E", "https://openai.com/blog/dall-e/")},
|
38 |
that is, a transformer "language model" that generates images from a text description.
|
39 |
+
It is trained on {cite("LAION-400M", "https://laion.ai/laion-400-open-dataset/")},
|
40 |
the world's largest openly available image-text-pair dataset with 400 million samples. Our model is based on
|
41 |
+
the {cite("dalle‑pytorch", "https://github.com/lucidrains/DALLE-pytorch")} implementation
|
42 |
+
by {cite("Phil Wang", "https://github.com/lucidrains")} with a few tweaks to make it communication-efficient.
|
43 |
+
""", vspace_after=8)
|
44 |
+
|
45 |
+
|
46 |
+
with st.expander("How to train efficiently over the internet?"):
|
47 |
+
content_text(f"""
|
48 |
+
Modern distributed training algorithms are designed for HPC networks with 10-100 gigabit per second bandwidth.
|
49 |
+
In turn, a typical Internet connection runs at 10-100 megabits per second: that’s three orders of magnitude slower.
|
50 |
+
To make distributed training over the Internet efficient, you need to win back these three orders of magnitude.
|
51 |
+
""")
|
52 |
+
content_text(f"""
|
53 |
+
This may seem daunting at first, but in reality, DL researchers have already made all the necessary pieces for solving this puzzle:
|
54 |
+
<table style="border: 0px;"><tbody style="border: 0px;">
|
55 |
+
<tr><td> Speed-up (AllReduce)<br> </td> <td>Existing technique</td></tr>
|
56 |
+
<tr><td class=centered><strong>4-16x</strong></td><td>
|
57 |
+
<strong>Large-batch training:</strong> {cite("You et al. (2019)", "https://arxiv.org/abs/1904.00962")} proposed a way to train neural networks efficiently with larger batches and, hence, fewer communication rounds.
|
58 |
+
</td></tr>
|
59 |
+
<tr><td class=centered><strong>4-64x</strong></td><td>
|
60 |
+
<strong>Gradient Compression:</strong> from simple {cite("8-bit quantization", "https://arxiv.org/abs/1511.04561")}
|
61 |
+
to advanced techniques such as {cite("Deep Gradient Compression", "https://arxiv.org/abs/1712.01887")},
|
62 |
+
{cite("PowerSGD", "https://arxiv.org/abs/1905.13727")}, {cite("1-bit Adam", "https://arxiv.org/abs/2102.02888")},
|
63 |
+
and many others. As a rule of thumb, you can safely reduce communication by 16-64x. More extreme compression is often
|
64 |
+
possible, but it may affect stability or final quality.
|
65 |
+
</td></tr>
|
66 |
+
<tr><td class=centered><strong>4-24x</strong></td><td>
|
67 |
+
<strong>Parameter sharing:</strong> reusing parameters between model layers results in a model with fewer parameters,
|
68 |
+
and hence, fewer gradients to communicate. {cite("Lan et al. (2019)", "https://arxiv.org/abs/1909.11942")} and
|
69 |
+
{cite("Xue et al. (2021)", "https://arxiv.org/pdf/2107.11817.pdf")} propose efficient parameter sharing techniques
|
70 |
+
for NLP and vision.
|
71 |
+
</td></tr>
|
72 |
+
<tr><td class=centered><strong>1.5-2x</strong></td><td>
|
73 |
+
<strong>Overlapping computation with communication:</strong> running network communication in the background while
|
74 |
+
computing the next portion of gradients. This is a {cite("long-standing trick from HPC", "https://ur.booksc.eu/book/1624068/2d0506")}
|
75 |
+
that was recently adapted for DL training. {cite("Ren et al. (2021)", "https://arxiv.org/abs/2101.06840")} show that
|
76 |
+
updating parameters in the background while computing the next batch of gradients does not hurt convergence.
|
77 |
+
</td></tr>
|
78 |
+
</tbody></table>
|
79 |
+
""")
|
80 |
|
81 |
|
82 |
content_title("How do I join?")
|
|
|
108 |
|
109 |
make_tabs()
|
110 |
|
|
|
111 |
make_footer()
|
st_helpers.py
CHANGED
@@ -50,10 +50,5 @@ def content_text(text: str, vspace_before: int = 0, vspace_after: int = 0):
|
|
50 |
f'{text}</div><center>',
|
51 |
unsafe_allow_html=True)
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
def cite(tag):
|
58 |
-
CITATIONS.setdefault(tag, len(CITATIONS) + 1)
|
59 |
-
return f" [{CITATIONS[tag]}]"
|
|
|
50 |
f'{text}</div><center>',
|
51 |
unsafe_allow_html=True)
|
52 |
|
53 |
+
def cite(tag, link):
    """Return an HTML anchor that displays ``tag`` and opens ``link`` in a new tab.

    Args:
        tag: Visible link text. It is inserted into the markup unchanged, so
            it may itself contain inline HTML.
        link: Target URL. It is HTML-escaped before being placed inside the
            double-quoted ``href`` attribute, so characters such as ``"`` or
            ``&`` cannot terminate the attribute or produce invalid markup.

    Returns:
        A string holding a single ``<a>`` element with ``target="_blank"``
        and ``rel="noopener noreferrer"``.
    """
    # Local import keeps this helper self-contained.
    from html import escape

    # str() mirrors the original f-string formatting for non-string inputs.
    href = escape(str(link), quote=True)
    return f"""<a target="_blank" rel="noopener noreferrer" href="{href}">{tag}</a>"""
|
|
|
|
|
|
|
|
|
|
static/content_style.css
CHANGED
@@ -1,11 +1,14 @@
|
|
1 |
.faded {
|
2 |
margin: 0 auto;
|
3 |
background: var(--window-color);
|
4 |
-
box-shadow: 0 0
|
5 |
font-family: cursive;
|
6 |
font-family: "Gill Sans", sans-serif;
|
7 |
display: inline-block
|
8 |
}
|
|
|
|
|
|
|
9 |
.padded {
|
10 |
width: 100%;
|
11 |
max-width: 800px;
|
|
|
1 |
.faded {
|
2 |
margin: 0 auto;
|
3 |
background: var(--window-color);
|
4 |
+
box-shadow: 0 0 1px 1px var(--window-color);
|
5 |
font-family: cursive;
|
6 |
font-family: "Gill Sans", sans-serif;
|
7 |
display: inline-block
|
8 |
}
|
9 |
+
.centered {
|
10 |
+
text-align: center;
|
11 |
+
}
|
12 |
.padded {
|
13 |
width: 100%;
|
14 |
max-width: 800px;
|