Spaces:
Runtime error
Runtime error
Commit
·
729a063
1
Parent(s):
3a743e7
Update app.py
Browse files
app.py
CHANGED
@@ -76,13 +76,13 @@ st.caption("Multi-Head Attention")
|
|
76 |
mha_flop = 2*bs*h*n*(d/h)
|
77 |
mha_bytes = 2*bs*h*n + 2*bs*h*n*(d/h) + 2*bs*h*(d/h)
|
78 |
c1, c2 = st.columns([2, 3])
|
79 |
-
|
80 |
|
81 |
st.caption("Multi-Query Attention")
|
82 |
mqa_flop = 2*bs*h*n*(d/h)
|
83 |
mqa_bytes = 2*bs*n*(d/h) + 2*bs*n*(d/h) + 2*bs*h*(d/h)
|
84 |
c1, c2 = st.columns([2, 3])
|
85 |
-
|
86 |
|
87 |
st.subheader('Output projection')
|
88 |
out_flop = 2*bs*1*d*d
|
@@ -91,15 +91,17 @@ c1, c2 = st.columns([2, 3])
|
|
91 |
out_time = print_kernel_execution(c1, c2, out_flop, out_bytes)
|
92 |
|
93 |
st.subheader('Element-wise ops')
|
94 |
-
st.write("We also need to take into the softmax layer and
|
95 |
|
96 |
st.caption("Softmax")
|
97 |
softmax_bytes = 2*bs*h*n + 2*bs*h*n
|
98 |
c1, c2 = st.columns([2, 3])
|
99 |
softmax_time = print_kernel_execution(c1, c2, 0, softmax_bytes)
|
100 |
|
101 |
-
st.caption("Layer norm")
|
102 |
-
|
|
|
|
|
103 |
|
104 |
st.header('MLP')
|
105 |
st.subheader('First Linear')
|
@@ -113,3 +115,17 @@ mlp2_flop = 2*bs*1*d*4*d
|
|
113 |
mlp2_bytes = 2*bs*1*d + 2*d*4*d + 2*bs*1*4*d
|
114 |
c1, c2 = st.columns([2, 3])
|
115 |
mlp2_time = print_kernel_execution(c1, c2, mlp2_flop, mlp2_bytes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
mha_flop = 2*bs*h*n*(d/h)
|
77 |
mha_bytes = 2*bs*h*n + 2*bs*h*n*(d/h) + 2*bs*h*(d/h)
|
78 |
c1, c2 = st.columns([2, 3])
|
79 |
+
att2_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
|
80 |
|
81 |
st.caption("Multi-Query Attention")
|
82 |
mqa_flop = 2*bs*h*n*(d/h)
|
83 |
mqa_bytes = 2*bs*n*(d/h) + 2*bs*n*(d/h) + 2*bs*h*(d/h)
|
84 |
c1, c2 = st.columns([2, 3])
|
85 |
+
att2_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
|
86 |
|
87 |
st.subheader('Output projection')
|
88 |
out_flop = 2*bs*1*d*d
|
|
|
91 |
out_time = print_kernel_execution(c1, c2, out_flop, out_bytes)
|
92 |
|
93 |
st.subheader('Element-wise ops')
|
94 |
+
st.write("We also need to take into account the softmax layer, layer norm, and residual connection. We assume that these operations are memory bound. ")
|
95 |
|
96 |
st.caption("Softmax")
|
97 |
softmax_bytes = 2*bs*h*n + 2*bs*h*n
|
98 |
c1, c2 = st.columns([2, 3])
|
99 |
softmax_time = print_kernel_execution(c1, c2, 0, softmax_bytes)
|
100 |
|
101 |
+
st.caption("Layer norm/residual connection")
|
102 |
+
ln_bytes = 2*bs*1*d
|
103 |
+
ln_flop = 0
|
104 |
+
ln_time = print_kernel_execution(c1, c2, 0, softmax_bytes)
|
105 |
|
106 |
st.header('MLP')
|
107 |
st.subheader('First Linear')
|
|
|
115 |
mlp2_bytes = 2*bs*1*d + 2*d*4*d + 2*bs*1*4*d
|
116 |
c1, c2 = st.columns([2, 3])
|
117 |
mlp2_time = print_kernel_execution(c1, c2, mlp2_flop, mlp2_bytes)
|
118 |
+
|
119 |
+
st.subheader('Element-wise ops')
|
120 |
+
st.write("We also need to take into account the GeLU, layer norm, and residual connection. We assume that these operations are memory bound. ")
|
121 |
+
ln_bytes = 2*bs*1*d
|
122 |
+
ln_flop = 0
|
123 |
+
ln_time = print_kernel_execution(c1, c2, 0, softmax_bytes)
|
124 |
+
|
125 |
+
st.header("Adding it all up")
|
126 |
+
|
127 |
+
shared_time = out_time + softmax_time + 2*ln_time + mlp1_time + mlp2_time + 3*ln_time
|
128 |
+
mha_total_time = qkv_mha_time + att1_mha_time + att2_mha_time + shared_time
|
129 |
+
mqa_total_time = qkv_mqa_time + att1_mqa_time + att2_mqa_time + shared_time
|
130 |
+
st.write("MHA exec time (ms): " + str(mha_total_time))
|
131 |
+
st.write("MQA exec time (ms): " + str(mqa_total_time))
|