Spaces:

radames
/

Gradio-llama2.mojo

Runtime error

App Files Files Community

radames commited on Sep 13, 2023

Commit

c7eecb3

1 Parent(s): 0c1c7c9

new

Browse files

Files changed (12) hide show

.gitattributes +1 -0
Dockerfile +70 -0
LICENSE +21 -0
assets/llama2.mojo-demo.gif +3 -0
gradio_app.py +36 -0
llama2.mojo +747 -0
read/__init__.mojo +195 -0
read/libc/__init__.mojo +0 -0
read/libc/dirent/__init__.mojo +56 -0
read/libc/stdio/__init__.mojo +85 -0
read/libc/string/__init__.mojo +5 -0
tokenizer.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,70 @@

+# https://github.com/modularml/mojo/blob/main/examples/docker/Dockerfile.mojosdk
+# ===----------------------------------------------------------------------=== #
+# Copyright (c) 2023, Modular Inc. All rights reserved.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions:
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===----------------------------------------------------------------------=== #
+# Example command line:
+# Use no-cache to force docker to rebuild layers of the image by downloading the SDK from the repos
+# docker build --no-cache \
+#    --build-arg AUTH_KEY=<your-modular-auth-key>
+#    --pull -t modular/mojo-v0.2-`date '+%Y%d%m-%H%M'` \
+#    --file Dockerfile.mojosdk .
+FROM ubuntu:20.04
+ARG DEFAUL_TZ=America/Los_Angeles
+ENV DEFAULT_TZ=$DEFAULT_TZ
+ARG MODULAR_HOME=/home/user/.modular
+ENV MODULAR_HOME=$MODULAR_HOME
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive $DEFAULT_TZ apt-get install -y \
+    tzdata \
+    vim \
+    sudo \
+    curl \
+    python3 \
+    pip \
+    wget \
+    && python3 -m pip install \
+    jupyterlab \
+    ipykernel \
+    matplotlib \
+    ipywidgets \
+    gradio
+RUN curl -fsSL https://repo.anaconda.com/miniconda/Miniconda3-py38_23.5.2-0-Linux-x86_64.sh > /tmp/miniconda.sh \
+    && chmod +x /tmp/miniconda.sh \
+    && /tmp/miniconda.sh -b -p /opt/conda
+ARG AUTH_KEY=DEFAULT_KEY
+ENV AUTH_KEY=$AUTH_KEY
+RUN curl https://get.modular.com | MODULAR_AUTH=$AUTH_KEY sh - \
+    && modular install mojo
+RUN useradd -m -u 1000 user
+RUN chown -R user $MODULAR_HOME
+ENV PATH="$PATH:/opt/conda/bin:$MODULAR_HOME/pkg/packages.modular.com_mojo/bin"
+RUN conda init
+RUN pip install gradio
+USER user
+WORKDIR $HOME/app
+COPY --chown=user . $HOME/app
+RUN wget -c  https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin
+# CMD ["mojo", "llama2.mojo"]
+CMD ["python3", "gradio_app.py"]

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 Aydyn Tairov
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

assets/llama2.mojo-demo.gif ADDED Viewed

Git LFS Details

SHA256: 6801c1dca3aacf420459f4c7e20820a7a90dd75cc46f7347b18c87aa913fa7d6
Pointer size: 132 Bytes
Size of remote file: 2.49 MB

gradio_app.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import gradio as gr
+import subprocess
+import sys
+import os
+async def generate(prompt):
+    # os.environ["PROMPT"] = prompt
+    # stream stout
+    process = subprocess.Popen(
+        ["mojo", "llama2.mojo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
+    text = ""
+    for char in iter(lambda: process.stdout.read(1), b""):
+        char_decoded = char.decode()
+        sys.stdout.write(char_decoded)
+        text += char_decoded
+        yield text
+output_text = gr.Textbox(label="Generated Text")
+demo = gr.Interface(
+    fn=generate,
+    inputs=None,
+    outputs=output_text,
+    description="""
+# llama2.🔥
+## [Mojo](https://docs.modular.com/mojo/) implementation of [llama2.c](https://github.com/karpathy/llama2.c) by [@tairov](https://github.com/tairov)
+Source: https://github.com/tairov/llama2.mojo
+    """,
+    allow_flagging="never",
+)
+demo.queue()
+demo.launch(server_name="0.0.0.0")

llama2.mojo ADDED Viewed

	@@ -0,0 +1,747 @@

+from math import round
+import math
+from memory import memset_zero, memcpy
+from memory.unsafe import DTypePointer
+from random import rand
+from sys.info import simdwidthof
+from builtin import string
+import time
+import random
+import os
+from runtime.llcl import num_cores
+from read import BufReader, File
+from memory.buffer import Buffer
+from python import Python
+# The SIMD vector width.
+from algorithm import vectorize, parallelize
+from algorithm import sum
+alias nelts = (2 * simdwidthof[DType.float32]())
+alias PointerString = Pointer[UInt8]
+alias BufferPtrType = DTypePointer[DType.uint8]
+alias BufferPtrFloat32 = DTypePointer[DType.float32]
+alias PointerStrings = Pointer[PointerString]
+struct Matrix3:
+    var data: BufferPtrFloat32
+    var rows: Int
+    var cols: Int
+    var layers: Int
+    var allocated: Int
+    fn __init__(inout self, layers: Int, rows: Int, cols: Int):
+        self.data = BufferPtrFloat32.alloc(0)
+        self.rows = rows
+        self.cols = cols
+        self.layers = layers
+        self.allocated = 0
+    @always_inline
+    fn alloc(inout self, fill: Int = 0):
+        self.data = BufferPtrFloat32.alloc(self.size())
+        self.allocated = 1
+        if fill == 1:
+            self.zero()
+    @always_inline
+    fn alloc_zero(inout self):
+        self.alloc(1)
+    @always_inline
+    fn set_buf_ptr(inout self, ptr: BufferPtrFloat32):
+        self.data = ptr
+    fn __del__(owned self):
+        if self.allocated == 1:
+            self.data.free()
+    @always_inline
+    fn zero(inout self):
+        memset_zero(self.data, self.layers * self.rows * self.cols)
+    @always_inline
+    fn size(inout self) -> Int:
+        return self.layers * self.cols * self.rows
+    @always_inline
+    fn __getitem__(self, z: Int, y: Int, x: Int) -> Float32:
+        return self.load[1](z, y, x)
+    @always_inline
+    fn load[nelts: Int](self, z: Int, y: Int, x: Int) -> SIMD[DType.float32, nelts]:
+        return self.data.simd_load[nelts](z * self.layers + y * self.cols + x)
+    @always_inline
+    fn __setitem__(self, z: Int, y: Int, x: Int, val: Float32):
+        return self.store[1](z, y, x, val)
+    @always_inline
+    fn store[nelts: Int](self, z: Int, y: Int, x: Int, val: SIMD[DType.float32, nelts]):
+        self.data.simd_store[nelts](z * self.layers + y * self.cols + x, val)
+struct Matrix:
+    var data: BufferPtrFloat32
+    var rows: Int
+    var cols: Int
+    var allocated: Int
+    fn __init__(inout self, rows: Int, cols: Int):
+        self.data = BufferPtrFloat32.alloc(0)
+        self.rows = rows
+        self.cols = cols
+        self.allocated = 0
+    fn __init__(inout self, cols: Int):
+        self.data = BufferPtrFloat32.alloc(0)
+        self.rows = 1
+        self.cols = cols
+        self.allocated = 0
+    fn __del__(owned self):
+        if self.allocated == 1:
+            self.data.free()
+    fn alloc(inout self, fill: Int = 0):
+        self.data = BufferPtrFloat32.alloc(self.size())
+        self.allocated = 1
+        if fill == 1:
+            self.zero()
+    fn alloc_zero(inout self):
+        self.alloc(1)
+    fn zero(inout self):
+        memset_zero(self.data, self.rows * self.cols)
+    fn set_buf_ptr(inout self, ptr: BufferPtrFloat32):
+        self.data = ptr
+    # set buf ptr with redefined rows, colss
+    fn set_buf_ptr(inout self, ptr: BufferPtrFloat32, rows: Int, cols: Int):
+        self.data = ptr
+        self.rows = rows
+        self.cols = cols
+    fn size(inout self) -> Int:
+        return self.cols * self.rows
+    @always_inline
+    fn __getitem__(self, y: Int, x: Int) -> Float32:
+        return self.load[1](y, x)
+    @always_inline
+    fn __getitem__(self, x: Int) -> Float32:
+        return self.load[1](0, x)
+    @always_inline
+    fn load[nelts: Int](self, y: Int, x: Int) -> SIMD[DType.float32, nelts]:
+        return self.data.simd_load[nelts](y * self.cols + x)
+    @always_inline
+    fn __setitem__(self, y: Int, x: Int, val: Float32):
+        return self.store[1](y, x, val)
+    @always_inline
+    fn __setitem__(self, x: Int, val: Float32):
+        return self.store[1](0, x, val)
+    @always_inline
+    fn store[nelts: Int](self, y: Int, x: Int, val: SIMD[DType.float32, nelts]):
+        self.data.simd_store[nelts](y * self.cols + x, val)
+    @always_inline
+    fn load[nelts: Int](self, x: Int) -> SIMD[DType.float32, nelts]:
+        return self.data.simd_load[nelts](x)
+    @always_inline
+    fn store[nelts: Int](self, x: Int, val: SIMD[DType.float32, nelts]):
+        self.data.simd_store[nelts](x, val)
+fn read_val_int(inout buf: FileBuf) -> Int:
+    # DTypePointer[DType.ui8](buf.data).bitcast[DType.ui8]()
+    let data = buf.data.offset(buf.offset).bitcast[DType.uint32]()
+    let result = data.simd_load[1](0)
+    buf.offset += 4
+    return result.to_int()
+fn read_val_float32(inout buf: FileBuf) -> Float32:
+    # DTypePointer[DType.ui8](buf.data).bitcast[DType.ui8]()
+    let val = buf.data.offset(buf.offset).bitcast[DType.float32]().simd_load[1](0)
+    buf.offset += 4
+    return val
+fn read_val_str(inout buf: FileBuf, slen: Int) -> PointerString:
+    let str = PointerString.alloc(slen + 1)
+    for i in range(slen):
+        str.store(i, buf.data.simd_load[1](buf.offset))
+        buf.offset += 1
+    str.store(slen, 0)
+    return str
+struct FileBuf:
+    var data: BufferPtrType
+    var offset: Int
+    var size: Int
+    fn __init__(inout self):
+        self.data = BufferPtrType()
+        self.offset = 0
+        self.size = 0
+    fn move_offset(inout self, size: Int):
+        self.offset += size
+    fn bitcast_offset_float32(inout self, size: Int) -> BufferPtrFloat32:
+        let ret = self.data.offset(self.offset).bitcast[DType.float32]()
+        self.offset += size * sizeof[DType.float32]()
+        return ret
+struct Tokenizer:
+    var vocab: PointerStrings
+    var vocab_scores: BufferPtrFloat32
+    var max_token_length: Int
+    var vocab_size: Int
+    fn __init__(inout self, vocab_size: Int):
+        self.vocab_size = vocab_size
+        self.vocab = PointerStrings.alloc(vocab_size)
+        self.vocab_scores = BufferPtrFloat32.alloc(vocab_size)
+        self.max_token_length = 0
+struct Config:
+    var dim: Int
+    var hidden_dim: Int
+    var n_layers: Int
+    var n_heads: Int
+    var n_kv_heads: Int
+    var vocab_size: Int
+    var seq_len: Int
+    fn __init__(inout self):
+        self.dim = 0
+        self.hidden_dim = 0
+        self.n_layers = 0
+        self.n_heads = 0
+        self.n_kv_heads = 0
+        self.vocab_size = 0
+        self.seq_len = 0
+struct RunState:
+    var x: Matrix  # activation at current time stamp (dim,)
+    var xb: Matrix  # same, but inside a residual branch (dim,)
+    var xb2: Matrix  # an additional buffer just for convenience (dim,)
+    var hb: Matrix  # buffer for hidden dimension in the ffn (hidden_dim,)
+    var hb2: Matrix  # buffer for hidden dimension in the ffn (hidden_dim,)
+    var q: Matrix  # query (dim,)
+    var k: Matrix  # key (dim,)
+    var v: Matrix  # value (dim,)
+    var att: Matrix  # buffer for scores/attention values (n_heads, seq_len)
+    var logits: Matrix  # output logits
+    var key_cache: Matrix3  # (layer, seq_len, dim)
+    var value_cache: Matrix3  # (layer, seq_len, dim)
+    fn __init__(inout self, config: Config):
+        self.x = Matrix(config.dim)
+        self.x.alloc_zero()
+        self.xb = Matrix(config.dim)
+        self.xb.alloc_zero()
+        self.xb2 = Matrix(config.dim)
+        self.xb2.alloc_zero()
+        self.hb = Matrix(config.hidden_dim)
+        self.hb.alloc_zero()
+        self.hb2 = Matrix(config.hidden_dim)
+        self.hb2.alloc_zero()
+        self.q = Matrix(config.dim)
+        self.q.alloc_zero()
+        self.k = Matrix(config.dim)
+        self.k.alloc_zero()
+        self.v = Matrix(config.dim)
+        self.v.alloc_zero()
+        self.att = Matrix(config.n_heads, config.seq_len)
+        self.att.alloc_zero()
+        self.logits = Matrix(config.vocab_size)
+        self.logits.alloc_zero()
+        self.key_cache = Matrix3(config.n_layers, config.seq_len, config.dim)
+        self.key_cache.alloc_zero()
+        self.value_cache = Matrix3(config.n_layers, config.seq_len, config.dim)
+        self.value_cache.alloc_zero()
+struct TransformerWeights:
+    var token_embedding_table: Matrix
+    var freq_cis_real: Matrix
+    var freq_cis_imag: Matrix
+    var rms_att_weight: Matrix
+    var wq: Matrix3
+    var wk: Matrix3
+    var wv: Matrix3
+    var wo: Matrix3
+    var rms_ffn_weight: Matrix
+    var w1: Matrix3
+    var w3: Matrix3
+    var w2: Matrix3
+    var rms_final_weight: Matrix
+    var wcls: Matrix
+    fn __init__(inout self, config: Config, shared_weights: Int, inout buf: FileBuf):
+        self.token_embedding_table = Matrix(config.vocab_size, config.dim)
+        # set buf ptr to buf data from file
+        self.token_embedding_table.set_buf_ptr(
+            buf.bitcast_offset_float32(self.token_embedding_table.size())
+        )
+        self.rms_att_weight = Matrix(config.n_layers, config.dim)
+        self.rms_att_weight.set_buf_ptr(
+            buf.bitcast_offset_float32(self.rms_att_weight.size())
+        )
+        self.wq = Matrix3(config.n_layers, config.dim, config.dim)
+        self.wq.set_buf_ptr(buf.bitcast_offset_float32(self.wq.size()))
+        self.wk = Matrix3(config.n_layers, config.dim, config.dim)
+        self.wk.set_buf_ptr(buf.bitcast_offset_float32(self.wk.size()))
+        self.wv = Matrix3(config.n_layers, config.dim, config.dim)
+        self.wv.set_buf_ptr(buf.bitcast_offset_float32(self.wv.size()))
+        self.wo = Matrix3(config.n_layers, config.dim, config.dim)
+        self.wo.set_buf_ptr(buf.bitcast_offset_float32(self.wo.size()))
+        self.rms_ffn_weight = Matrix(config.n_layers, config.dim)
+        self.rms_ffn_weight.set_buf_ptr(
+            buf.bitcast_offset_float32(self.rms_ffn_weight.size())
+        )
+        self.w1 = Matrix3(config.n_layers, config.dim, config.hidden_dim)
+        self.w1.set_buf_ptr(buf.bitcast_offset_float32(self.w1.size()))
+        self.w2 = Matrix3(config.n_layers, config.dim, config.hidden_dim)
+        self.w2.set_buf_ptr(buf.bitcast_offset_float32(self.w2.size()))
+        self.w3 = Matrix3(config.n_layers, config.dim, config.hidden_dim)
+        self.w3.set_buf_ptr(buf.bitcast_offset_float32(self.w3.size()))
+        self.rms_final_weight = Matrix(config.dim)
+        self.rms_final_weight.set_buf_ptr(
+            buf.bitcast_offset_float32(self.rms_final_weight.size())
+        )
+        self.freq_cis_real = Matrix(config.seq_len, (config.dim // config.n_heads) // 2)
+        self.freq_cis_real.set_buf_ptr(
+            buf.bitcast_offset_float32(self.freq_cis_real.size())
+        )
+        self.freq_cis_imag = Matrix(config.seq_len, (config.dim // config.n_heads) // 2)
+        self.freq_cis_imag.set_buf_ptr(
+            buf.bitcast_offset_float32(self.freq_cis_imag.size())
+        )
+        self.wcls = Matrix(
+            config.vocab_size, config.dim
+        )  # if shared_weights else rest_floats
+        self.wcls.set_buf_ptr(self.token_embedding_table.data)
+fn read_file(file_name: String, inout buf: FileBuf) raises:
+    let _os = Python.import_module("os")
+    let ff_size = _os.path.getsize(file_name)
+    let cp_size = string.atol(ff_size.to_string())
+    let cp_buf: BufferPtrType = BufferPtrType.alloc(cp_size)
+    # set window buffer to read binary data from file
+    let f = File(file_name)
+    var reader = BufReader[4096](f ^)
+    var bytes_read = 1
+    var offset = 0
+    while bytes_read > 0:
+        let buf = Buffer[4096, DType.uint8](cp_buf.offset(offset))
+        bytes_read = reader.read(buf)
+        offset += bytes_read
+    reader.do_nothing()  # keeps lifetimes working
+    buf.data = cp_buf
+    buf.size = cp_size
+    buf.offset = 0
+    return None
+fn config_init(inout config: Config, inout buf: FileBuf) raises:
+    config.dim = read_val_int(buf)
+    config.hidden_dim = read_val_int(buf)
+    config.n_layers = read_val_int(buf)
+    config.n_heads = read_val_int(buf)
+    config.n_kv_heads = read_val_int(buf)
+    config.vocab_size = read_val_int(buf)
+    config.seq_len = read_val_int(buf)
+    return None
+fn tokenizer_init(inout tok: Tokenizer, inout buf: FileBuf) -> None:
+    tok.max_token_length = read_val_int(buf)
+    tok.vocab_scores = BufferPtrFloat32.alloc(tok.vocab_size)
+    tok.vocab = PointerStrings.alloc(tok.vocab_size)
+    # read vocab_scores & vocab values (tokens)
+    for i in range(0, tok.vocab_size):
+        tok.vocab_scores.simd_store[1](i, read_val_float32(buf))
+        let slen = read_val_int(buf)
+        tok.vocab.store(i, read_val_str(buf, slen))
+    tok.vocab_scores = buf.data.offset(buf.offset).bitcast[DType.float32]()
+    buf.offset += tok.vocab_size * 4
+    return None
+fn accum(inout a: BufferPtrFloat32, b: BufferPtrFloat32, size: Int) -> None:
+    for i in range(size):
+        let val = a.offset(i).simd_load[1](0) + b.offset(i).simd_load[1](0)
+        a.offset(i).simd_store[1](0, val)
+fn rmsnorm(
+    inout o: BufferPtrFloat32, x: BufferPtrFloat32, weight: BufferPtrFloat32, size: Int
+) -> None:
+    # Calculate sum of squares
+    var ss: Float32 = 0.0
+    for i in range(size):
+        let xx = x.offset(i).simd_load[1](0) ** 2
+        ss += xx
+    ss = ss / size + 1e-5
+    ss = 1.0 / math.sqrt(ss)
+    # Normalize and scale
+    for j in range(size):
+        let val = weight.offset(j).simd_load[1](0) * (ss * x.offset(j).simd_load[1](0))
+        o.offset(j).simd_store[1](0, val)
+fn softmax(inout x: BufferPtrFloat32, size: Int) -> None:
+    # Find max value (for numerical stability)
+    var max_val: Float32 = x.offset(0).simd_load[1](0)
+    for i in range(size):
+        let xi = x.offset(i).simd_load[1](0)
+        if xi > max_val:
+            max_val = xi
+    # Exp and sum
+    var ssum: Float32 = 0.0
+    for i in range(size):
+        let xi = x.offset(i).simd_load[1](0)
+        x.offset(i).simd_store[1](0, math.exp(xi - max_val))
+        ssum += x.offset(i).simd_load[1](0)
+    # Normalize
+    for i in range(size):
+        let xi = x.offset(i).simd_load[1](0)
+        x.offset(i).simd_store[1](0, xi / ssum)
+fn matmul_naive(C: Matrix, x: Matrix, w: Matrix) -> None:
+    # W(d,n) @ X(n,) -> C (d,)
+    # By far the most amount of time is spent inside this little function
+    for i in range(w.rows):
+        C[i] = 0.0
+        for j in range(w.cols):
+            C[i] += x[j] * w[i, j]
+fn matmul_vectorized(C: Matrix, A: Matrix, B: Matrix):
+    for i in range(0, B.rows):
+        var tmp = SIMD[DType.float32, nelts](0)
+        @parameter
+        fn dot[_nelts: Int](j: Int):
+            if _nelts < nelts: # take care of tail array elements with length <  nelts
+                tmp[0] += (A.load[_nelts](j) * B.load[_nelts](i, j)).reduce_add()
+            else:
+                tmp += A.load[nelts](j) * B.load[nelts](i, j)
+        vectorize[nelts, dot](B.cols)
+        C[i] = tmp.reduce_add()
+fn matmul_parallelized(C: Matrix, A: Matrix, B: Matrix):
+    @parameter
+    fn calc_row(i: Int):
+        var T = BufferPtrFloat32.alloc(nelts)
+        var Tbuf = Buffer[nelts, DType.float32](T)
+        memset_zero(T, nelts)
+        @parameter
+        fn dot[nelts: Int](j: Int):
+            T.simd_store[nelts](
+                0, T.simd_load[nelts](0) + A.load[nelts](j) * B.load[nelts](i, j)
+            )
+        vectorize[nelts, dot](B.cols)
+        C[i] = sum[nelts, DType.float32](Tbuf)
+    parallelize[calc_row](B.rows)
+fn matmul(inout C: Matrix, A: Matrix, B: Matrix) -> None:
+    # B (d,n) @ A (n,) -> C (d,)
+    matmul_vectorized(C, A, B)
+    # matmul_parallelized(C, A, B)
+fn transformer(
+    token: Int,
+    pos: Int,
+    config: Config,
+    inout state: RunState,
+    weights: TransformerWeights,
+) -> None:
+    # A few convenience variables
+    var x = state.x.data
+    let dim = config.dim
+    let hidden_dim = config.hidden_dim
+    let head_size = dim // config.n_heads
+    # tmp matrix for matmul operations
+    var tmpw = Matrix(0, 0)
+    # Copy the token embedding into x
+    let content_row = weights.token_embedding_table.data.offset(token * dim)
+    memcpy[DType.float32](x, content_row, config.dim)
+    # Pluck out the "pos" row of freq_cis_real and freq_cis_imag
+    let freq_cis_real_row = weights.freq_cis_real.data.offset(pos * head_size // 2)
+    let freq_cis_imag_row = weights.freq_cis_imag.data.offset(pos * head_size // 2)
+    # Forward all the layers
+    for l in range(config.n_layers):
+        # Attention rmsnorm
+        rmsnorm(state.xb.data, x, weights.rms_att_weight.data.offset(l * dim), dim)
+        # QKV matmuls for this position
+        tmpw.set_buf_ptr(weights.wq.data.offset(l * dim * dim), dim, dim)
+        matmul(state.q, state.xb, tmpw)
+        tmpw.set_buf_ptr(weights.wk.data.offset(l * dim * dim), dim, dim)
+        matmul(state.k, state.xb, tmpw)
+        tmpw.set_buf_ptr(weights.wv.data.offset(l * dim * dim), dim, dim)
+        matmul(state.v, state.xb, tmpw)
+        # Apply RoPE rotation to the q and k vectors for each head
+        for h in range(config.n_heads):
+            # Get the q and k vectors for this head
+            let q = state.q.data.offset(h * head_size)
+            let k = state.k.data.offset(h * head_size)
+            # Rotate q and k by the freq_cis_real and freq_cis_imag
+            for i in range(0, head_size, 2):
+                let q0 = q.offset(i).simd_load[1](0)
+                let q1 = q.offset(i + 1).simd_load[1](0)
+                let k0 = k.offset(i).simd_load[1](0)
+                let k1 = k.offset(i + 1).simd_load[1](0)
+                let fcr = freq_cis_real_row.offset(i // 2).simd_load[1](0)
+                let fci = freq_cis_imag_row.offset(i // 2).simd_load[1](0)
+                q.offset(i).simd_store[1](0, q0 * fcr - q1 * fci)
+                q.offset(i + 1).simd_store[1](0, q0 * fci + q1 * fcr)
+                k.offset(i).simd_store[1](0, k0 * fcr - k1 * fci)
+                k.offset(i + 1).simd_store[1](0, k0 * fci + k1 * fcr)
+        # Save key,value at this time step (pos) to our kv cache
+        let loff = l * config.seq_len * dim  # kv cache layer offset for convenience
+        let key_cache_row = state.key_cache.data.offset(loff + pos * dim)
+        let value_cache_row = state.value_cache.data.offset(loff + pos * dim)
+        memcpy[DType.float32](key_cache_row, state.k.data, config.dim)
+        memcpy[DType.float32](value_cache_row, state.v.data, config.dim)
+        # Multihead attention. Iterate over all heads
+        for h in range(config.n_heads):
+            # Get the query vector for this head
+            let q = state.q.data.offset(h * head_size)
+            # Attention scores for this head
+            var att = state.att.data.offset(h * config.seq_len)
+            # Iterate over all timesteps, including the current one
+            for t in range(pos + 1):
+                # Get the key vector for this head and at this timestep
+                let k = state.key_cache.data.offset(loff + t * dim + h * head_size)
+                # Calculate the attention score as the dot product of q and k
+                var score: Float32 = 0.0
+                for i in range(head_size):
+                    score += q.offset(i).simd_load[1](0) * k.offset(i).simd_load[1](0)
+                score /= math.sqrt[DType.float32, 1](head_size)
+                # Save the score to the attention buffer
+                att.offset(t).simd_store[1](0, score)
+            # Softmax the scores to get attention weights, from 0..pos inclusively
+            softmax(att, pos + 1)
+            # Weighted sum of the values, store back into xb
+            let xb = state.xb.data.offset(h * head_size)
+            memset_zero(xb, head_size)
+            for t in range(pos + 1):
+                # Get the value vector for this head and at this timestep
+                let v = state.value_cache.data.offset(loff + t * dim + h * head_size)
+                # Get the attention weight for this timestep
+                let a = att.offset(t).simd_load[1](0)
+                # Accumulate the weighted value into xb
+                for i in range(head_size):
+                    let xbi = xb.offset(i).simd_load[1](0) + a * v.offset(i).simd_load[
+                        1
+                    ](0)
+                    xb.offset(i).simd_store[1](0, xbi)
+        # Final matrix multiplication to get the output of the attention
+        tmpw.set_buf_ptr(weights.wo.data.offset(l * dim * dim), dim, dim)
+        matmul(state.xb2, state.xb, tmpw)
+        # Residual connection back into x
+        accum(x, state.xb2.data, dim)
+        # FFN rmsnorm
+        rmsnorm(state.xb.data, x, weights.rms_ffn_weight.data.offset(l * dim), dim)
+        # Calculate self.w1(x) and self.w3(x) for FFN
+        tmpw.set_buf_ptr(weights.w1.data.offset(l * dim * hidden_dim), hidden_dim, dim)
+        matmul(state.hb, state.xb, tmpw)
+        tmpw.set_buf_ptr(weights.w3.data.offset(l * dim * hidden_dim), hidden_dim, dim)
+        matmul(state.hb2, state.xb, tmpw)
+        # Apply SiLU activation function (silu(x) = x * sigmoid(x))
+        for i in range(hidden_dim):
+            let hbi = state.hb[i]
+            state.hb[i] = hbi * (1.0 / (1.0 + math.exp(-hbi)))
+        # Elementwise multiply with w3(x)
+        for i in range(hidden_dim):
+            state.hb[i] = state.hb[i] * state.hb2[i]
+        # Final matrix multiplication to get the output of the FFN
+        tmpw.set_buf_ptr(weights.w2.data.offset(l * dim * hidden_dim), dim, hidden_dim)
+        matmul(state.xb, state.hb, tmpw)
+        # Residual connection
+        accum(x, state.xb.data, dim)
+    # Final rmsnorm
+    rmsnorm(x, x, weights.rms_final_weight.data, dim)
+    # Classifier into logits
+    tmpw.set_buf_ptr(weights.wcls.data, config.vocab_size, dim)
+    matmul(state.logits, state.x, tmpw)
+fn argmax(v: Matrix) -> Int:
+    # return argmax of v
+    var max_i: Int = 0
+    var max_p: Float32 = v[0]
+    for i in range(v.cols):
+        if v[i] > max_p:
+            max_i = i
+            max_p = v[i]
+    return max_i
+fn sample(probabilities: Matrix) -> Int:
+    let n = probabilities.cols
+    # Sample index from probabilities, they must sum to 1
+    # get random value within (min, max) float32 range
+    let r = DTypePointer[DType.float32].alloc(1)
+    rand[DType.float32](r, 1)
+    var cdf: Float32 = 0.0
+    for i in range(n):
+        cdf += probabilities[i]
+        if r.simd_load[1](0) < cdf:
+            return i
+    return n - 1  # In case of rounding errors
+fn print_str(s: PointerString):
+    # print all chars till null character
+    var p: Int = 0
+    while s[p].to_int() != 0:
+        print_no_newline(chr(s[p].to_int()))
+        p += 1
+fn time_in_ms() -> Int:
+    # Returns time in milliseconds for benchmarking the model speed
+    return time.now() // 1_000_000
+fn main() raises:
+    print("num hardware threads: ", num_cores(), " SIMD vector width: ", nelts)
+    let checkpoint = "stories15M.bin"
+    # let checkpoint = "stories110M.bin"
+    let tokenizer = "tokenizer.bin"
+    let temperature = 0.0
+    var steps = 256
+    let prompt = ""
+    let rng_seed: Int = time.now()
+    random.seed(rng_seed)
+    var fbuf: FileBuf = FileBuf()
+    var tbuf: FileBuf = FileBuf()
+    var config: Config = Config()
+    read_file(checkpoint, fbuf)
+    print("checkpoint size: ", fbuf.size)
+    config_init(config, fbuf)
+    # negative vocab size is hacky way of signaling unshared weights. bit yikes.
+    let shared_weights = 1 if config.vocab_size > 0 else 0
+    config.vocab_size = (
+        -config.vocab_size if config.vocab_size < 0 else config.vocab_size
+    )
+    let weights: TransformerWeights = TransformerWeights(config, shared_weights, fbuf)
+    var tok: Tokenizer = Tokenizer(config.vocab_size)
+    if steps <= 0 or steps > config.seq_len:
+        steps = config.seq_len
+    # Read in the tokenizer.bin file
+    read_file(tokenizer, tbuf)
+    tokenizer_init(tok, tbuf)
+    # Create and initialize the application RunState
+    var state = RunState(config)
+    # Start the main loop
+    var start = 0  # Used to time our code, only initialized after the first iteration
+    var next_token = 0  # Will store the next token in the sequence
+    # Initialize with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
+    var token = 1
+    var pos = 0  # Position in the sequence
+    # Explicitly print the initial BOS token for stylistic symmetry reasons
+    print("<s>")
+    while pos < steps:
+        # Forward the transformer to get logits for the next token
+        transformer(token, pos, config, state, weights)
+        # Sample the next token
+        if temperature == 0.0:
+            # Greedy argmax sampling: take the token with the highest probability
+            next_token = argmax(state.logits)
+        else:
+            # Apply the temperature to the logits
+            for q in range(config.vocab_size):
+                state.logits[q] = state.logits[q] / temperature
+            # Apply softmax to the logits to get the probabilities for the next token
+            softmax(state.logits.data, config.vocab_size)
+            # Sample from this distribution to get the next token
+            next_token = sample(state.logits)
+        var token_str: PointerString = tok.vocab[next_token]
+        if token == 1 and token_str[0] == ord(" "):
+            token_str = token_str.offset(1)
+        print_str(token_str)
+        # flush?
+        # Advance forward
+        token = next_token
+        pos += 1
+        if start == 0:
+            start = time_in_ms()
+    let end = time_in_ms()
+    print("\nachieved tok/s: ", (steps - 1) / (end - start) * 1000)

read/__init__.mojo ADDED Viewed

	@@ -0,0 +1,195 @@

+"""
+This is a port of Zig's buffered reader.
+See: https://github.com/ziglang/zig/blob/master/lib/std/io/buffered_reader.zig
+The MIT License (Expat)
+Copyright (c) Lukas Hermann
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+from utils.list import Dim
+from math import min
+from math.limit import max_finite
+from memory import memcpy
+from memory.buffer import Buffer
+from memory.unsafe import Pointer, DTypePointer
+from sys.info import sizeof
+from utils.index import Index
+from utils.vector import DynamicVector
+import testing
+from .libc.stdio import fopen, fread, fclose, FILE
+from .libc.dirent import readdir, opendir, closedir, DIR, dirent
+from .libc.string import strnlen
+alias BUF_SIZE = 4096
+# Types aliases
+alias c_char = UInt8
+fn to_char_ptr(s: String) -> Pointer[c_char]:
+    """Only ASCII-based strings."""
+    let ptr = Pointer[c_char]().alloc(len(s) + 1)
+    for i in range(len(s)):
+        ptr.store(i, ord(s[i]))
+    ptr.store(len(s), ord("\0"))
+    return ptr
+struct File:
+    var handle: Pointer[FILE]
+    var fname: Pointer[c_char]
+    var mode: Pointer[c_char]
+    fn __init__(inout self, filename: String):
+        let fname = to_char_ptr(filename)
+        let mode = to_char_ptr("r")
+        let handle = fopen(fname, mode)
+        self.fname = fname
+        self.mode = mode
+        self.handle = handle
+    fn __bool__(self) -> Bool:
+        return self.handle.__bool__()
+    fn __del__(owned self) raises:
+        if self.handle:
+            pass
+            # let c = fclose(self.handle)
+            # if c != 0:
+            #     raise Error("Failed to close file")
+        if self.fname:
+            self.fname.free()
+        if self.mode:
+            self.mode.free()
+    fn __moveinit__(inout self, owned other: Self):
+        self.fname = other.fname
+        self.mode = other.mode
+        self.handle = other.handle
+        other.handle = Pointer[FILE]()
+        other.fname = Pointer[c_char]()
+        other.mode = Pointer[c_char]()
+    fn do_nothing(self):
+        pass
+    fn read[D: Dim](self, buffer: Buffer[D, DType.uint8]) raises -> Int:
+        return fread(
+            buffer.data.as_scalar_pointer(), sizeof[UInt8](), BUF_SIZE, self.handle
+        ).to_int()
+struct DirEntry:
+    var _pointer: Pointer[dirent]
+    var name: String
+    fn __init__(inout self, pointer: Pointer[dirent]):
+        self.name = String()
+        if pointer:
+            print("hit")
+            let name_ptr = pointer.bitcast[UInt8]().offset(
+                sizeof[UInt64]() * 2 + sizeof[UInt16]() + sizeof[UInt8]()
+            )
+            let name_len = strnlen(name_ptr)
+            for i in range(name_len):
+                self.name += chr(name_ptr.load(i).to_int())
+        self._pointer = pointer
+@value
+@register_passable("trivial")
+struct DirIter:
+    var handle: Pointer[DIR]
+    var data: Pointer[dirent]
+    fn __iter__(inout self) -> Self:
+        self.data = readdir(self.handle)
+    fn __next__(self) raises -> Pointer[dirent]:
+        return self.data
+    fn __len__(self) -> Int:
+        if self.handle and self.data:
+            return 1
+        return 0
+struct Dir:
+    var handle: Pointer[DIR]
+    var path: Pointer[c_char]
+    fn __init__(inout self, path: String):
+        self.path = to_char_ptr(path)
+        self.handle = opendir(self.path)
+    fn __bool__(self) -> Bool:
+        return self.handle.__bool__()
+    fn __iter__(self) -> DirIter:
+        return DirIter(self.handle, Pointer[dirent]())
+    fn __del__(owned self) raises:
+        let c = closedir(self.handle)
+        if c != 0:
+            raise Error("failed to close dir")
+        self.path.free()
+    fn do_nothing(self):
+        pass
+struct BufReader[BUF_SIZE: Int]:
+    var unbuffered_reader: File
+    var data: DTypePointer[DType.uint8]
+    var end: Int
+    var start: Int
+    fn __init__(inout self, owned reader: File):
+        self.unbuffered_reader = reader ^
+        self.data = DTypePointer[DType.uint8]().alloc(BUF_SIZE)
+        self.end = 0
+        self.start = 0
+    fn read[D: Dim](inout self, dest: Buffer[D, DType.uint8]) raises -> Int:
+        var dest_index = 0
+        let buf = Buffer[BUF_SIZE, DType.uint8](self.data)
+        while dest_index < len(dest):
+            let written = min(len(dest) - dest_index, self.end - self.start)
+            memcpy(dest.data.offset(dest_index), self.data.offset(self.start), written)
+            if written == 0:
+                # buf empty, fill it
+                let n = self.unbuffered_reader.read(buf)
+                if n == 0:
+                    # reading from the unbuffered stream returned nothing
+                    # so we have nothing left to read.
+                    return dest_index
+                self.start = 0
+                self.end = n
+            self.start += written
+            dest_index += written
+        return len(dest)
+    fn do_nothing(self):
+        pass

read/libc/__init__.mojo ADDED Viewed

File without changes

read/libc/dirent/__init__.mojo ADDED Viewed

	@@ -0,0 +1,56 @@

+from memory.unsafe import Pointer
+@value
+@register_passable("trivial")
+struct DIR:
+    pass
+@value
+@register_passable("trivial")
+struct dirent:
+    var d_ino: UInt64
+    var d_off: UInt64
+    var d_reclen: UInt16
+    var d_type: UInt8
+    var d_name: Pointer[UInt8]
+# ===--------------------------------------------------------------------===#
+# closedir
+# ===-----------------------------------------------------------------------===#
+fn closedir(arg: Pointer[DIR]) -> Int32:
+    return external_call["closedir", Int32, Pointer[DIR]](arg)
+# ===--------------------------------------------------------------------===#
+# opendir
+# ===-----------------------------------------------------------------------===#
+fn opendir(arg: Pointer[UInt8]) -> Pointer[DIR]:
+    return external_call["opendir", Pointer[DIR], Pointer[UInt8]](arg)
+# ===--------------------------------------------------------------------===#
+# readdir
+# ===-----------------------------------------------------------------------===#
+fn readdir(arg: Pointer[DIR]) -> Pointer[dirent]:
+    return external_call["readdir", Pointer[dirent], Pointer[DIR]](arg)
+# ===--------------------------------------------------------------------===#
+# fdopendir
+# ===-----------------------------------------------------------------------===#
+fn fdopendir(arg: Int32) -> DIR:
+    return external_call["fdopendir", DIR](arg)
+#

read/libc/stdio/__init__.mojo ADDED Viewed

	@@ -0,0 +1,85 @@

+from memory.unsafe import Pointer
+@value
+@register_passable("trivial")
+struct FILE:
+    pass
+# ===--------------------------------------------------------------------===#
+# clearerr
+# ===-----------------------------------------------------------------------===#
+fn clearerr(arg: Pointer[FILE]) -> UInt8:
+    return external_call["clearerr", UInt8, Pointer[FILE]](arg)
+# ===--------------------------------------------------------------------===#
+# fclose
+# ===-----------------------------------------------------------------------===#
+fn fclose(arg: Pointer[FILE]) -> Int32:
+    return external_call["fclose", Int32](arg)
+# ===--------------------------------------------------------------------===#
+# feof
+# ===-----------------------------------------------------------------------===#
+fn feof(arg: Pointer[FILE]) -> Int32:
+    return external_call["feof", Int32, Pointer[FILE]](arg)
+# ===--------------------------------------------------------------------===#
+# ferror
+# ===-----------------------------------------------------------------------===#
+fn ferror(arg: Pointer[FILE]) -> Int32:
+    return external_call["ferror", Int32, Pointer[FILE]](arg)
+# ===--------------------------------------------------------------------===#
+# fflush
+# ===-----------------------------------------------------------------------===#
+fn fflush(arg: Pointer[FILE]) -> Int32:
+    return external_call["fflush", Int32, Pointer[FILE]](arg)
+# ===--------------------------------------------------------------------===#
+# fgetc
+# ===-----------------------------------------------------------------------===#
+fn fgetc(arg: Pointer[FILE]) -> Int32:
+    return external_call["fgetc", Int32, Pointer[FILE]](arg)
+# ===--------------------------------------------------------------------===#
+# fopen
+# ===-----------------------------------------------------------------------===#
+fn fopen(__filename: Pointer[UInt8], __mode: Pointer[UInt8]) -> Pointer[FILE]:
+    return external_call["fopen", Pointer[FILE], Pointer[UInt8], Pointer[UInt8]](
+        __filename, __mode
+    )
+# ===--------------------------------------------------------------------===#
+# fread
+# ===-----------------------------------------------------------------------===#
+fn fread(
+    __ptr: Pointer[UInt8], __size: UInt64, __nitems: UInt64, __stream: Pointer[FILE]
+) -> UInt64:
+    return external_call[
+        "fread", UInt64, Pointer[UInt8], UInt64, UInt64, Pointer[FILE]
+    ](__ptr, __size, __nitems, __stream)

read/libc/string/__init__.mojo ADDED Viewed

	@@ -0,0 +1,5 @@

+from memory.unsafe import Pointer
+fn strnlen(pointer: Pointer[UInt8]) -> Int:
+    return external_call["strnlen", Int, Pointer[UInt8]](pointer)

tokenizer.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74f899535f8f99cdeac697474b697f3fa40467b8376dab7b6c4d8c753f28ae9f
+size 432717