Aananda-giri committed on
Commit 9019f12 · 1 Parent(s): 47abc2a

debug tokenizer not found

Files changed (1)
  1. app.py +37 -34
app.py CHANGED
@@ -2,6 +2,42 @@
  import gradio as gr

  # ==================================================================================-
+ # inference code part-1
+ # -------------------------------------------------
+ # 1. Download the model weights (from huggingface)
+ # -------------------------------------------------
+ from huggingface_hub import hf_hub_download
+ hf_hub_download(repo_id="Aananda-giri/LLAMA3-Nepali", filename="parameters_300m/model_pg_398000_steps.pth", local_dir="./")
+
+ # ----------------------
+ # 2. Load The tokenizer
+ # ----------------------
+ from transformers import PreTrainedTokenizerFast
+
+ # Load the tokenizer
+ tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/LLAMA3-Nepali")
+ tokenizer.save_pretrained("NepaliBPE")
+
+
+ # Llama 3.2 ~300M Scaled Version
+ LLAMA32_CONFIG = {
+     "vocab_size": 50006,        # <len(tokenizer.tokenizer)=50006> 128_256 reduced vocabulary size
+     "context_length": 512,      # 131_072 reduced Context length (unrelated to model size but higher context length consumes more RAM)
+     "emb_dim": 1320,            # 2048 reduced Embedding dimension
+     "n_heads": 20,              # 32 reduced Number of attention heads
+     "n_layers": 10,             # 16 reduced Number of layers
+     "hidden_dim": 5280,         # 8192 Size of the intermediate dimension in FeedForward
+     "n_kv_groups": 5,           # 8 Key-Value groups for grouped-query attention
+     "rope_base": 500_000.0,     # 500_000 The base in RoPE's "theta"
+     "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
+     "rope_freq": {              # RoPE frequency scaling
+         "factor": 32.0,
+         "low_freq_factor": 1.0,
+         "high_freq_factor": 4.0,
+         "original_context_length": 8192,
+     }
+ }
+
  # ==================================================================================-
  # ==================================================================================-
  # ==================================================================================-
@@ -435,7 +471,7 @@ class ChatFormat:
          return self.tokenizer.decode(token_ids)

  # tokenizer = Tokenizer("tokenizer.json")
- chat_tokenizer = ChatFormat(tokenizer)
+ # chat_tokenizer = ChatFormat(tokenizer)

  # text = "नेपाल विद्युत प्राधिकरणका कार्यकारी निर्देशक कुलमान घिसिङले माथिल्लो अरुण जलविद्युत आयोजना विश्व बैंक र एडीबीबाट वित्तीय व्यवस्थापन नभए नेपाली जनताको लगानीमा बनाउने तयारी रहेको बताएका छन् ।"
  # # normal tokenizer
@@ -870,40 +906,7 @@ def generate_chat_optimized(
  # generate_and_print_sample
  # )

- # -------------------------------------------------
- # 1. Download the model weights (from huggingface)
- # -------------------------------------------------
- from huggingface_hub import hf_hub_download
- hf_hub_download(repo_id="Aananda-giri/LLAMA3-Nepali", filename="parameters_300m/model_pg_398000_steps.pth", local_dir="./")

- # ----------------------
- # 2. Load The tokenizer
- # ----------------------
- from transformers import PreTrainedTokenizerFast
-
- # Load the tokenizer
- tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/LLAMA3-Nepali")
- tokenizer.save_pretrained("NepaliBPE")
-
-
- # Llama 3.2 ~300M Scaled Version
- LLAMA32_CONFIG = {
-     "vocab_size": 50006,        # <len(tokenizer.tokenizer)=50006> 128_256 reduced vocabulary size
-     "context_length": 512,      # 131_072 reduced Context length (unrelated to model size but higher context length consumes more RAM)
-     "emb_dim": 1320,            # 2048 reduced Embedding dimension
-     "n_heads": 20,              # 32 reduced Number of attention heads
-     "n_layers": 10,             # 16 reduced Number of layers
-     "hidden_dim": 5280,         # 8192 Size of the intermediate dimension in FeedForward
-     "n_kv_groups": 5,           # 8 Key-Value groups for grouped-query attention
-     "rope_base": 500_000.0,     # 500_000 The base in RoPE's "theta"
-     "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
-     "rope_freq": {              # RoPE frequency scaling
-         "factor": 32.0,
-         "low_freq_factor": 1.0,
-         "high_freq_factor": 4.0,
-         "original_context_length": 8192,
-     }
- }

  old_context_length = 131_072 # original context length of llama3.2 model
  new_context_length = LLAMA32_CONFIG["context_length"] # 512 our new context length
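The commit message ("debug tokenizer not found") together with the moved block suggests the fix is simply to download the checkpoint and load the tokenizer before anything else in app.py touches them. A minimal sanity-check sketch under that assumption; it reuses only the calls already shown in the diff, and the ckpt_path name and assert messages are illustrative, not from the repository:

import os
from huggingface_hub import hf_hub_download
from transformers import PreTrainedTokenizerFast

# Download the checkpoint; hf_hub_download returns the local file path.
ckpt_path = hf_hub_download(
    repo_id="Aananda-giri/LLAMA3-Nepali",
    filename="parameters_300m/model_pg_398000_steps.pth",
    local_dir="./",
)

# Load the tokenizer from the Hub and keep a local copy, as in the diff.
tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/LLAMA3-Nepali")
tokenizer.save_pretrained("NepaliBPE")

# Fail early with a clear message instead of a later "tokenizer not found".
assert os.path.exists(ckpt_path), "model checkpoint was not downloaded"
assert os.path.isdir("NepaliBPE"), "tokenizer was not saved to ./NepaliBPE"

The trailing context lines (old_context_length / new_context_length) hint that the script next rescales RoPE's theta base for the reduced 512-token context. A plausible sketch of that step, assuming a helper named rescale_theta (the helper and its use are assumptions; only the two length variables and LLAMA32_CONFIG are visible in this diff):

def rescale_theta(theta_old: float, context_length_old: int, context_length_new: int) -> float:
    # Scale RoPE's theta base in proportion to the context-length reduction (131_072 -> 512).
    scaling_factor = context_length_new / context_length_old
    return theta_old * scaling_factor

old_context_length = 131_072                           # original context length of llama3.2 model
new_context_length = LLAMA32_CONFIG["context_length"]  # 512, the reduced context length
LLAMA32_CONFIG["rope_base"] = rescale_theta(
    LLAMA32_CONFIG["rope_base"], old_context_length, new_context_length
)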