Aananda-giri committed on
Commit 9019f12 · 1 Parent(s): 47abc2a

debug tokenizer not found

Files changed (1)
  1. app.py +37 -34
app.py CHANGED
@@ -2,6 +2,42 @@
  import gradio as gr

  # ==================================================================================-
+ # inference code part-1
+ # -------------------------------------------------
+ # 1. Download the model weights (from huggingface)
+ # -------------------------------------------------
+ from huggingface_hub import hf_hub_download
+ hf_hub_download(repo_id="Aananda-giri/LLAMA3-Nepali", filename="parameters_300m/model_pg_398000_steps.pth", local_dir="./")
+
+ # ----------------------
+ # 2. Load The tokenizer
+ # ----------------------
+ from transformers import PreTrainedTokenizerFast
+
+ # Load the tokenizer
+ tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/LLAMA3-Nepali")
+ tokenizer.save_pretrained("NepaliBPE")
+
+
+ # Llama 3.2 ~300M Scaled Version
+ LLAMA32_CONFIG = {
+     "vocab_size": 50006,        # <len(tokenizer.tokenizer)=50006> 128_256 reduced vocabulary size
+     "context_length": 512,      # 131_072 reduced Context length (unrelated to model size but higher context length consumes more RAM)
+     "emb_dim": 1320,            # 2048 reduced Embedding dimension
+     "n_heads": 20,              # 32 reduced Number of attention heads
+     "n_layers": 10,             # 16 reduced Number of layers
+     "hidden_dim": 5280,         # 8192 Size of the intermediate dimension in FeedForward
+     "n_kv_groups": 5,           # 8 Key-Value groups for grouped-query attention
+     "rope_base": 500_000.0,     # 500_000 The base in RoPE's "theta"
+     "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
+     "rope_freq": {              # RoPE frequency scaling
+         "factor": 32.0,
+         "low_freq_factor": 1.0,
+         "high_freq_factor": 4.0,
+         "original_context_length": 8192,
+     }
+ }
+
  # ==================================================================================-
  # ==================================================================================-
  # ==================================================================================-
@@ -435,7 +471,7 @@ class ChatFormat:
          return self.tokenizer.decode(token_ids)

  # tokenizer = Tokenizer("tokenizer.json")
- chat_tokenizer = ChatFormat(tokenizer)
+ # chat_tokenizer = ChatFormat(tokenizer)

  # text = "नेपाल विद्युत प्राधिकरणका कार्यकारी निर्देशक कुलमान घिसिङले माथिल्लो अरुण जलविद्युत आयोजना विश्व बैंक र एडीबीबाट वित्तीय व्यवस्थापन नभए नेपाली जनताको लगानीमा बनाउने तयारी रहेको बताएका छन् ।"
  # # normal tokenizer
@@ -870,40 +906,7 @@ def generate_chat_optimized(
  # generate_and_print_sample
  # )

- # -------------------------------------------------
- # 1. Download the model weights (from huggingface)
- # -------------------------------------------------
- from huggingface_hub import hf_hub_download
- hf_hub_download(repo_id="Aananda-giri/LLAMA3-Nepali", filename="parameters_300m/model_pg_398000_steps.pth", local_dir="./")

- # ----------------------
- # 2. Load The tokenizer
- # ----------------------
- from transformers import PreTrainedTokenizerFast
-
- # Load the tokenizer
- tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/LLAMA3-Nepali")
- tokenizer.save_pretrained("NepaliBPE")
-
-
- # Llama 3.2 ~300M Scaled Version
- LLAMA32_CONFIG = {
-     "vocab_size": 50006,        # <len(tokenizer.tokenizer)=50006> 128_256 reduced vocabulary size
-     "context_length": 512,      # 131_072 reduced Context length (unrelated to model size but higher context length consumes more RAM)
-     "emb_dim": 1320,            # 2048 reduced Embedding dimension
-     "n_heads": 20,              # 32 reduced Number of attention heads
-     "n_layers": 10,             # 16 reduced Number of layers
-     "hidden_dim": 5280,         # 8192 Size of the intermediate dimension in FeedForward
-     "n_kv_groups": 5,           # 8 Key-Value groups for grouped-query attention
-     "rope_base": 500_000.0,     # 500_000 The base in RoPE's "theta"
-     "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
-     "rope_freq": {              # RoPE frequency scaling
-         "factor": 32.0,
-         "low_freq_factor": 1.0,
-         "high_freq_factor": 4.0,
-         "original_context_length": 8192,
-     }
- }

  old_context_length = 131_072 # original context length of llama3.2 model
  new_context_length = LLAMA32_CONFIG["context_length"] # 512 our new context length
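The commit message ("debug tokenizer not found") together with the moved block suggests the fix is simply to download the checkpoint and load the tokenizer before anything else in app.py touches them. A minimal sanity-check sketch under that assumption; it reuses only the calls already shown in the diff, and the ckpt_path name and assert messages are illustrative, not from the repository:

import os
from huggingface_hub import hf_hub_download
from transformers import PreTrainedTokenizerFast

# Download the checkpoint; hf_hub_download returns the local file path.
ckpt_path = hf_hub_download(
    repo_id="Aananda-giri/LLAMA3-Nepali",
    filename="parameters_300m/model_pg_398000_steps.pth",
    local_dir="./",
)

# Load the tokenizer from the Hub and keep a local copy, as in the diff.
tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/LLAMA3-Nepali")
tokenizer.save_pretrained("NepaliBPE")

# Fail early with a clear message instead of a later "tokenizer not found".
assert os.path.exists(ckpt_path), "model checkpoint was not downloaded"
assert os.path.isdir("NepaliBPE"), "tokenizer was not saved to ./NepaliBPE"

The trailing context lines (old_context_length / new_context_length) hint that the script next rescales RoPE's theta base for the reduced 512-token context. A plausible sketch of that step, assuming a helper named rescale_theta (the helper and its use are assumptions; only the two length variables and LLAMA32_CONFIG are visible in this diff):

def rescale_theta(theta_old: float, context_length_old: int, context_length_new: int) -> float:
    # Scale RoPE's theta base in proportion to the context-length reduction (131_072 -> 512).
    scaling_factor = context_length_new / context_length_old
    return theta_old * scaling_factor

old_context_length = 131_072                           # original context length of llama3.2 model
new_context_length = LLAMA32_CONFIG["context_length"]  # 512, the reduced context length
LLAMA32_CONFIG["rope_base"] = rescale_theta(
    LLAMA32_CONFIG["rope_base"], old_context_length, new_context_length
)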