sam2ai committed on
Commit 2ef266f · 1 Parent(s): 94690c2

Update app.py


Adding Llama inference script for Odia

Files changed (1)
  1. app.py +147 -4
app.py CHANGED
@@ -1,7 +1,150 @@
  import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ # import torch
+ from peft import PeftModel
+ import torch
+ import transformers
  import gradio as gr

+ assert (
+     "LlamaTokenizer" in transformers._import_structure["models.llama"]
+ ), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
+ from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

+ tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
+
+ BASE_MODEL = "decapoda-research/llama-7b-hf"
+ LORA_WEIGHTS = "OdiaGenAI/odiagenAI-model-v0"
+
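+ # Pick a device: prefer CUDA, then Apple MPS, otherwise fall back to CPU.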
+ if torch.cuda.is_available():
+     device = "cuda"
+ else:
+     device = "cpu"
+
+ try:
+     if torch.backends.mps.is_available():
+         device = "mps"
+ except:
+     pass
+
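+ # Load the base LLaMA weights and apply the OdiaGenAI LoRA adapter via PEFT.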
+ if device == "cuda":
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
+         load_in_8bit=False,
+         torch_dtype=torch.float16,
+         device_map="auto",
+     )
+     model = PeftModel.from_pretrained(
+         model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
+     )
+ elif device == "mps":
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL,
+         device_map={"": device},
+         torch_dtype=torch.float16,
+     )
+     model = PeftModel.from_pretrained(
+         model,
+         LORA_WEIGHTS,
+         device_map={"": device},
+         torch_dtype=torch.float16,
+     )
+ else:
+     model = LlamaForCausalLM.from_pretrained(
+         BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
+     )
+     model = PeftModel.from_pretrained(
+         model,
+         LORA_WEIGHTS,
+         device_map={"": device},
+     )
+
+
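+ # Build an Alpaca-style prompt; the instruction/response template is written in Odia.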
+ def generate_prompt(instruction, input=None):
+     if input:
+         return f"""ନିମ୍ନରେ ଏକ ନିର୍ଦ୍ଦେଶନାମା ଯାହାକି ଏକ କାର୍ଯ୍ୟକୁ ବର୍ଣ୍ଣନା କରେ, ଏକ ଇନପୁଟ୍ ସହିତ ଯୋଡି ଯାହା ପରବର୍ତ୍ତୀ ପ୍ରସଙ୍ଗ ପ୍ରଦାନ କରେ | ଏକ ପ୍ରତିକ୍ରିୟା ଲେଖନ୍ତୁ ଯାହା ଅନୁରୋଧକୁ ସଠିକ୍ ଭାବରେ ସମାପ୍ତ କରେ |
+ ### ନିର୍ଦ୍ଦେଶ:
+ {instruction}
+ ### ଇନପୁଟ୍:
+ {input}
+ ### ପ୍ରତିକ୍ରିୟା:"""
+     else:
+         return f"""ନିମ୍ନରେ ଏକ ନିର୍ଦ୍ଦେଶ ଯାହାକି ଏକ କାର୍ଯ୍ୟକୁ ବର୍ଣ୍ଣନା କରେ | ଏକ ପ୍ରତିକ୍ରିୟା ଲେଖନ୍ତୁ ଯାହା ଅନୁରୋଧକୁ ସଠିକ୍ ଭାବରେ ସମାପ୍ତ କରେ |
+ ### ନିର୍ଦ୍ଦେଶ:
+ {instruction}
+ ### ପ୍ରତିକ୍ରିୟା:"""
+
+
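+ # Use half precision off-CPU, switch to eval mode, and compile on PyTorch >= 2.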
+ if device != "cpu":
+     model.half()
+ model.eval()
+ if torch.__version__ >= "2":
+     model = torch.compile(model)
+
+
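+ # Generate a response for one instruction (and optional input), returning the text after the response marker.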
+ def evaluate(
+     instruction,
+     input=None,
+     temperature=0.1,
+     top_p=0.75,
+     top_k=40,
+     num_beams=4,
+     max_new_tokens=128,
+     **kwargs,
+ ):
+     prompt = generate_prompt(instruction, input)
+     print(prompt)
+     inputs = tokenizer(prompt, return_tensors="pt")
+     print(inputs)
+     input_ids = inputs["input_ids"].to(device)
+     print(input_ids)
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         num_beams=num_beams,
+         **kwargs,
+     )
+     with torch.no_grad():
+         generation_output = model.generate(
+             input_ids=input_ids,
+             generation_config=generation_config,
+             return_dict_in_generate=True,
+             output_scores=True,
+             max_new_tokens=max_new_tokens,
+         )
+     print(generation_output)
+     s = generation_output.sequences[0]
+     print(s)
+     output = tokenizer.decode(s)
+     print(output)
+     return output.split("### ପ୍ରତିକ୍ରିୟା:")[1].strip()
+
+
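+ # Gradio UI: instruction/input textboxes and decoding sliders wired to evaluate().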
+ g = gr.Interface(
+     fn=evaluate,
+     inputs=[
+         gr.components.Textbox(
+             lines=2, label="Instruction", placeholder="Tell me about alpacas."
+         ),
+         gr.components.Textbox(lines=2, label="Input", placeholder="none"),
+         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
+         gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
+         gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
+         gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
+         gr.components.Slider(
+             minimum=1, maximum=512, step=1, value=128, label="Max tokens"
+         ),
+     ],
+     outputs=[
+         gr.components.Textbox(
+             lines=5,
+             label="Output",
+         )
+     ],
+     title="🦙🌲 OdiaGenAI-LoRA",
+     description="OdiaGenAI-LoRA is a 7B-parameter LLaMA model fine-tuned to follow Odia instructions. It is trained on an Odia translation of the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset and uses the Hugging Face LLaMA implementation. For more information, please visit [the project's website](https://github.com/shantipriyap/OdiaGenAI).",
+ )
+ g.queue(concurrency_count=1)
+ g.launch()