VanguardAI committed · Commit 67b1882 · verified · 1 Parent(s): d5262d8

Update app.py

Files changed (1): app.py +64 -29
app.py CHANGED
@@ -4,9 +4,21 @@ import re
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
 import os
-READ_HF = os.environ["read_hf"]
+import logging
 from unsloth import FastLanguageModel
 
+# Set up logging
+logging.basicConfig(
+    level=logging.DEBUG,  # Set the logging level to DEBUG to capture all messages
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler()  # Logs will be output to the console
+    ]
+)
+logger = logging.getLogger(__name__)
+
+READ_HF = os.environ["read_hf"]
+
 alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 
 ### Instruction:
@@ -65,43 +77,63 @@ Category List : ["Dairy & Eggs", "Beverages & Snacks", "Cleaning & Hygiene", "Gr
 
 @spaces.GPU()
 def chunk_it(inventory_list, user_input_text):
-    print("Loading model and tokenizer...")
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name = "VanguardAI/CoT_multi_llama_LoRA_4bit",
-        max_seq_length = 2048,
-        dtype = torch.bfloat16,
-        load_in_4bit = True,
-        token = READ_HF
-    )
-    print("Model and tokenizer loaded.")
-
-    print("Enabling native 2x faster inference...")
-    FastLanguageModel.for_inference(model)
-    print("Inference enabled.")
+    logger.info("Loading model and tokenizer...")
+    try:
+        model, tokenizer = FastLanguageModel.from_pretrained(
+            model_name = "VanguardAI/CoT_multi_llama_LoRA_4bit",
+            max_seq_length = 2048,
+            dtype = torch.bfloat16,
+            load_in_4bit = True,
+            token = READ_HF
+        )
+        logger.info("Model and tokenizer loaded.")
+    except Exception as e:
+        logger.error(f"Failed to load model and tokenizer: {e}")
+        raise
+
+    logger.info("Enabling native 2x faster inference...")
+    try:
+        FastLanguageModel.for_inference(model)
+        logger.info("Inference enabled.")
+    except Exception as e:
+        logger.error(f"Failed to enable native inference: {e}")
+        raise
 
     formatted_prompt = alpaca_prompt.format(
         string + inventory_list, # instruction
         user_input_text, # input
         "", # output - leave this blank for generation!
     )
-    print("Formatted prompt: ", formatted_prompt)
-
-    inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
-    print("Tokenized inputs: ", inputs)
-
-    print("Generating output...")
-    outputs = model.generate(**inputs, max_new_tokens=216, use_cache=True)
-    print("Output generated.")
-
-    reply = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-    print("Decoded output: ", reply)
+    logger.debug(f"Formatted prompt: {formatted_prompt}")
+
+    try:
+        inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
+        logger.debug(f"Tokenized inputs: {inputs}")
+    except Exception as e:
+        logger.error(f"Failed to tokenize inputs: {e}")
+        raise
+
+    logger.info("Generating output...")
+    try:
+        outputs = model.generate(**inputs, max_new_tokens=216, use_cache=True)
+        logger.info("Output generated.")
+    except Exception as e:
+        logger.error(f"Failed to generate output: {e}")
+        raise
+
+    try:
+        reply = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        logger.debug(f"Decoded output: {reply}")
+    except Exception as e:
+        logger.error(f"Failed to decode output: {e}")
+        raise
 
     # Uncomment the following lines if further processing of the reply is needed
     # pattern = r"### Response:\n(.*?)<\|end_of_text\|>"
     # match = re.search(pattern, reply[0], re.DOTALL)
     # reply = match.group(1).strip()
 
-    print("Final reply: ", reply)
+    logger.debug(f"Final reply: {reply}")
     return reply
 
 # Interface for inputs
@@ -115,6 +147,9 @@ iface = gr.Interface(
     title="Testing",
 )
 
-print("Launching Gradio interface...")
-iface.launch(inline=False)
-print("Gradio interface launched.")
+logger.info("Launching Gradio interface...")
+try:
+    iface.launch(inline=False)
+    logger.info("Gradio interface launched.")
+except Exception as e:
+    logger.error(f"Failed to launch Gradio interface: {e}")
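For reference, a minimal standalone sketch (not part of the commit) of the print-to-logging pattern this diff adopts: configure logging.basicConfig once at module load, obtain a named logger, and wrap failure-prone steps in try/except blocks that log the error. The simulated failure below is purely illustrative.

import logging

logging.basicConfig(
    level=logging.DEBUG,  # DEBUG so logger.debug(...) output appears alongside info/error
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()],  # emit records to the console
)
logger = logging.getLogger(__name__)

logger.info("Loading model and tokenizer...")
try:
    raise RuntimeError("simulated failure")  # illustrative stand-in for a failing model load
except Exception as e:
    logger.error(f"Failed to load model and tokenizer: {e}")  # the diff also re-raises at this point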
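The alpaca_prompt template is only partially visible in the hunk context, so the sketch below reconstructs it from the standard Alpaca layout (the ### Input: and ### Response: sections are assumptions, as are the placeholder instruction and input values) to show how chunk_it fills the three slots, leaving the output blank for generation.

# Sketch only: the template body beyond "### Instruction:" is assumed from the usual Alpaca format.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

formatted_prompt = alpaca_prompt.format(
    "Categorize the following inventory items.",  # placeholder instruction (the real code prepends the category list)
    "2x milk, 1x dish soap",                      # placeholder user input
    "",                                           # output slot left blank for generation
)
print(formatted_prompt)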