import gradio as gr
import spaces
import os
import gc
import random
import warnings

warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import tokenizers
import transformers

print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
print(f"torch.__version__: {torch.__version__}")
print(f"torch cuda version: {torch.version.cuda}")
from transformers import AutoTokenizer, AutoConfig
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, MistralForCausalLM
from peft import LoraConfig, get_peft_model
title = "H2O AI Predict the LLM"
description = "The objective of this [competition](https://www.kaggle.com/competitions/h2oai-predict-the-llm) was to \
detect which of 7 possible LLMs produced a particular response. \n\n\
This demo uses a fine-tuned HuggingFaceH4/zephyr-7b-beta model for the multiclass classification task. \n\n \
We ranked 3rd out of more than 100 participants; our team's solution is [here](https://www.kaggle.com/competitions/h2oai-predict-the-llm/discussion/453728)."
title = title + "\n" + description
# Theme from https://huggingface.co/spaces/trl-lib/stack-llama/blob/main/app.py
theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_sm,
    font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
)
### Load the model
class CFG:
    num_workers = os.cpu_count()
    llm_backbone = "save_pretrained_model/zephyr-7b-beta"
    tokenizer_path = "HuggingFaceH4/zephyr-7b-beta"
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path, add_prefix_space=False, use_fast=True, trust_remote_code=True, add_eos_token=True
    )
    batch_size = 1
    max_len = 650
    seed = 42
    num_labels = 7
    lora = True
    lora_r = 4
    lora_alpha = 16
    lora_dropout = 0.05
    lora_target_modules = ""
    gradient_checkpointing = True
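# Note: CFG collects the inference-time settings; max_len caps the tokenized
# question+response at 650 tokens and num_labels matches the 7 candidate LLMs.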
class CustomModel(nn.Module):
    """
    Multiclass classifier built on a causal LM backbone: the backbone's
    last-token logits are projected onto CFG.num_labels classes.
    """

    def __init__(self):
        super().__init__()
        self.backbone_config = AutoConfig.from_pretrained(
            CFG.llm_backbone, trust_remote_code=True
        )
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            CFG.llm_backbone,
            config=self.backbone_config,
            quantization_config=quantization_config,
        )
        if CFG.lora:
            # Collect every linear/conv layer (except heads) as a LoRA target.
            target_modules = []
            for name, module in self.model.named_modules():
                if (
                    isinstance(module, (torch.nn.Linear, torch.nn.Conv1d))
                    and "head" not in name
                ):
                    name = name.split(".")[-1]
                    if name not in target_modules:
                        target_modules.append(name)
            lora_config = LoraConfig(
                r=CFG.lora_r,
                lora_alpha=CFG.lora_alpha,
                target_modules=target_modules,
                lora_dropout=CFG.lora_dropout,
                bias="none",
                task_type="CAUSAL_LM",
            )
            if CFG.gradient_checkpointing:
                self.model.enable_input_require_grads()
            self.model = get_peft_model(self.model, lora_config)
            self.model.print_trainable_parameters()
        self.classification_head = nn.Linear(
            self.backbone_config.vocab_size, CFG.num_labels, bias=False
        )
        self._init_weights(self.classification_head)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, batch):
        # Disable cache while gradient checkpointing is enabled.
        if CFG.gradient_checkpointing:
            self.model.config.use_cache = False
        self.model.config.pretraining_tp = 1
        output = self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        # Project the last token's vocabulary logits onto the class labels.
        output.logits = self.classification_head(output[0][:, -1].float())
        # Re-enable cache after the forward pass.
        if CFG.gradient_checkpointing:
            self.model.config.use_cache = True
        return output.logits
model = CustomModel()
### End Load the model
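# Shape sketch (illustrative only, not executed here): for a batch passed through
# CustomModel, the tensors flow roughly as
#   output[0]           -> (batch, seq_len, vocab_size)  backbone token logits
#   output[0][:, -1]    -> (batch, vocab_size)           last-token logits
#   classification_head -> (batch, CFG.num_labels)       scores for the 7 LLM classes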
def do_inference(full_text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_paths = [
        "model_finetuned/HuggingFaceH4-zephyr-7b-beta_fold0_best.pth"
    ]

    def prepare_input(cfg, text):
        # Tokenize and pad/truncate the concatenated question + response.
        inputs = cfg.tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.max_len,
            padding="max_length",
            truncation="longest_first",
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    # Load the fine-tuned weights into the globally constructed model.
    state = torch.load(model_paths[0], map_location=torch.device("cpu"))
    model.load_state_dict(state["model"], strict=False)
    model.eval()
    model.to(device)

    inputs = prepare_input(CFG, full_text)
    inputs["input_ids"] = inputs["input_ids"].reshape(1, -1).to(device)
    inputs["attention_mask"] = inputs["attention_mask"].reshape(1, -1).to(device)

    with torch.no_grad():
        with torch.cuda.amp.autocast(enabled=True, dtype=torch.float16, cache_enabled=True):
            y_preds = model(inputs)

    y_preds = y_preds.detach().to("cpu").numpy().astype(np.float32)
    y_preds = torch.softmax(torch.tensor(y_preds), 1).numpy()
    result = np.argmax(y_preds)

    labels = [
        "0. llama2-70b-chat",
        "1. wizardLM-13b",
        "2. llama2-13b-chat",
        "3. wizardLM-70b",
        "4. llama2-7b-chat",
        "5. tinyllama-1b-chat",
        "6. mistral-7b-openorca",
    ]
    return labels[result] if 0 <= result < len(labels) else "Error"


def do_submit(question, response):
    full_text = question + " " + response
    result = do_inference(full_text)
    return result
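# Example (illustrative values): do_submit("Explain photosynthesis.", "Photosynthesis is ...")
# returns one of the seven label strings above, e.g. "4. llama2-7b-chat",
# depending on which LLM the classifier predicts produced the response.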
def greet():
    pass


with gr.Blocks(title=title) as demo:  # theme=theme
    sample_examples = pd.read_csv("sample_examples.csv")
    example_list = sample_examples[["Question", "Response", "target"]].sample(2).values.tolist()

    gr.Markdown(f"## {title}")
    with gr.Row():
        question_text = gr.Textbox(lines=2, placeholder="Question:", label="")
        response_text = gr.Textbox(lines=2, placeholder="Response:", label="")
        target_text = gr.Textbox(lines=1, placeholder="Target:", label="", interactive=False, visible=False)
        llm_num = gr.Textbox(value="", label="LLM #")
    with gr.Row():
        sub_btn = gr.Button("Submit")
        sub_btn.click(fn=do_submit, inputs=[question_text, response_text], outputs=[llm_num])
    gr.Markdown("## Sample Inputs:")
    gr.Examples(
        example_list,
        [question_text, response_text, target_text],
        # cache_examples=True,
    )

demo.launch()