import torch
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

# Cap per-GPU memory, then build a per-device map for sharded loading.
max_memory = '40GB'
n_gpus = torch.cuda.device_count()
max_memory = {i: max_memory for i in range(n_gpus)}
print(f'Max memory: {max_memory}')

tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")

# Load the base LLaMA-7B weights in 8-bit (requires bitsandbytes) and shard across GPUs.
model = LlamaForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
    max_memory=max_memory,
)

# Apply the Alpaca-LoRA adapter weights on top of the base model.
model = PeftModel.from_pretrained(model, "tloen/alpaca-lora-7b")
def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. Answer step by step.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request. Answer step by step.
### Instruction:
{instruction}
### Response:"""
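# Illustrative only (the instruction below is a hypothetical example, not part of the original app):
# generate_prompt("List three uses of LoRA adapters.") returns the instruction-only
# template ending in "### Response:", which the model is asked to complete.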
generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    num_beams=4,
)
def evaluate(instruction, input=None):
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )
    for s in generation_output.sequences:
        output = tokenizer.decode(s)
        # Print only the text generated after the "### Response:" marker.
        print("Response:", output.split("### Response:")[1].strip())
import gradio as gr
def evaluate1(instruction):
    prompt = generate_prompt(instruction)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )
    for s in generation_output.sequences:
        output = tokenizer.decode(s)
        return output.split("### Response:")[1].strip()
inputs = gr.Textbox(lines=5, label="Instruction")
outputs = gr.Textbox(label="Response")
title = "LLaMA-7B Language Model"
description = "This is a LLaMA-7B language model fine-tuned on various text datasets to generate text for a given task. It was trained with PyTorch and is capable of generating high-quality, coherent text similar to human writing. The model is versatile and can be used for a variety of tasks, including text completion, summarization, and translation."
copyright = "Copyright Bhaskar Tripathi (2023)"

gr.Interface(
    evaluate1, inputs, outputs,
    title=title, description=description,
    article=copyright, allow_flagging="never",
).launch()
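# Optional (assumption, not part of the original app): when running outside
# Hugging Face Spaces, launch(share=True) returns a temporary public URL, e.g.
# gr.Interface(evaluate1, inputs, outputs).launch(share=True)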