import json

import torch
from torch.utils.data import Dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    GenerationConfig,
    pipeline,
)
import gradio as gr


class MyTextDataset(Dataset):
    """
    Deliberately minimal dataset. For real training you would:
      - use real text data,
      - likely build the dataset with the HF `datasets` library,
      - tokenize in fixed-size chunks, etc.
    A hedged sketch of that approach follows this class.
    """

    def __init__(self, tokenizer, texts, block_size=128):
        self.examples = []
        for txt in texts:
            # Tokenize each text on its own, truncating to block_size tokens.
            tokens = tokenizer(txt, truncation=True, max_length=block_size)
            self.examples.append(tokens["input_ids"])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return torch.tensor(self.examples[idx], dtype=torch.long)
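

# A hedged sketch of the "real" pipeline the docstring above alludes to:
# load raw text with the HF `datasets` library and tokenize it in chunks.
# The file name passed via `data_file` is a placeholder, not part of this
# project; `datasets` is imported lazily so the rest of the script runs
# without it installed.
def build_chunked_dataset(tokenizer, block_size=128, data_file="corpus.txt"):
    from datasets import load_dataset

    raw = load_dataset("text", data_files={"train": data_file})

    def tokenize_fn(batch):
        return tokenizer(batch["text"], truncation=True, max_length=block_size)

    # Batched map tokenizes many lines at once; dropping the raw "text"
    # column leaves only the token fields the data collator understands.
    return raw["train"].map(tokenize_fn, batched=True, remove_columns=["text"])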


def train_model(
    model_name_or_path="wuhp/myr1",
    subfolder="myr1",
    output_dir="finetuned_myr1",
    epochs=1
):
    """
    Loads the custom model from the Hugging Face Hub and runs a quick
    `Trainer` pass to fine-tune it on a few mock texts.

    - model_name_or_path: Hugging Face repo ID (or local folder).
    - subfolder: if the model config/weights live in a subfolder
      within that repo, specify it here.
    - output_dir: where to save the final trained model.
    - epochs: number of epochs for this mock training run.
    """
    config = AutoConfig.from_pretrained(
        model_name_or_path,
        subfolder=subfolder,
        trust_remote_code=True
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        subfolder=subfolder,
        trust_remote_code=True
    )
    # Causal-LM tokenizers often ship without a pad token; the data collator
    # below needs one, so fall back to the EOS token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        subfolder=subfolder,
        config=config,
        # fp16 weights only make sense on GPU; use fp32 on CPU.
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )

    # Mock data; swap in a real corpus for meaningful results.
    train_texts = [
        "Hello from DeepSeek!",
        "The sky is blue.",
        "Large language models can do amazing things."
    ]
    eval_texts = [
        "Testing is essential for robust code.",
        "Generative AI is fun."
    ]
    train_dataset = MyTextDataset(tokenizer, train_texts)
    eval_dataset = MyTextDataset(tokenizer, eval_texts)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=1,
        gradient_accumulation_steps=1,
        fp16=torch.cuda.is_available(),
    )

    # mlm=False yields plain causal-LM batches: labels are the input ids,
    # and the model's internal shift handles next-token prediction.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    trainer.train()

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return trainer
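

# Hedged usage note: the Trainer returned by train_model() can also report
# eval loss, which converts to perplexity as exp(loss). For example:
#
#     import math
#     trainer = train_model()
#     metrics = trainer.evaluate()
#     print(f"perplexity: {math.exp(metrics['eval_loss']):.2f}")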


def create_gradio_demo(
    model_name_or_path="finetuned_myr1",
    generation_config_path=None
):
    """
    Loads a (fine-tuned) model from a local folder or the Hub, sets up
    a text-generation pipeline, and returns a Gradio interface.
    """
    config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        config=config,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )

    if generation_config_path:
        # Build the GenerationConfig from the raw JSON file; from_dict is
        # stable across transformers versions.
        with open(generation_config_path) as f:
            gen_config = GenerationConfig.from_dict(json.load(f))
    else:
        # Fall back to generation defaults derived from the model config.
        gen_config = GenerationConfig.from_model_config(config)
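
    # For reference, a minimal JSON file for the branch above might contain
    # (the keys are standard GenerationConfig fields; values are illustrative):
    #
    #     {"max_new_tokens": 64, "temperature": 0.7, "top_p": 0.95, "do_sample": true}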

    text_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        generation_config=gen_config,
    )

    def predict(prompt, max_new_tokens=64, temperature=0.7, top_p=0.95):
        """
        Generates text from the model for a given user prompt.
        """
        outputs = text_pipeline(
            prompt,
            max_new_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            # Sampling must be on for temperature/top_p to have any effect.
            do_sample=True
        )
        return outputs[0]["generated_text"]

    with gr.Blocks() as demo:
        gr.Markdown("## DeepSeek LLM Demo")
        prompt = gr.Textbox(label="Enter your prompt:")
        max_new_tokens = gr.Slider(1, 512, step=1, value=64, label="Max New Tokens")
        # Keep temperature/top_p strictly positive; sampling at 0 is undefined.
        temperature = gr.Slider(0.1, 1.5, step=0.1, value=0.7, label="Temperature")
        top_p = gr.Slider(0.05, 1.0, step=0.05, value=0.95, label="Top-p")
        output = gr.Textbox(label="Generated Text")

        generate_btn = gr.Button("Generate")
        generate_btn.click(
            fn=predict,
            inputs=[prompt, max_new_tokens, temperature, top_p],
            outputs=output
        )
    return demo


if __name__ == "__main__":
    print("Starting mock training on wuhp/myr1 (subfolder myr1)...")
    trainer = train_model(
        model_name_or_path="wuhp/myr1",
        subfolder="myr1",
        output_dir="finetuned_myr1",
        epochs=1
    )
    print("Training complete.")

    demo = create_gradio_demo(
        model_name_or_path="finetuned_myr1",
        generation_config_path=None
    )

    print("Launching Gradio demo on http://127.0.0.1:7860 ...")
    demo.launch()