import fitz # PyMuPDF for PDF extraction
import re
import unsloth
import os
from huggingface_hub import login
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import gradio as gr
from transformers import pipeline
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
from trl import SFTTrainer
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file"""
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text.strip()
def preprocess_text(text):
    """Basic text preprocessing"""
    return re.sub(r"\s+", " ", text).strip()
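# Extract the raw text from the source PDF (assumed to sit next to this script) and normalize its whitespace.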
pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf")
clean_text = preprocess_text(pdf_text)
# Read the Hugging Face token from environment variables
hf_token = os.getenv("access_token")
if hf_token is None:
    raise ValueError("'access_token' is not set. Add it as a secret variable in Hugging Face Spaces.")
# Log in to Hugging Face
login(token=hf_token)
#model_name = "meta-llama/Llama-2-7b-hf" # You can use a smaller one like "meta-llama/Llama-2-7b-chat-hf"
model_name = "unsloth/llama-2-7b-chat"
#tokenizer = AutoTokenizer.from_pretrained(model_name)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
)
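# Attach LoRA adapters. Called without arguments, Unsloth's get_peft_model falls back to its
# built-in LoRA defaults (the peft LoraConfig imported above is not used here).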
model = FastLanguageModel.get_peft_model(model)
# Create dataset
data = {"text": [clean_text]}
dataset = Dataset.from_dict(data)
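# Note: the entire cleaned text is a single training example, so anything past max_seq_length
# tokens is truncated unless the text is split into chunks first.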
# Set a padding token manually
tokenizer.pad_token = tokenizer.eos_token # Use EOS as PAD token
# Alternatively, add a new custom pad token
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Tokenization function
def tokenize_function(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()  # Use input as labels for text generation
    return tokens
tokenized_datasets = dataset.map(tokenize_function, batched=True)
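# NOTE: tokenized_datasets is not passed to the trainer below; SFTTrainer tokenizes the raw
# "text" field itself via dataset_text_field.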
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)
def perform_training():
    trainer.train()

perform_training()
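# Save the fine-tuned model and tokenizer locally, then push both to the Hugging Face Hub.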
model.save_pretrained("./llama-2-7b-chat_fine_tuned")
tokenizer.save_pretrained("./llama-2-7b-chat_fine_tuned")
model.push_to_hub("jonACE/llama-2-7b-chat_fine_tuned", token=hf_token)
tokenizer.push_to_hub("jonACE/llama-2-7b-chat_fine_tuned", token=hf_token)
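# Quick sanity check of the fine-tuned model still in memory (a minimal sketch; the prompt and
# decoding settings below are illustrative, not part of the original run).
FastLanguageModel.for_inference(model)  # switch the Unsloth model into inference mode
sample_prompt = "In the beginning"
sample_inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
sample_outputs = model.generate(**sample_inputs, max_new_tokens=64, do_sample=True, temperature=0.7)
print(tokenizer.decode(sample_outputs[0], skip_special_tokens=True))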
# save GGUF versions
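# The first export passes no quantization_method, so Unsloth's default GGUF quantization applies
# (q8_0 in recent versions, if memory serves); f16 and q4_k_m exports follow.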
model.save_pretrained_gguf("./llama-2-7b-chat_fine_tuned", tokenizer)
model.push_to_hub_gguf("jonACE/llama-2-7b-chat_fine_tuned", tokenizer, token=hf_token)
model.save_pretrained_gguf("./llama-2-7b-chat_fine_tuned", tokenizer, quantization_method = "f16")
model.push_to_hub_gguf("jonACE/llama-2-7b-chat_fine_tuned", tokenizer, quantization_method = "f16", token=hf_token)
model.save_pretrained_gguf("./llama-2-7b-chat_fine_tuned", tokenizer, quantization_method = "q4_k_m")
model.push_to_hub_gguf("jonACE/llama-2-7b-chat_fine_tuned", tokenizer, quantization_method = "q4_k_m", token=hf_token)
# CHATBOT START
# chatbot = pipeline("text-generation", model="jonACE/llama-2-7b-chat_fine_tuned")
#def chatbot_response(prompt):
#    result = chatbot(prompt, max_length=100, do_sample=True, temperature=0.7)
#    return result[0]["generated_text"]
#iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
#iface.launch()