# Import unsloth before transformers/trl so its patches are applied first
import unsloth
from unsloth import FastLanguageModel, is_bfloat16_supported

import os
import re

import fitz  # PyMuPDF, used for PDF text extraction
import gradio as gr
import torch
from datasets import Dataset
from huggingface_hub import login
from transformers import TrainingArguments, pipeline
from trl import SFTTrainer

max_seq_length = 2048

def extract_text_from_pdf(pdf_path):
    """Extract the raw text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text.strip()


def preprocess_text(text):
    """Collapse runs of whitespace into single spaces."""
    return re.sub(r"\s+", " ", text).strip()


pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf")
clean_text = preprocess_text(pdf_text)
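
# Optional: a minimal chunking sketch (not wired into the pipeline below).
# The dataset further down holds the entire book as one example, so most of
# the text is truncated away at tokenization time. Splitting it into
# overlapping chunks would expose more of it to training. The helper name and
# the chunk/overlap sizes are illustrative assumptions, not part of the
# original script.
def chunk_text(text, chunk_size=2000, overlap=200):
    """Split text into overlapping character-level chunks."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return chunks

# Example usage (left commented out to preserve the original single-example dataset):
# data = {"text": chunk_text(clean_text)}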

hf_token = os.getenv("access_token")

if hf_token is None:
    raise ValueError("'access_token' is not set. Add it as a secret variable in Hugging Face Spaces.")

login(token=hf_token)

model_name = "unsloth/llama-2-7b-chat"

# Load the base model and tokenizer through Unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
)

# Attach LoRA adapters using Unsloth's default configuration
model = FastLanguageModel.get_peft_model(model)
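
# For reference, the LoRA configuration can also be spelled out explicitly.
# The values below are commonly used Unsloth/LoRA settings, shown as an
# illustrative sketch rather than the settings this script was tuned with:
# model = FastLanguageModel.get_peft_model(
#     model,
#     r=16,
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
#                     "gate_proj", "up_proj", "down_proj"],
#     lora_alpha=16,
#     lora_dropout=0,
#     bias="none",
#     use_gradient_checkpointing="unsloth",
#     random_state=3407,
# )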

# Wrap the cleaned text in a single-example dataset
data = {"text": [clean_text]}
dataset = Dataset.from_dict(data)

# Llama 2 ships without a pad token, so reuse the EOS token for padding
tokenizer.pad_token = tokenizer.eos_token

# Note: this tokenized dataset is kept from the original script but is not
# consumed by the SFTTrainer below, which tokenizes the raw `dataset` itself
# via `dataset_text_field`.
def tokenize_function(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_datasets = dataset.map(tokenize_function, batched=True)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)
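
# The effective batch size is per_device_train_batch_size (2) *
# gradient_accumulation_steps (4) = 8 sequences per optimizer step, and
# max_steps=60 keeps this to a short demonstration run; a fuller pass over the
# data would likely raise max_steps or switch to num_train_epochs.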

def perform_training():
    trainer.train()


perform_training()
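
# Optional sanity check after training: put Unsloth into inference mode and
# generate a short completion. This is a minimal sketch; the default prompt
# and token budget are illustrative assumptions, not part of the original
# script.
def quick_generation_check(prompt="In the beginning"):
    FastLanguageModel.for_inference(model)  # enable Unsloth's faster generation path
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)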

# Save the LoRA adapter and tokenizer locally, then push them to the Hub
model.save_pretrained("./llama-2-7b-chat_fine_tuned")
tokenizer.save_pretrained("./llama-2-7b-chat_fine_tuned")

model.push_to_hub("jonACE/llama-2-7b-chat_fine_tuned", token=hf_token)
tokenizer.push_to_hub("jonACE/llama-2-7b-chat_fine_tuned", token=hf_token)

# Expose the fine-tuned model through a simple Gradio text interface
chatbot = pipeline("text-generation", model="jonACE/llama-2-7b-chat_fine_tuned")


def chatbot_response(prompt):
    result = chatbot(prompt, max_length=100, do_sample=True, temperature=0.7)
    return result[0]["generated_text"]


iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
iface.launch()