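"""Fine-tune Llama-2-7b-chat on text extracted from a PDF (Unsloth + LoRA),
push the result to the Hugging Face Hub, and expose it as a Gradio chatbot."""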
import os
import re

import fitz  # PyMuPDF for PDF extraction

# Import Unsloth before transformers/peft so its runtime patches are applied.
from unsloth import FastLanguageModel, is_bfloat16_supported

import gradio as gr
from datasets import Dataset
from huggingface_hub import login
from transformers import TrainingArguments, pipeline
from trl import SFTTrainer
max_seq_length = 2048  # Any length works; Unsloth supports RoPE scaling internally.
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text.strip()
def preprocess_text(text):
    """Basic text preprocessing: collapse runs of whitespace and trim."""
    return re.sub(r"\s+", " ", text).strip()
pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf")
clean_text = preprocess_text(pdf_text)
# Read the Hugging Face token from environment variables
hf_token = os.getenv("access_token")
if hf_token is None:
    raise ValueError("'access_token' is not set. Add it as a secret variable in Hugging Face Spaces.")
# Log in to Hugging Face
login(token=hf_token)
#model_name = "meta-llama/Llama-2-7b-hf" # You can use a smaller one like "meta-llama/Llama-2-7b-chat-hf"
model_name = "unsloth/llama-2-7b-chat"
#tokenizer = AutoTokenizer.from_pretrained(model_name)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
)
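# Attach LoRA adapters; called without extra arguments this relies on Unsloth's
# default LoRA settings (pass r, lora_alpha, target_modules, etc. to override).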
model = FastLanguageModel.get_peft_model(model)
# Create dataset
data = {"text": [clean_text]}
dataset = Dataset.from_dict(data)
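# Note: the entire PDF becomes a single training example above. A rough sketch of
# an alternative (not used here) is to split the text into fixed-size chunks:
# chunks = [clean_text[i:i + 2000] for i in range(0, len(clean_text), 2000)]
# dataset = Dataset.from_dict({"text": chunks})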
# Set a padding token manually
tokenizer.pad_token = tokenizer.eos_token # Use EOS as PAD token
# Alternatively, add a new custom pad token
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Tokenization function
def tokenize_function(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()  # Use input as labels for text generation
    return tokens
tokenized_datasets = dataset.map(tokenize_function, batched=True)
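# Note: SFTTrainer below is given the raw `dataset` plus dataset_text_field, so it
# tokenizes internally; `tokenized_datasets` is not actually consumed by the trainer.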
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)
def perform_training():
    trainer.train()
perform_training()
model.save_pretrained("./llama-2-7b-chat_fine_tuned")
tokenizer.save_pretrained("./llama-2-7b-chat_fine_tuned")
model.push_to_hub("jonACE/llama-2-7b-chat_fine_tuned", token=hf_token)
tokenizer.push_to_hub("jonACE/llama-2-7b-chat_fine_tuned", token=hf_token)
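# Since `model` is a PEFT/LoRA model, save_pretrained/push_to_hub typically upload
# only the adapter weights rather than a merged full model.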
# CHATBOT START
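# Loading the pushed repo with pipeline() assumes transformers (with peft installed)
# can resolve the LoRA adapter against its base model; merging the adapter into the
# base model before pushing is an alternative if this fails.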
chatbot = pipeline("text-generation", model="jonACE/llama-2-7b-chat_fine_tuned")
def chatbot_response(prompt):
    result = chatbot(prompt, max_length=100, do_sample=True, temperature=0.7)
    return result[0]["generated_text"]
iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
iface.launch()