import fitz # PyMuPDF for PDF extraction
import re
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file"""
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text.strip()

def preprocess_text(text):
    """Basic text preprocessing: collapse runs of whitespace"""
    return re.sub(r"\s+", " ", text).strip()
pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf")
clean_text = preprocess_text(pdf_text)
import os
from huggingface_hub import login
# Read the Hugging Face token from environment variables
hf_token = os.getenv("access_token")
if hf_token is None:
    raise ValueError("'access_token' is not set. Add it as a secret variable in Hugging Face Spaces.")
# Log in to Hugging Face
login(token=hf_token)
from datasets import Dataset
from transformers import AutoTokenizer
model_name = "meta-llama/Llama-2-7b-hf" # You can use a smaller one like "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Create dataset
data = {"text": [clean_text]}
dataset = Dataset.from_dict(data)
# Set a padding token manually
tokenizer.pad_token = tokenizer.eos_token # Use EOS as PAD token
# Alternatively, add a new custom pad token
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Tokenization function
def tokenize_function(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()  # For causal LM training, labels mirror the input IDs
    return tokens
tokenized_datasets = dataset.map(tokenize_function, batched=True)
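# Note: the dataset above holds the entire book as a single example, and the tokenizer
# truncates each example at 512 tokens, so only the very beginning of the text is ever
# seen during training.  A minimal sketch (not part of the original script; chunk sizes
# are illustrative) of splitting the text into overlapping chunks before building the dataset:
def chunk_text(text, chunk_size=2000, overlap=200):
    """Split a long string into overlapping character windows."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return chunks
# data = {"text": chunk_text(clean_text)}  # would replace the single-example dataset above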
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
# Load LLaMA 2 model in 4-bit mode to save memory
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,   # Use 4-bit quantization for efficiency
    device_map="auto",
    # device_map="cpu",  # uncomment to run on CPU instead
    quantization_config=None,
)
# Apply LoRA (efficient fine-tuning)
lora_config = LoraConfig(
    r=8,                                  # Low-rank dimension
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Apply LoRA only to the attention projections
    lora_dropout=0.05,
    task_type="CAUSAL_LM",                # Tell PEFT this is a causal language model
)
model = get_peft_model(model, lora_config)
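# Optional sanity check (not in the original script): a PEFT-wrapped model exposes
# print_trainable_parameters(), which shows that only the small LoRA adapters
# (a few million parameters) are trainable while the 7B base weights stay frozen.
model.print_trainable_parameters()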
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",        # Disable evaluation (to enable, change value to "epoch")
    learning_rate=2e-4,
    per_device_train_batch_size=1,   # Reduce batch size for memory efficiency
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)
trainer.train()
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")
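# Note: save_pretrained() on a PEFT model stores only the LoRA adapter weights and
# adapter config, not the full 7B base model.  A sketch (assuming a recent peft
# version; merging a 4-bit quantized model may require extra memory) of producing
# a standalone merged checkpoint instead:
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained("./fine_tuned_llama2_merged")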
import gradio as gr
from transformers import pipeline
chatbot = pipeline("text-generation", model="./fine_tuned_llama2")
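# Assumption: loading the adapter directory directly in pipeline() relies on the
# `peft` package being installed so that transformers can resolve the base model
# named in adapter_config.json; older versions may require loading the base model
# and attaching the adapter explicitly.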
def chatbot_response(prompt):
    result = chatbot(prompt, max_length=100, do_sample=True, temperature=0.7)
    return result[0]["generated_text"]
iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
iface.launch()