import fitz # PyMuPDF for PDF extraction
import re
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text.strip()

def preprocess_text(text):
    """Collapse runs of whitespace into single spaces."""
    return re.sub(r"\s+", " ", text).strip()

pdf_text = extract_text_from_pdf("your_document.pdf")
clean_text = preprocess_text(pdf_text)
from datasets import Dataset
from transformers import AutoTokenizer
model_name = "meta-llama/Llama-2-7b-hf"  # or the chat-tuned variant "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA has no pad token; reuse EOS so padding="max_length" works
# Create dataset
data = {"text": [clean_text]}
dataset = Dataset.from_dict(data)
# Tokenization function
def tokenize_function(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()  # causal LM: use the input ids as labels
    return tokens
tokenized_datasets = dataset.map(tokenize_function, batched=True)
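
# Note: this produces a single training example truncated to the first 512 tokens.
# For anything beyond a very short PDF, you would normally split clean_text into
# many 512-token chunks so the whole document contributes to fine-tuning.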
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# Load LLaMA 2 in 4-bit (via bitsandbytes) to save GPU memory
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # 4-bit quantization for efficiency
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)  # prepare the quantized model for adapter training
# Apply LoRA (parameter-efficient fine-tuning)
lora_config = LoraConfig(
    r=8,  # rank of the low-rank update matrices
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # applies only to the attention projections
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
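# Optional: confirm that only a small fraction of the weights is trainable
model.print_trainable_parameters()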
# No eval split exists for this single-document dataset, so evaluation is left disabled
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=1,  # small batch size for memory efficiency
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)
trainer.train()
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")
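
# Note: with PEFT, save_pretrained stores only the LoRA adapter weights (plus the
# tokenizer files); the base LLaMA 2 checkpoint is still needed to run the model.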
import gradio as gr
from transformers import pipeline
chatbot = pipeline("text-generation", model="./fine_tuned_llama2")
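# Loading the adapter directory relies on transformers' PEFT integration, which reads
# base_model_name_or_path from adapter_config.json and fetches the base LLaMA 2 weights
# (access to the gated meta-llama repo is required).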
def chatbot_response(prompt):
    result = chatbot(prompt, max_length=100, do_sample=True, temperature=0.7)
    return result[0]["generated_text"]
iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
iface.launch()
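# launch() starts a local Gradio web UI (http://127.0.0.1:7860 by default);
# pass share=True for a temporary public link.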