import fitz  # PyMuPDF
import re


def extract_text_from_pdf(pdf_path):
    """Extract the raw text from every page of a PDF file."""
    doc = fitz.open(pdf_path)
    text = "\n".join(page.get_text("text") for page in doc)
    doc.close()
    return text.strip()


def preprocess_text(text):
    """Collapse runs of whitespace into single spaces."""
    return re.sub(r"\s+", " ", text).strip()


pdf_text = extract_text_from_pdf("your_document.pdf")
clean_text = preprocess_text(pdf_text)
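# Quick visibility into how much text was recovered (the file name above is a
# placeholder; point it at a real document before running).
print(f"Extracted {len(clean_text)} characters from the PDF")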


from datasets import Dataset
from transformers import AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The Llama-2 tokenizer ships without a pad token; padding="max_length" below
# fails unless one is set, so reuse the EOS token as the pad token.
tokenizer.pad_token = tokenizer.eos_token

data = {"text": [clean_text]}
dataset = Dataset.from_dict(data)
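

# Note: a single long string yields only one (heavily truncated) training
# example. A minimal sketch of a more useful setup, assuming you want several
# examples per document; chunk_text and chunk_size are illustrative additions,
# not part of the original pipeline.
def chunk_text(text, chunk_size=400):
    """Split text into chunks of roughly chunk_size words."""
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# dataset = Dataset.from_dict({"text": chunk_text(clean_text)})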


def tokenize_function(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    # For causal language modeling the model predicts the input itself, so the
    # labels are simply a copy of the input ids.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


tokenized_datasets = dataset.map(tokenize_function, batched=True)


from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

# load_in_4bit requires the bitsandbytes package; newer transformers releases
# prefer passing a BitsAndBytesConfig via quantization_config instead.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
)
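
# Commonly recommended when fine-tuning on top of a 4-bit base: prepare the
# quantized model for k-bit training before attaching adapters (a sketch,
# assuming a peft release that provides this helper).
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)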

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",  # marks the adapter as a causal-LM adapter
)

model = get_peft_model(model, lora_config)
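# Optional sanity check: report how many parameters the LoRA adapters make trainable.
model.print_trainable_parameters()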

# No evaluation dataset is created above, so evaluation is left disabled here;
# the original evaluation_strategy="epoch" would make the Trainer error
# without an eval_dataset.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

trainer.train()

# save_pretrained on a PEFT model writes only the LoRA adapter weights and
# config, not the full base model.
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")


import gradio as gr
from transformers import pipeline

# The saved directory holds a LoRA adapter; with peft installed, recent
# transformers versions resolve the base model from the adapter config.
chatbot = pipeline("text-generation", model="./fine_tuned_llama2")


def chatbot_response(prompt):
    result = chatbot(prompt, max_length=100, do_sample=True, temperature=0.7)
    return result[0]["generated_text"]
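

# Quick smoke test before launching the UI (the prompt string is illustrative).
print(chatbot_response("Summarize the document in one sentence."))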

iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
iface.launch() |