# Hugging Face file-viewer header (page residue, not code):
# jonACE - "Update app.py" - commit 53daa68 (verified) - 2.63 kB
import fitz # PyMuPDF for PDF extraction
import re
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages, one page after another separated by a
        newline, with leading and trailing whitespace stripped.
    """
    # Open as a context manager so the document handle is released even if
    # text extraction raises part-way through the pages.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text.strip()
def preprocess_text(text):
    """Collapse each run of whitespace in *text* to one space and trim the ends."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()
# Load the source document and normalise its whitespace; `clean_text` is the
# text the dataset below is built from.
pdf_path = "your_document.pdf"
pdf_text = extract_text_from_pdf(pdf_path)
clean_text = preprocess_text(pdf_text)
from datasets import Dataset
from transformers import AutoTokenizer
# Base checkpoint to fine-tune. Note that "meta-llama/Llama-2-7b-chat-hf" is
# the chat-tuned variant of the same 7B size, not a smaller model.
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The LLaMA 2 tokenizer defines no padding token, yet tokenize_function pads
# to max_length; reuse EOS so that padding does not raise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Create dataset.
# Split the document into word windows instead of one giant example:
# tokenize_function truncates every example at 512 tokens, so a single-row
# dataset would silently drop everything after the first 512 tokens.
# 256 words per row is a rough heuristic meant to stay under that limit for
# English text -- TODO(review): tune for the actual corpus/language.
words_per_chunk = 256
words = clean_text.split()
chunks = [
    " ".join(words[start:start + words_per_chunk])
    for start in range(0, len(words), words_per_chunk)
]
# Keep the original single-row shape when the PDF yielded no text at all.
data = {"text": chunks or [clean_text]}
dataset = Dataset.from_dict(data)
# Tokenization function
def _mask_padding_labels(ids, mask):
    """Copy *ids*, replacing every position whose *mask* value is 0 with -100."""
    return [tok if keep else -100 for tok, keep in zip(ids, mask)]


def tokenize_function(examples):
    """Tokenize text and attach causal-LM labels with padding masked out.

    Args:
        examples: Mapping whose ``"text"`` entry holds either a list of
            strings (batched ``Dataset.map``) or a single string.

    Returns:
        The tokenizer output, each sequence truncated/padded to 512 tokens,
        with an added ``"labels"`` entry: a copy of ``input_ids`` in which
        padding positions are replaced by -100.
    """
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    input_ids = tokens["input_ids"]
    attention = tokens["attention_mask"]
    # Use the input as labels for text generation, but set padding positions
    # to -100 (the label value the loss ignores) so the model is not trained
    # to predict pad tokens. Masking follows attention_mask rather than the
    # token id because the pad token may share its id with EOS.
    if isinstance(examples["text"], str):
        # Un-batched call: the tokenizer returns one flat list of ids.
        tokens["labels"] = _mask_padding_labels(input_ids, attention)
    else:
        tokens["labels"] = [
            _mask_padding_labels(ids, mask)
            for ids, mask in zip(input_ids, attention)
        ]
    return tokens
# Run tokenize_function over the dataset, one batch of rows per call.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig

# Load LLaMA 2 model in 4-bit mode to save memory.
# Quantization settings are passed through `quantization_config`; handing
# `load_in_4bit=True` directly to `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # Use 4-bit quantization for efficiency
    device_map="auto",
)
# Apply LoRA (efficient fine-tuning)
lora_config = LoraConfig(
    r=8,  # Low-rank parameter
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Applies only to attention layers
    lora_dropout=0.05,
    # Declare the task so PEFT wraps the model in its causal-LM class instead
    # of the generic wrapper.
    # NOTE(review): with the generic wrapper, some transformers versions are
    # reported to drop every dataset column in Trainer -- confirm against the
    # pinned versions.
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
# Hyper-parameters for the fine-tuning run.
# No evaluation strategy is requested: the Trainer in this script receives no
# eval_dataset, so asking for per-epoch evaluation would raise.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=1,  # Reduce batch size for memory efficiency
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)
# Fine-tune, then write the model and tokenizer to the same directory.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets,
)
trainer.train()

save_dir = "./fine_tuned_llama2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
import gradio as gr
from transformers import pipeline
# Text-generation pipeline loaded from the directory the training step saved to.
# NOTE(review): this loads the model again from disk while the training
# `model` object is still referenced -- confirm there is memory headroom.
chatbot = pipeline(task="text-generation", model="./fine_tuned_llama2")
def chatbot_response(prompt):
    """Generate a sampled continuation of *prompt* with the fine-tuned model.

    Args:
        prompt: Text entered by the user.

    Returns:
        The ``"generated_text"`` field of the first pipeline result, or an
        empty string when *prompt* is empty or only whitespace.
    """
    # Nothing to continue: skip the model call entirely.
    if not prompt or not prompt.strip():
        return ""
    # Budget only the newly generated tokens. `max_length` also counts the
    # prompt, so prompts near or above 100 tokens left no room for a reply.
    result = chatbot(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)
    return result[0]["generated_text"]
# Expose chatbot_response through a minimal text-in / text-out Gradio UI.
iface = gr.Interface(
    inputs="text",
    outputs="text",
    fn=chatbot_response,
)
iface.launch()