import fitz  # PyMuPDF
import re


def extract_text_from_pdf(pdf_path):
    """Extract the raw text from every page of a PDF file."""
    doc = fitz.open(pdf_path)
    text = "\n".join(page.get_text("text") for page in doc)
    doc.close()
    return text.strip()


def preprocess_text(text):
    """Collapse runs of whitespace into single spaces."""
    return re.sub(r"\s+", " ", text).strip()


pdf_text = extract_text_from_pdf("your_document.pdf")
clean_text = preprocess_text(pdf_text)
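# Quick visibility into how much text was recovered (the file name above is a
# placeholder; point it at a real document before running).
print(f"Extracted {len(clean_text)} characters from the PDF")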


from datasets import Dataset
from transformers import AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The Llama-2 tokenizer ships without a pad token; padding="max_length" below
# fails unless one is set, so reuse the EOS token as the pad token.
tokenizer.pad_token = tokenizer.eos_token

data = {"text": [clean_text]}
dataset = Dataset.from_dict(data)
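

# Note: a single long string yields only one (heavily truncated) training
# example. A minimal sketch of a more useful setup, assuming you want several
# examples per document; chunk_text and chunk_size are illustrative additions,
# not part of the original pipeline.
def chunk_text(text, chunk_size=400):
    """Split text into chunks of roughly chunk_size words."""
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# dataset = Dataset.from_dict({"text": chunk_text(clean_text)})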


def tokenize_function(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    # For causal language modeling the model predicts the input itself, so the
    # labels are simply a copy of the input ids.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


tokenized_datasets = dataset.map(tokenize_function, batched=True)


from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

# load_in_4bit requires the bitsandbytes package; newer transformers releases
# prefer passing a BitsAndBytesConfig via quantization_config instead.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
)
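
# Commonly recommended when fine-tuning on top of a 4-bit base: prepare the
# quantized model for k-bit training before attaching adapters (a sketch,
# assuming a peft release that provides this helper).
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)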

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",  # marks the adapter as a causal-LM adapter
)

model = get_peft_model(model, lora_config)
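# Optional sanity check: report how many parameters the LoRA adapters make trainable.
model.print_trainable_parameters()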

# No evaluation dataset is created above, so evaluation is left disabled here;
# the original evaluation_strategy="epoch" would make the Trainer error
# without an eval_dataset.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

trainer.train()

# save_pretrained on a PEFT model writes only the LoRA adapter weights and
# config, not the full base model.
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")


import gradio as gr
from transformers import pipeline

# The saved directory holds a LoRA adapter; with peft installed, recent
# transformers versions resolve the base model from the adapter config.
chatbot = pipeline("text-generation", model="./fine_tuned_llama2")


def chatbot_response(prompt):
    result = chatbot(prompt, max_length=100, do_sample=True, temperature=0.7)
    return result[0]["generated_text"]
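

# Quick smoke test before launching the UI (the prompt string is illustrative).
print(chatbot_response("Summarize the document in one sentence."))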

iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
iface.launch() |