|
import re

import fitz
import gradio as gr
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF file.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The page texts concatenated in order, each followed by a
        newline (identical to the original output, including the
        trailing newline after the last page).
    """
    # The context manager guarantees the document handle is closed even if
    # extraction raises — the original left the file open (resource leak).
    with fitz.open(pdf_path) as doc:
        # Single join() instead of repeated `+=`, which is quadratic
        # across a book-length document.
        return "".join(page.get_text("text") + "\n" for page in doc)
|
|
|
pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf") |
|
|
|
|
|
def preprocess_text(text):
    """Collapse every run of whitespace to one space and trim both ends.

    Parameters
    ----------
    text : str
        Raw extracted text, possibly containing newlines/tabs.

    Returns
    -------
    str
        Single-spaced, stripped copy of the input.
    """
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
|
|
|
clean_text = preprocess_text(pdf_text) |
|
|
|
|
|
|
|
# Wrap the whole cleaned corpus as a single-row dataset.
# NOTE(review): this produces exactly ONE training example containing the
# entire book; tokenization later truncates it to 512 tokens, so nearly all
# of the text is discarded — consider chunking into many rows. TODO confirm.
data = {"text": [clean_text]}

dataset = Dataset.from_dict(data)
|
|
|
|
|
# Tokenizer matching the base checkpoint. `AutoTokenizer` is imported at the
# top of the file — PEP 8 places all imports at module top, not mid-script.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
|
def tokenize_function(examples):
    """Tokenize a batch of examples for language-model fine-tuning.

    Each text is padded/truncated to 512 tokens and the labels are set
    to a copy of the input ids (self-supervised LM objective).
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded
|
|
|
# Apply the tokenizer over the (single-row) dataset in batched mode.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# NOTE(review): distilbert-base-uncased is a masked-LM checkpoint; it has no
# causal-LM head, so AutoModelForCausalLM is expected to either raise or
# attach a randomly initialised head — confirm the intended objective
# (AutoModelForMaskedLM may be what was meant).
model = AutoModelForCausalLM.from_pretrained(model_name)
|
|
|
# Trainer hyper-parameters. Checkpoints and eval both run once per epoch.
# NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in recent
# transformers releases — confirm the pinned version still accepts this name.
training_args = TrainingArguments(

    output_dir="./results",

    evaluation_strategy="epoch",

    learning_rate=2e-5,

    per_device_train_batch_size=8,

    per_device_eval_batch_size=8,

    num_train_epochs=3,

    weight_decay=0.01,

    save_strategy="epoch",

)
|
|
|
# NOTE(review): eval_dataset is the same object as train_dataset, so the
# reported eval metrics are measured on training data — hold out a separate
# split to measure generalisation.
trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=tokenized_datasets,

    eval_dataset=tokenized_datasets,

    tokenizer=tokenizer,

)

# Run the full fine-tuning loop (blocks until all epochs complete).
trainer.train()
|
|
|
# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be reloaded as a self-contained checkpoint.
model.save_pretrained("./distilbert-base-uncased-fine_tuned_model-NASB")

tokenizer.save_pretrained("./distilbert-base-uncased-fine_tuned_model-NASB")



# NOTE(review): the model above was fine-tuned with a language-modelling
# objective, but here it is loaded as a "text-classification" pipeline —
# task/head mismatch; confirm which pipeline task is actually intended
# (e.g. "text-generation" for a chatbot).
classifier = pipeline("text-classification", model="./distilbert-base-uncased-fine_tuned_model-NASB")
|
|
|
def chatbot_response(text):
    """Run the fine-tuned pipeline on one user message and return its raw output."""
    prediction = classifier(text)
    return prediction
|
|
|
# Minimal text-in/text-out UI. Gradio stringifies the pipeline's
# list-of-dicts return value for the "text" output component.
iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")

# Blocks and serves the app on the default local port.
iface.launch()