"""Fine-tune a small causal language model on text extracted from a PDF,
then serve it as a simple Gradio text-generation chatbot.

Pipeline: PDF -> plain text -> cleaned/chunked dataset -> tokenize ->
causal-LM fine-tuning with the HF Trainer -> save -> Gradio UI.
"""

import re

import fitz  # PyMuPDF
import gradio as gr
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)

# DistilBERT is an encoder-only (masked-LM) architecture and cannot be loaded
# with AutoModelForCausalLM (it raises at from_pretrained). distilgpt2 is a
# comparably small decoder-only model that matches the causal-LM objective
# used below (labels = input_ids).
MODEL_NAME = "distilgpt2"
PDF_PATH = "new-american-standard-bible.pdf"
OUTPUT_DIR = "./distilgpt2-fine_tuned_model-NASB"
MAX_LENGTH = 512


def extract_text_from_pdf(pdf_path):
    """Extract plain text from every page of a PDF file.

    Args:
        pdf_path: Path to the PDF on disk.

    Returns:
        The concatenated text of all pages, one page per line group.
    """
    doc = fitz.open(pdf_path)
    try:
        return "\n".join(page.get_text("text") for page in doc)
    finally:
        # The original never closed the document; release it explicitly.
        doc.close()


def preprocess_text(text):
    """Collapse all runs of whitespace into single spaces and trim the ends."""
    return re.sub(r"\s+", " ", text).strip()


def chunk_text(text, chunk_size=2000):
    """Split *text* into fixed-size character chunks.

    The original dataset was a single entry truncated to MAX_LENGTH tokens,
    which silently discarded almost the entire document; chunking keeps every
    part of the text as a separate training example.
    """
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)] or [""]


def main():
    """Run the full extract -> train -> serve pipeline."""
    raw_text = extract_text_from_pdf(PDF_PATH)
    clean_text = preprocess_text(raw_text)

    dataset = Dataset.from_dict({"text": chunk_text(clean_text)})

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # GPT-2-style tokenizers ship without a pad token; padding="max_length"
    # below would fail unless one is assigned. Reusing EOS is the standard fix.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH,
        )
        # Causal-LM training: labels are a copy of the inputs; the model
        # shifts them internally to predict the next token.
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_datasets = dataset.map(
        tokenize_function, batched=True, remove_columns=["text"]
    )

    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="epoch",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets,
        # NOTE(review): evaluating on the training set measures memorization,
        # not generalization — a held-out split would be better.
        eval_dataset=tokenized_datasets,
        tokenizer=tokenizer,
    )
    trainer.train()

    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    # A causal LM generates text. The original "text-classification" pipeline
    # would try to load a nonexistent sequence-classification head from this
    # checkpoint, and its list-of-dicts output doesn't match the Gradio
    # outputs="text" contract anyway.
    generator = pipeline("text-generation", model=OUTPUT_DIR)

    def chatbot_response(text):
        """Return the model's continuation of *text* as a plain string."""
        outputs = generator(text, max_new_tokens=100, num_return_sequences=1)
        return outputs[0]["generated_text"]

    iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
    iface.launch()


if __name__ == "__main__":
    main()