jonACE commited on
Commit
02ad7ce
·
verified ·
1 Parent(s): 3f5fc84

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -0
app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import re
3
+ from datasets import Dataset
4
+ from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
5
+ import gradio as gr
6
+ from transformers import pipeline
7
+
8
+
9
def extract_text_from_pdf(pdf_path):
    """Extract the full plain text of a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        str: The concatenated text of every page, with a newline
        appended after each page's text.
    """
    # Use the Document as a context manager so the underlying file handle
    # is released even if extraction raises (the original never closed it).
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text("text") + "\n" for page in doc)
16
+
17
# Extract the corpus once at import time from a hard-coded PDF in the
# working directory; everything below operates on this single string.
pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf")
#print(pdf_text[:1000]) # Preview first 1000 characters
20
def preprocess_text(text):
    """Normalize whitespace in *text*.

    Collapses every run of whitespace (spaces, tabs, newlines) into a
    single space, then trims leading and trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
25
+
26
clean_text = preprocess_text(pdf_text)
#print(clean_text[:1000]) # Preview cleaned text

# Create a dataset from text.
# NOTE(review): a single unlabeled document cannot meaningfully fine-tune a
# binary classifier — this pipeline needs a real labeled corpus. The dummy
# label below at least lets Trainer compute a loss instead of crashing:
# the original dict had no "label" column, so trainer.train() raised when
# the model tried to compute its classification loss.
data = {"text": [clean_text], "label": [0]}  # single entry with a dummy label
dataset = Dataset.from_dict(data)

# Tokenize text
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of examples to fixed-length (512-token) inputs."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Binary classification head; adjust num_labels for a different task.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    # NOTE(review): evaluation reuses the training set — metrics will be
    # meaningless until a held-out split exists.
    eval_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

trainer.train()

# Persist weights and tokenizer to the same directory so the pipeline
# below can load both from one path.
model.save_pretrained("./distilbert-base-uncased-fine_tuned_model-NASB")
tokenizer.save_pretrained("./distilbert-base-uncased-fine_tuned_model-NASB")

classifier = pipeline("text-classification", model="./distilbert-base-uncased-fine_tuned_model-NASB")
71
+
72
def chatbot_response(text):
    """Run the fine-tuned classifier on *text* and return its raw prediction."""
    prediction = classifier(text)
    return prediction
74
+
75
# Minimal Gradio UI: one text box in, the classifier's prediction
# (a list of label/score dicts, stringified by Gradio) out.
iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
iface.launch()