jonACE committed
Commit 53daa68 · verified · 1 Parent(s): a183e25

Update app.py

Files changed (1)
  1. app.py +48 -35
app.py CHANGED
@@ -1,78 +1,91 @@
- import fitz  # PyMuPDF
  import re
- from datasets import Dataset
- from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
- import gradio as gr
- from transformers import pipeline
-

  def extract_text_from_pdf(pdf_path):
      """Extract text from a PDF file"""
      doc = fitz.open(pdf_path)
-     text = ""
-     for page in doc:
-         text += page.get_text("text") + "\n"
-     return text
-
- pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf")
- #print(pdf_text[:1000])  # Preview first 1000 characters

  def preprocess_text(text):
-     """Clean and preprocess text"""
-     text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
-     text = text.strip()
-     return text

  clean_text = preprocess_text(pdf_text)
- #print(clean_text[:1000])  # Preview cleaned text

- # Create a dataset from text
- data = {"text": [clean_text]}  # Single text entry
- dataset = Dataset.from_dict(data)
-
- # Tokenize text
  from transformers import AutoTokenizer

- model_name = "distilbert-base-uncased"
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  def tokenize_function(examples):
      tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
-     tokens["labels"] = tokens["input_ids"].copy()  # Use input as labels for unsupervised learning
      return tokens

  tokenized_datasets = dataset.map(tokenize_function, batched=True)

- model = AutoModelForCausalLM.from_pretrained(model_name)  # Adjust for task

  training_args = TrainingArguments(
      output_dir="./results",
      evaluation_strategy="epoch",
-     learning_rate=2e-5,
-     per_device_train_batch_size=8,
-     per_device_eval_batch_size=8,
      num_train_epochs=3,
      weight_decay=0.01,
      save_strategy="epoch",
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_datasets,
-     eval_dataset=tokenized_datasets,
      tokenizer=tokenizer,
  )

  trainer.train()

- model.save_pretrained("./distilbert-base-uncased-fine_tuned_model-NASB")
- tokenizer.save_pretrained("./distilbert-base-uncased-fine_tuned_model-NASB")

- classifier = pipeline("text-classification", model="./distilbert-base-uncased-fine_tuned_model-NASB")

- def chatbot_response(text):
-     return classifier(text)

  iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
  iface.launch()
+ import fitz  # PyMuPDF for PDF extraction
  import re

  def extract_text_from_pdf(pdf_path):
      """Extract text from a PDF file"""
      doc = fitz.open(pdf_path)
+     text = "\n".join([page.get_text("text") for page in doc])
+     return text.strip()

  def preprocess_text(text):
+     """Basic text preprocessing"""
+     return re.sub(r"\s+", " ", text).strip()

+ pdf_text = extract_text_from_pdf("your_document.pdf")
  clean_text = preprocess_text(pdf_text)

+ from datasets import Dataset
  from transformers import AutoTokenizer

+ model_name = "meta-llama/Llama-2-7b-hf"  # You can use a smaller one like "meta-llama/Llama-2-7b-chat-hf"
+
  tokenizer = AutoTokenizer.from_pretrained(model_name)

+ # Create dataset
+ data = {"text": [clean_text]}
+ dataset = Dataset.from_dict(data)
+
+ # Tokenization function
  def tokenize_function(examples):
      tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
+     tokens["labels"] = tokens["input_ids"].copy()  # Use input as labels for text generation
      return tokens

  tokenized_datasets = dataset.map(tokenize_function, batched=True)

+ from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
+ from peft import LoraConfig, get_peft_model
+
+ # Load LLaMA 2 model in 4-bit mode to save memory
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     load_in_4bit=True,  # Use 4-bit quantization for efficiency
+     device_map="auto"
+ )
+
+ # Apply LoRA (efficient fine-tuning)
+ lora_config = LoraConfig(
+     r=8,  # Low-rank parameter
+     lora_alpha=32,
+     target_modules=["q_proj", "v_proj"],  # Applies only to attention layers
+     lora_dropout=0.05
+ )
+
+ model = get_peft_model(model, lora_config)

  training_args = TrainingArguments(
      output_dir="./results",
      evaluation_strategy="epoch",
+     learning_rate=2e-4,
+     per_device_train_batch_size=1,  # Reduce batch size for memory efficiency
+     per_device_eval_batch_size=1,
      num_train_epochs=3,
      weight_decay=0.01,
      save_strategy="epoch",
+     logging_dir="./logs",
+     logging_steps=10,
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_datasets,
      tokenizer=tokenizer,
  )

  trainer.train()

+ model.save_pretrained("./fine_tuned_llama2")
+ tokenizer.save_pretrained("./fine_tuned_llama2")
+
+ import gradio as gr
+ from transformers import pipeline

+ chatbot = pipeline("text-generation", model="./fine_tuned_llama2")

+ def chatbot_response(prompt):
+     result = chatbot(prompt, max_length=100, do_sample=True, temperature=0.7)
+     return result[0]["generated_text"]

  iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
  iface.launch()
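
A minimal sketch (illustration only, not part of this commit) of how the adapter saved to ./fine_tuned_llama2 could be reloaded for inference: save_pretrained on a PEFT-wrapped model stores the LoRA adapter weights rather than a full standalone checkpoint, so one would typically load the meta-llama/Llama-2-7b-hf base model first and attach the adapter with peft.PeftModel.

# Sketch: reload the saved LoRA adapter on top of the base model
# (assumes the same base model as above and that peft is installed).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",   # same base model used for fine-tuning
    device_map="auto",
    torch_dtype=torch.float16,
)
model = PeftModel.from_pretrained(base, "./fine_tuned_llama2")   # attach the LoRA adapter
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_llama2")

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
print(tokenizer.decode(output[0], skip_special_tokens=True))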