jonACE committed
Commit d352b02 · verified · 1 Parent(s): 8ddc99a

Update app.py

Files changed (1)
  1. app.py +12 -14
app.py CHANGED
@@ -1,5 +1,14 @@
 import fitz # PyMuPDF for PDF extraction
 import re
+import unsloth
+import os
+from huggingface_hub import login
+from datasets import Dataset
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from peft import LoraConfig, get_peft_model
+import gradio as gr
+from transformers import pipeline
+
 
 def extract_text_from_pdf(pdf_path):
     """Extract text from a PDF file"""
@@ -14,8 +23,6 @@ def preprocess_text(text):
 pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf")
 clean_text = preprocess_text(pdf_text)
 
-import os
-from huggingface_hub import login
 
 # Read the Hugging Face token from environment variables
 hf_token = os.getenv("access_token")
@@ -26,12 +33,8 @@ if hf_token is None:
 # Log in to Hugging Face
 login(token=hf_token)
 
-from datasets import Dataset
-from transformers import AutoTokenizer
-import unsloth
-
-
-model_name = "meta-llama/Llama-2-7b-hf" # You can use a smaller one like "meta-llama/Llama-2-7b-chat-hf"
+#model_name = "meta-llama/Llama-2-7b-hf" # You can use a smaller one like "meta-llama/Llama-2-7b-chat-hf"
+model_name = "unsloth/llama-2-7b-chat"
 
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
@@ -45,7 +48,6 @@ tokenizer.pad_token = tokenizer.eos_token # Use EOS as PAD token
 # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
 
 # Tokenization function
-@unsloth.optimize
 def tokenize_function(examples):
     tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
     tokens["labels"] = tokens["input_ids"].copy() # Use input as labels for text generation
@@ -53,9 +55,6 @@ def tokenize_function(examples):
 
 tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
-from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
-from peft import LoraConfig, get_peft_model
-
 # Load LLaMA 2 model in 4-bit mode to save memory
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
@@ -103,9 +102,8 @@ perform_training()
 model.save_pretrained("./fine_tuned_llama2")
 tokenizer.save_pretrained("./fine_tuned_llama2")
 
-import gradio as gr
-from transformers import pipeline
 
+# CHATBOT START
 chatbot = pipeline("text-generation", model="./fine_tuned_llama2")
 
 def chatbot_response(prompt):
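
The consolidated imports bring in datasets.Dataset, but the hunk that actually builds the `dataset` consumed by `dataset.map(tokenize_function, batched=True)` is elided from this diff. For orientation, a minimal sketch of one plausible construction, assuming the training examples are fixed-size chunks of `clean_text`; the `chunk_text` helper and the 1000-character chunk size are hypothetical, not taken from the commit:

from datasets import Dataset

def chunk_text(text, chunk_size=1000):
    # Hypothetical helper: split the cleaned PDF text into fixed-size
    # training examples; the real app.py may build its dataset differently.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

dataset = Dataset.from_dict({"text": chunk_text(clean_text)})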
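
Likewise, LoraConfig and get_peft_model are now imported at the top, but their use sits in an elided hunk below the 4-bit model load. A sketch of how that pairing typically looks; the quantization config and the rank, alpha, and target_modules values are assumptions, not read from the commit:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# Load the base model quantized to 4-bit to save GPU memory.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # assumed config
    device_map="auto",
)

# Wrap the frozen base model with small trainable LoRA adapters.
lora_config = LoraConfig(
    r=8,                                  # adapter rank (assumed)
    lora_alpha=16,                        # scaling factor (assumed)
    target_modules=["q_proj", "v_proj"],  # typical LLaMA attention projections
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapter weights are trainable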
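
Finally, the diff cuts off inside chatbot_response. A self-contained sketch of how a pipeline-backed Gradio chatbot along these lines is typically wired up; the generation parameters and the gr.Interface wiring are assumptions:

import gradio as gr
from transformers import pipeline

chatbot = pipeline("text-generation", model="./fine_tuned_llama2")

def chatbot_response(prompt):
    # Generate a continuation of the prompt; max_new_tokens is an assumed value.
    output = chatbot(prompt, max_new_tokens=200, do_sample=True)
    return output[0]["generated_text"]

# Assumed wiring: a simple text-in/text-out web UI around the pipeline.
demo = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
demo.launch()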