Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,14 @@
|
|
1 |
import fitz # PyMuPDF for PDF extraction
|
2 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
def extract_text_from_pdf(pdf_path):
|
5 |
"""Extract text from a PDF file"""
|
@@ -14,8 +23,6 @@ def preprocess_text(text):
|
|
14 |
pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf")
|
15 |
clean_text = preprocess_text(pdf_text)
|
16 |
|
17 |
-
import os
|
18 |
-
from huggingface_hub import login
|
19 |
|
20 |
# Read the Hugging Face token from environment variables
|
21 |
hf_token = os.getenv("access_token")
|
@@ -26,12 +33,8 @@ if hf_token is None:
|
|
26 |
# Log in to Hugging Face
|
27 |
login(token=hf_token)
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
import unsloth
|
32 |
-
|
33 |
-
|
34 |
-
model_name = "meta-llama/Llama-2-7b-hf" # Base 7B model; the chat-tuned variant of the same size is "meta-llama/Llama-2-7b-chat-hf"
|
35 |
|
36 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
37 |
|
@@ -45,7 +48,6 @@ tokenizer.pad_token = tokenizer.eos_token # Use EOS as PAD token
|
|
45 |
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
|
46 |
|
47 |
# Tokenization function
|
48 |
-
@unsloth.optimize
|
49 |
def tokenize_function(examples):
|
50 |
tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
|
51 |
tokens["labels"] = tokens["input_ids"].copy() # Use input as labels for text generation
|
@@ -53,9 +55,6 @@ def tokenize_function(examples):
|
|
53 |
|
54 |
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
55 |
|
56 |
-
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
|
57 |
-
from peft import LoraConfig, get_peft_model
|
58 |
-
|
59 |
# Load LLaMA 2 model in 4-bit mode to save memory
|
60 |
model = AutoModelForCausalLM.from_pretrained(
|
61 |
model_name,
|
@@ -103,9 +102,8 @@ perform_training()
|
|
103 |
model.save_pretrained("./fine_tuned_llama2")
|
104 |
tokenizer.save_pretrained("./fine_tuned_llama2")
|
105 |
|
106 |
-
import gradio as gr
|
107 |
-
from transformers import pipeline
|
108 |
|
|
|
109 |
chatbot = pipeline("text-generation", model="./fine_tuned_llama2")
|
110 |
|
111 |
def chatbot_response(prompt):
|
|
|
1 |
import fitz # PyMuPDF for PDF extraction
|
2 |
import re
|
3 |
+
import unsloth
|
4 |
+
import os
|
5 |
+
from huggingface_hub import login
|
6 |
+
from datasets import Dataset
|
7 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
|
8 |
+
from peft import LoraConfig, get_peft_model
|
9 |
+
import gradio as gr
|
10 |
+
from transformers import pipeline
|
11 |
+
|
12 |
|
13 |
def extract_text_from_pdf(pdf_path):
|
14 |
"""Extract text from a PDF file"""
|
|
|
23 |
pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf")
|
24 |
clean_text = preprocess_text(pdf_text)
|
25 |
|
|
|
|
|
26 |
|
27 |
# Read the Hugging Face token from environment variables
|
28 |
hf_token = os.getenv("access_token")
|
|
|
33 |
# Log in to Hugging Face
|
34 |
login(token=hf_token)
|
35 |
|
36 |
+
#model_name = "meta-llama/Llama-2-7b-hf" # Base 7B model; the chat-tuned variant of the same size is "meta-llama/Llama-2-7b-chat-hf"
|
37 |
+
model_name = "unsloth/llama-2-7b-chat"
|
|
|
|
|
|
|
|
|
38 |
|
39 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
40 |
|
|
|
48 |
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
|
49 |
|
50 |
# Tokenization function
|
|
|
51 |
def tokenize_function(examples):
|
52 |
tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
|
53 |
tokens["labels"] = tokens["input_ids"].copy() # Use input as labels for text generation
|
|
|
55 |
|
56 |
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
57 |
|
|
|
|
|
|
|
58 |
# Load LLaMA 2 model in 4-bit mode to save memory
|
59 |
model = AutoModelForCausalLM.from_pretrained(
|
60 |
model_name,
|
|
|
102 |
model.save_pretrained("./fine_tuned_llama2")
|
103 |
tokenizer.save_pretrained("./fine_tuned_llama2")
|
104 |
|
|
|
|
|
105 |
|
106 |
+
# CHATBOT START
|
107 |
chatbot = pipeline("text-generation", model="./fine_tuned_llama2")
|
108 |
|
109 |
def chatbot_response(prompt):
|