erikbeltran committed
Commit ed1e9c8 · verified · 1 Parent(s): 013087b

Update app.py

Files changed (1)
  1. app.py +11 -6
app.py CHANGED
@@ -1,13 +1,16 @@
- import gradio as gr
  import spaces
- from transformers import AutoTokenizer, AutoModelForCausalLM
+ import gradio as gr
+ from transformers import LlamaTokenizer, AutoModelForCausalLM
  import torch
+ from threading import Thread
+ from transformers import TextIteratorStreamer

  # Initialize model and tokenizer
  MODEL_ID = "erikbeltran/pydiff"
  GGUF_FILE = "unsloth.Q4_K_M.gguf"

- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, gguf_file=GGUF_FILE)
+ # Use LlamaTokenizer directly instead of AutoTokenizer
+ tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)
  model = AutoModelForCausalLM.from_pretrained(MODEL_ID, gguf_file=GGUF_FILE)

  # Move model to GPU if available
@@ -33,24 +36,26 @@ def create_prompt(request, file_content, system_message):
  <file>
  {file_content}
  </file>"""
-
+
  @spaces.GPU
  def respond(request, file_content, system_message, max_tokens, temperature, top_p):
      prompt = create_prompt(request, file_content, system_message)

      # Tokenize input
-     inputs = tokenizer(prompt, return_tensors="pt").to(device)
+     inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(device)

      # Generate response with streaming
      response = ""
      streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

      generation_kwargs = dict(
-         inputs=inputs["input_ids"],
+         input_ids=inputs["input_ids"],
          max_new_tokens=max_tokens,
          temperature=temperature,
          top_p=top_p,
          streamer=streamer,
+         pad_token_id=tokenizer.pad_token_id,
+         eos_token_id=tokenizer.eos_token_id,
      )

      # Start generation in a separate thread
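
The second hunk ends at the "# Start generation in a separate thread" comment, so the rest of respond is not visible in this commit view. For orientation, here is a minimal sketch of how the Thread/TextIteratorStreamer pattern set up above is typically completed; the variable name thread and the yield-based streaming loop are assumptions, not code taken from app.py.

# Assumed continuation (not part of this commit): the standard transformers streaming loop.
# model.generate runs in a background thread and pushes decoded text into the streamer.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# Yield the growing response to the Gradio UI as each decoded chunk arrives.
for new_text in streamer:
    response += new_text
    yield response

thread.join()

One caveat worth noting: Llama-family tokenizers often ship without a padding token, in which case tokenizer.pad_token_id is None; a common workaround is to set tokenizer.pad_token = tokenizer.eos_token before calling generate.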