erikbeltran committed on
Commit c8ab0ef · verified · 1 Parent(s): ed1e9c8

Update app.py

Files changed (1)
  1. app.py +63 -35
app.py CHANGED
@@ -1,21 +1,37 @@
 import spaces
 import gradio as gr
-from transformers import LlamaTokenizer, AutoModelForCausalLM
+from transformers import PreTrainedTokenizerFast, AutoModelForCausalLM
 import torch
 from threading import Thread
 from transformers import TextIteratorStreamer
+import os
 
 # Initialize model and tokenizer
 MODEL_ID = "erikbeltran/pydiff"
 GGUF_FILE = "unsloth.Q4_K_M.gguf"
 
-# Use LlamaTokenizer directly instead of AutoTokenizer
-tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, gguf_file=GGUF_FILE)
+try:
+    # Use PreTrainedTokenizerFast instead of LlamaTokenizer
+    tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_ID)
+
+    # Ensure the tokenizer has the necessary special tokens
+    special_tokens = {
+        'pad_token': '[PAD]',
+        'eos_token': '</s>',
+        'bos_token': '<s>',
+        'unk_token': '<unk>'
+    }
+    tokenizer.add_special_tokens(special_tokens)
+
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, gguf_file=GGUF_FILE)
 
-# Move model to GPU if available
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = model.to(device)
+    # Move model to GPU if available
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = model.to(device)
+
+except Exception as e:
+    print(f"Error initializing model or tokenizer: {str(e)}")
+    raise
 
 def format_diff_response(response):
     """Format the response to look like a diff output"""
 
@@ -36,36 +52,48 @@ def create_prompt(request, file_content, system_message):
 <file>
 {file_content}
 </file>"""
-
+
 @spaces.GPU
 def respond(request, file_content, system_message, max_tokens, temperature, top_p):
-    prompt = create_prompt(request, file_content, system_message)
-
-    # Tokenize input
-    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(device)
-
-    # Generate response with streaming
-    response = ""
-    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-
-    generation_kwargs = dict(
-        input_ids=inputs["input_ids"],
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        streamer=streamer,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-    )
-
-    # Start generation in a separate thread
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    # Yield formatted responses as they're generated
-    for new_text in streamer:
-        response += new_text
-        yield format_diff_response(response)
+    try:
+        prompt = create_prompt(request, file_content, system_message)
+
+        # Tokenize input
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+            add_special_tokens=True,
+            padding=True,
+            truncation=True,
+            max_length=2048
+        ).to(device)
+
+        # Generate response with streaming
+        response = ""
+        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+        generation_kwargs = dict(
+            input_ids=inputs["input_ids"],
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            streamer=streamer,
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            do_sample=True,
+        )
+
+        # Start generation in a separate thread
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        # Yield formatted responses as they're generated
+        for new_text in streamer:
+            response += new_text
+            yield format_diff_response(response)
+
+    except Exception as e:
+        yield f"<span style='color: red'>Error generating response: {str(e)}</span>"
 
 # Create the Gradio interface
 with gr.Blocks() as demo:
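The second hunk wraps respond() in try/except and streams tokens by running model.generate() in a worker thread while iterating a TextIteratorStreamer. The sketch below exercises the same pattern outside Gradio; it assumes the tokenizer, model, and device objects created above, and the prompt string is only a stand-in for create_prompt():

from threading import Thread
from transformers import TextIteratorStreamer

prompt = "example request"  # stand-in for create_prompt(request, file_content, system_message)
inputs = tokenizer(prompt, return_tensors="pt").to(device)

streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
thread = Thread(
    target=model.generate,
    kwargs=dict(
        input_ids=inputs["input_ids"],
        max_new_tokens=64,
        do_sample=True,
        streamer=streamer,
    ),
)
thread.start()

# generate() pushes decoded text into the streamer as it samples, so this
# loop sees chunks immediately; the Space yields them to the UI instead.
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()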