dragonjump committed on
Commit 9559a52
1 Parent(s): 8f52987
Files changed (2)
  1. main.py +55 -21
  2. main.py.old +78 -0
main.py CHANGED
@@ -1,34 +1,56 @@
  from fastapi import FastAPI, Query
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+ from transformers import (
+     AutoProcessor,
+     AutoModelForCausalLM,
+     AutoTokenizer,
+ )
  from qwen_vl_utils import process_vision_info
  import torch
+ import logging
+
+ logging.basicConfig(level=logging.INFO)

  app = FastAPI()

- checkpoint = "Qwen/Qwen2.5-VL-3B-Instruct"
- min_pixels = 256*28*28
- max_pixels = 1280*28*28
+ # Qwen2.5-VL Model Setup
+ qwen_checkpoint = "Qwen/Qwen2.5-VL-3B-Instruct"
+ min_pixels = 256 * 28 * 28
+ max_pixels = 1280 * 28 * 28
+
  processor = AutoProcessor.from_pretrained(
-     checkpoint,
+     qwen_checkpoint,
      min_pixels=min_pixels,
-     max_pixels=max_pixels
+     max_pixels=max_pixels,
  )
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     checkpoint,
+
+ qwen_model = AutoModelForCausalLM.from_pretrained(
+     qwen_checkpoint,
      torch_dtype=torch.bfloat16,
      device_map="auto",
-     # attn_implementation="flash_attention_2",
+ )
+
+ # LLaMA Model Setup
+ llama_model_name = "path/to/llama-uncensored-model"
+ llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
+ llama_model = AutoModelForCausalLM.from_pretrained(
+     llama_model_name, torch_dtype=torch.float16, device_map="auto"
  )

  @app.get("/")
  def read_root():
-     return {"message": "API is live. Use the /predict endpoint."}
+     return {"message": "API is live. Use the /predict, /chat, or /llama_chat endpoints."}

  @app.get("/predict")
  def predict(image_url: str = Query(...), prompt: str = Query(...)):
      messages = [
          {"role": "system", "content": "You are a helpful assistant with vision abilities."},
-         {"role": "user", "content": [{"type": "image", "image": image_url}, {"type": "text", "text": prompt}]},
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image_url},
+                 {"type": "text", "text": prompt},
+             ],
+         },
      ]
      text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
      image_inputs, video_inputs = process_vision_info(messages)
@@ -38,31 +60,43 @@ def predict(image_url: str = Query(...), prompt: str = Query(...)):
          videos=video_inputs,
          padding=True,
          return_tensors="pt",
-     ).to(model.device)
+     ).to(qwen_model.device)
      with torch.no_grad():
-         generated_ids = model.generate(**inputs, max_new_tokens=128)
-     generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+         generated_ids = qwen_model.generate(**inputs, max_new_tokens=128)
+     generated_ids_trimmed = [
+         out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
+     ]
      output_texts = processor.batch_decode(
          generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
      )
      return {"response": output_texts[0]}

  @app.get("/chat")
- def chat( prompt: str = Query(...)):
+ def chat(prompt: str = Query(...)):
      messages = [
-         {"role": "system", "content": "You are a helpful assistant with vision abilities."},
-         {"role": "user", "content": [ {"type": "text", "text": prompt}]},
+         {"role": "system", "content": "You are a helpful assistant."},
+         {"role": "user", "content": [{"type": "text", "text": prompt}]},
      ]
      text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
      inputs = processor(
-         text=[text],
+         text=[text],
          padding=True,
          return_tensors="pt",
-     ).to(model.device)
+     ).to(qwen_model.device)
      with torch.no_grad():
-         generated_ids = model.generate(**inputs, max_new_tokens=128)
-     generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+         generated_ids = qwen_model.generate(**inputs, max_new_tokens=128)
+     generated_ids_trimmed = [
+         out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
+     ]
      output_texts = processor.batch_decode(
          generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
      )
      return {"response": output_texts[0]}
+
+ @app.get("/llama_chat")
+ def llama_chat(prompt: str = Query(...)):
+     inputs = llama_tokenizer(prompt, return_tensors="pt").to(llama_model.device)
+     with torch.no_grad():
+         outputs = llama_model.generate(**inputs, max_new_tokens=128)
+     response = llama_tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return {"response": response}
main.py.old ADDED
@@ -0,0 +1,78 @@
+ from fastapi import FastAPI, Query
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ import torch
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+
+ try:
+     # Code that may raise an exception
+     x = 1 / 0
+ except ZeroDivisionError as e:
+     logging.error("Error occurred: %s", e)
+     # Take alternative action to recover from the exception
+
+ app = FastAPI()
+
+ checkpoint = "Qwen/Qwen2.5-VL-3B-Instruct"
+ min_pixels = 256*28*28
+ max_pixels = 1280*28*28
+ processor = AutoProcessor.from_pretrained(
+     checkpoint,
+     min_pixels=min_pixels,
+     max_pixels=max_pixels
+ )
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     checkpoint,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     # attn_implementation="flash_attention_2",
+ )
+
+ @app.get("/")
+ def read_root():
+     return {"message": "API is live. Use the /predict endpoint."}
+
+ @app.get("/predict")
+ def predict(image_url: str = Query(...), prompt: str = Query(...)):
+     messages = [
+         {"role": "system", "content": "You are a helpful assistant with vision abilities."},
+         {"role": "user", "content": [{"type": "image", "image": image_url}, {"type": "text", "text": prompt}]},
+     ]
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     ).to(model.device)
+     with torch.no_grad():
+         generated_ids = model.generate(**inputs, max_new_tokens=128)
+     generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+     output_texts = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )
+     return {"response": output_texts[0]}
+
+ @app.get("/chat")
+ def chat( prompt: str = Query(...)):
+     messages = [
+         {"role": "system", "content": "You are a helpful assistant with vision abilities."},
+         {"role": "user", "content": [ {"type": "text", "text": prompt}]},
+     ]
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = processor(
+         text=[text],
+         padding=True,
+         return_tensors="pt",
+     ).to(model.device)
+     with torch.no_grad():
+         generated_ids = model.generate(**inputs, max_new_tokens=128)
+     generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+     output_texts = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )
+     return {"response": output_texts[0]}