Commit 13f6fc8 · Parent(s): 499cad2

init

- app.py +70 -0
- panda_gpt.py +116 -0
- requirements.txt +22 -0
app.py
ADDED
@@ -0,0 +1,70 @@
from panda_gpt import PandaGPT
import gradio as gr
from huggingface_hub import hf_hub_download

# Download the quantized Vicuna weights (ggml) and the PandaGPT projection checkpoint.
vicuna_path = hf_hub_download(repo_id="ningshanwutuobang/ggml-pandagpt-vicuna-merge", filename="ggml-pandagpt-vicuna-q4_1.bin")
panda_path = hf_hub_download(repo_id="openllmplayground/pandagpt_13b_max_len_400", filename="pytorch_model.pt")

a = PandaGPT((vicuna_path,))
a.load_projection(panda_path)


def add_text(history, text):
    history = history + [(text, None)]
    return history, gr.update(value="", interactive=False)


def add_file(history, file):
    # Uploaded files are stored as one-element tuples so gradio renders them as media.
    history = history + [((file.name,), None)]
    return history


def bot(history):
    text = history[-1][0]
    image_paths = []
    audio_paths = []
    video_paths = []
    for i in history[:-1]:
        if i[1] is None:
            # Pending uploads: route each file to a modality by its extension.
            name = i[0][0] if isinstance(i[0], (tuple, list)) else i[0]
            if name.endswith((".png", ".jpg", ".jpeg")):
                image_paths += list(i[0])
            if name.endswith((".mp3", ".wav")):
                audio_paths += list(i[0])
            if name.endswith((".mp4", ".avi", ".mkv")):
                video_paths += list(i[0])
        else:
            # An answered turn resets the pending media.
            image_paths = []
            audio_paths = []
            video_paths = []
    if len(image_paths) == 0 and len(audio_paths) == 0 and len(video_paths) == 0:
        response = a.chat(text)
    else:
        response = a.chat_with_image({"image_paths": image_paths, "audio_paths": audio_paths, "video_paths": video_paths}, text)
    history[-1][1] = response[:-3]  # strip the trailing "###" stop marker
    return history


with gr.Blocks() as demo:
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=750)

    with gr.Row():
        with gr.Column(scale=0.85):
            txt = gr.Textbox(
                show_label=False,
                placeholder="Enter text and press enter, or upload an image",
            ).style(container=False)
        with gr.Column(scale=0.15, min_width=0):
            btn = gr.UploadButton("📁", file_types=["image", "video", "audio"])

    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
        bot, chatbot, chatbot
    )
    txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
    file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False)

demo.launch()
panda_gpt.py
ADDED
@@ -0,0 +1,116 @@
import os, sys
from ctypes import POINTER, c_float

import llama_cpp
import numpy as np
import torch
from torch import nn

# Clone PandaGPT next to this file so its bundled ImageBind code can be imported.
panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
imagebind_ckpt_path = os.path.join(os.path.dirname(__file__), "imagebind_huge.pth")

if not os.path.exists(panda_gpt_path):
    os.system("git clone https://github.com/yxuansu/PandaGPT " + panda_gpt_path)

sys.path.insert(0, os.path.join(panda_gpt_path, "code", "model"))
from ImageBind.models import imagebind_model
from ImageBind.models.imagebind_model import ModalityType
from ImageBind import data


def numpy_to_floatptr(x):
    # llama_eval_embd expects a raw C float pointer.
    return x.astype(np.float32).ctypes.data_as(POINTER(c_float))


class PandaGPT:
    def __init__(self, args=(), kwargs=None):
        self.visual_encoder, _ = imagebind_model.imagebind_huge(pretrained=True, store_path=os.path.dirname(imagebind_ckpt_path))
        self.visual_encoder.eval()
        # Project ImageBind-huge embeddings (1024) to Vicuna-13B's hidden size (5120).
        self.llama_proj = nn.Linear(1024, 5120)
        self.max_tgt_len = 400
        self.model = llama_cpp.Llama(*args, **(kwargs or {}))
        self.generated_text = ""
        self.device = "cpu"

    def eval_embd(self, x):
        # Feed projected embeddings into the llama.cpp context as pseudo-tokens.
        y = numpy_to_floatptr(x.T)
        ctx = self.model.ctx
        n_past = self.model.n_tokens
        n_threads = self.model.n_threads
        llama_cpp.llama_eval_embd(ctx, y, x.shape[0], n_past, n_threads)
        self.model.n_tokens += x.shape[0]

    def eval_string(self, s):
        s = self.model.tokenize(s.encode())
        self.model.eval(s)

    def generate_with_print(self, end="###"):
        # Sample tokens until the "###" turn separator appears or max_tgt_len is hit.
        end = end.encode()
        ret = b""
        for i in range(self.max_tgt_len):
            token = self.model.sample()
            self.model.eval([token])
            txt = self.model.detokenize([token])
            ret += txt
            print(txt.decode(errors="replace"), flush=True, end="")
            if ret.endswith(end):
                break
        return ret.decode(errors="replace")

    def load_projection(self, path):
        # Load only the llama_proj weights from the PandaGPT checkpoint.
        state = torch.load(path, map_location="cpu")
        self.llama_proj.load_state_dict({
            "weight": state["llama_proj.weight"],
            "bias": state["llama_proj.bias"]})

    def eval_inputs(self, inputs):
        self.eval_string("<Img>")
        embds = self.extract_multimodal_feature(inputs)
        for i in embds:
            self.eval_embd(i)
        self.eval_string("</Img> ")

    def chat(self, question):
        return self.chat_with_image(None, question)

    def chat_with_image(self, inputs, question):
        if self.generated_text == "":
            self.eval_string("###")
        self.eval_string(" Human: ")
        if inputs:
            self.eval_inputs(inputs)
        self.eval_string(question)
        self.eval_string("\n### Assistant:")
        ret = self.generate_with_print(end="###")
        self.generated_text += ret
        return ret

    def extract_multimodal_feature(self, inputs):
        features = []
        for key in ["image", "audio", "video", "thermal"]:
            if inputs.get(key + "_paths"):  # skip missing or empty path lists
                embeds = self.encode_data(key, inputs[key + "_paths"])
                features.append(embeds)
        return features

    def encode_data(self, data_type, data_paths):
        type_map = {
            "image": ModalityType.VISION,
            "audio": ModalityType.AUDIO,
            "video": ModalityType.VISION,
            "thermal": ModalityType.THERMAL,
        }
        load_map = {
            "image": data.load_and_transform_vision_data,
            "audio": data.load_and_transform_audio_data,
            "video": data.load_and_transform_video_data,
            "thermal": data.load_and_transform_thermal_data,
        }

        load_function = load_map[data_type]
        key = type_map[data_type]

        inputs = {key: load_function(data_paths, self.device)}
        with torch.no_grad():
            embeddings = self.visual_encoder(inputs)
            embeds = embeddings[key]
            embeds = self.llama_proj(embeds).cpu().numpy()
        return embeds
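
For reference, the effective prompt that chat_with_image assembles through its successive eval_string calls follows the Vicuna-style turn layout sketched below. This is reconstructed from the code above, not from PandaGPT documentation; the nn.Linear(1024, 5120) projection maps each ImageBind embedding onto Vicuna-13B's hidden size, and eval_embd splices the result into the context as pseudo-tokens at the position marked "<emb>".

# Reconstruction of the text fed to the model (inferred from the eval_string
# calls above); "<emb>" marks where projected embeddings are injected.
def render_turn(question, first_turn=True, with_media=False):
    prompt = "###" if first_turn else ""
    prompt += " Human: "
    if with_media:
        prompt += "<Img><emb></Img> "
    prompt += question + "\n### Assistant:"
    return prompt

assert render_turn("hi") == "### Human: hi\n### Assistant:"
assert render_turn("describe this", with_media=True) == (
    "### Human: <Img><emb></Img> describe this\n### Assistant:")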
requirements.txt
ADDED
@@ -0,0 +1,22 @@
timm==0.6.7
deepspeed==0.9.2
data
einops==0.6.1
ftfy==6.1.1
iopath==0.1.10
ipdb==0.13.13
numpy==1.24.3
peft==0.3.0
Pillow==9.5.0
PyYAML==6.0
regex==2022.10.31
torchvision==0.14.1
torchaudio==0.13.1
pytorchvideo
fvcore
decord==0.6.0
tqdm==4.64.1
transformers==4.29.1
llama-cpp-python>=0.1.67
gradio
huggingface_hub