# CogVLM2-4-Doc / app.py
import os
import torch
import spaces
import gradio as gr
from PIL import Image
from transformers.utils import move_cache
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load the model and tokenizer
MODEL_PATH = "THUDM/cogvlm2-llama3-chat-19B"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'  # use accelerated hf_transfer downloads
MODEL_PATH = snapshot_download(MODEL_PATH)  # resolve the repo id to a local snapshot path
move_cache()
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
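# Prefer bfloat16 on GPUs with compute capability >= 8 (Ampere and newer); older GPUs fall back to float16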
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
tokenizer = AutoTokenizer.from_pretrained(
MODEL_PATH,
trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
).to(DEVICE).eval()
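# On ZeroGPU Spaces, the @spaces.GPU decorator attaches a GPU for the duration of each decorated call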
@spaces.GPU
def generate_caption(image, prompt):
    # Guard against a missing upload; the model needs an image to caption.
    if image is None:
        return "Please upload an image."
    # Build multimodal inputs with the model's custom helper (exposed via trust_remote_code).
    input_by_model = model.build_conversation_input_ids(
        tokenizer, query=prompt, history=[], images=[image], template_version='chat'
    )
    inputs = {
        'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
        'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
        'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
        'images': [[input_by_model['images'][0].to(DEVICE).to(TORCH_TYPE)]],
    }
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=2048, pad_token_id=128002)
        # Keep only the newly generated tokens, then strip the end-of-text marker.
        outputs = outputs[:, inputs['input_ids'].shape[1]:]
    return tokenizer.decode(outputs[0]).split("<|end_of_text|>")[0]
## make predictions via api ##
# https://www.gradio.app/guides/getting-started-with-the-python-client#connecting-a-general-gradio-app
demo = gr.Interface(
fn=generate_caption,
inputs=[gr.Image(type="pil", label="Upload Image"), gr.Textbox(label="Prompt", value="Describe the image in great detail")],
outputs=gr.Textbox(label="Generated Caption"),
    description="Generate a detailed caption for an uploaded image with CogVLM2."
)
# Launch the interface
demo.launch(share=True)
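# Example (untested sketch): querying this Space from the Gradio Python client,
# per the guide linked above. The Space ID below is assumed from the repo name;
# adjust it to the real deployment. Older gradio_client versions accept a plain
# file path in place of handle_file(...).
#
# from gradio_client import Client, handle_file
# client = Client("DoctorSlimm/CogVLM2-4-Doc")
# caption = client.predict(
#     handle_file("example.jpg"),             # image input
#     "Describe the image in great detail",   # prompt input
#     api_name="/predict",
# )
# print(caption)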
####### ML CODE (reference) #######
# Upstream CogVLM2 CLI chat demo, kept for reference; it reuses the tokenizer,
# model, DEVICE, and TORCH_TYPE defined above. demo.launch() blocks, so this
# loop does not run while the Space is serving requests.
text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
while True:
image_path = input("image path >>>>> ")
if image_path == '':
        print('You did not enter an image path; the following will be a plain-text conversation.')
image = None
text_only_first_query = True
else:
image = Image.open(image_path).convert('RGB')
history = []
while True:
query = input("Human:")
if query == "clear":
break
if image is None:
if text_only_first_query:
query = text_only_template.format(query)
text_only_first_query = False
else:
old_prompt = ''
            for old_query, response in history:
old_prompt += old_query + " " + response + "\n"
query = old_prompt + "USER: {} ASSISTANT:".format(query)
if image is None:
input_by_model = model.build_conversation_input_ids(
tokenizer,
query=query,
history=history,
template_version='chat'
)
else:
input_by_model = model.build_conversation_input_ids(
tokenizer,
query=query,
history=history,
images=[image],
template_version='chat'
)
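        # unsqueeze(0) adds the batch dimension; 'images' is nested as a list of per-sample image lists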
inputs = {
'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
'images': [[input_by_model['images'][0].to(DEVICE).to(TORCH_TYPE)]] if image is not None else None,
}
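        # 128002 is a reserved special token in the Llama-3 tokenizer, used here as the pad id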
gen_kwargs = {
"max_new_tokens": 2048,
"pad_token_id": 128002,
}
with torch.no_grad():
outputs = model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = tokenizer.decode(outputs[0])
response = response.split("<|end_of_text|>")[0]
print("\nCogVLM2:", response)
history.append((query, response))