import os
from transformers import AutoModelForCausalLM, AutoTokenizer, Pipeline, pipeline
from langchain_openai import OpenAI
# from huggingface_hub import login
from dotenv import load_dotenv
from logging import getLogger
# import streamlit as st
import torch
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")
# # hf_token = st.secrets["HF_TOKEN"]
# login(token=hf_token)
logger = getLogger(__name__)
device = "cuda" if torch.cuda.is_available() else "cpu"


def get_local_model(model_name_or_path: str) -> Pipeline:
    # print(f"Model is running on {device}")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        token=hf_token,
    )
    # Load the weights in 4-bit (requires bitsandbytes and a CUDA device),
    # keeping bfloat16 as the compute dtype.
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        load_in_4bit=True,
        token=hf_token,
    )
    # The 4-bit model is already placed on the GPU by bitsandbytes/accelerate,
    # so no explicit `device` is passed to the pipeline (calling `.to()` on a
    # quantized model raises an error).
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
    )
    logger.info(f"Text-generation pipeline created and loaded to {device}")
    return pipe


def get_endpoint(api_key: str) -> OpenAI:
    llm = OpenAI(openai_api_key=api_key)
    return llm


def get_model(model_type: str, model_name_or_path: str, api_key: str = None):
    # "openai" routes to the hosted endpoint; anything else loads a local HF model.
    if model_type == "openai":
        return get_endpoint(api_key)
    else:
        return get_local_model(model_name_or_path)
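

# Example usage: a minimal sketch, not part of the original module. The model id
# below ("TinyLlama/TinyLlama-1.1B-Chat-v1.0") is only a placeholder assumption;
# any causal LM repo id can be substituted.
if __name__ == "__main__":
    # Local Hugging Face backend (any non-"openai" model_type takes this path).
    local_pipe = get_model("hf", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    output = local_pipe(
        "Summarize: The quick brown fox jumps over the lazy dog.",
        max_new_tokens=64,
    )
    print(output[0]["generated_text"])

    # OpenAI backend (requires a valid API key, e.g. from the OPENAI_API_KEY env var).
    # llm = get_model("openai", None, api_key=os.environ.get("OPENAI_API_KEY"))
    # print(llm.invoke("Summarize: The quick brown fox jumps over the lazy dog."))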