"""Defines Pricer agent using fine-tuned LLaMA on Modal.""" | |
import logging | |
import os | |
from typing import Any | |
import modal | |
from src.modal_services.app_config import CACHE_PATH, app, modal_class_kwargs | |
from src.utils.text_utils import extract_tagged_price | |
logging.basicConfig(level=logging.INFO) | |
# Model identifiers on the Hugging Face Hub
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
FINETUNED_MODEL = "ed-donner/pricer-2024-09-13_13.04.39"
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"

# Local model paths inside the Modal volume
BASE_MODEL_DIR = f"{CACHE_PATH}/llama_base_model"
FINETUNED_MODEL_DIR = f"{CACHE_PATH}/llama_finetuned_model"

# Prompt pieces: the question asked and the answer prefix the model completes
QUESTION = "How much does this cost to the nearest dollar?"
PREFIX = "Price is $"


@app.cls(**modal_class_kwargs)
class FTPricer:
    """Remote pricing agent using LLaMA, PEFT, and 4-bit quantization."""

    def _build_prompt(self, description: str) -> str:
        """Assemble the question, product description, and answer prefix."""
        return f"{QUESTION}\n\n{description}\n\n{PREFIX}"

    def _generate_output(
        self,
        model: Any,  # noqa: ANN401
        inputs: dict,
        tokenizer: Any,  # noqa: ANN401
    ) -> str:
        """Generate output from the model."""
        import torch

        # Inference only: no gradients needed. Five new tokens are enough
        # to cover the digits of a dollar amount.
        with torch.no_grad():
            outputs = model.generate(
                **inputs, max_new_tokens=5, num_return_sequences=1
            )
        return tokenizer.decode(outputs[0])
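
    # Note: tokenizer.decode(outputs[0]) returns the prompt followed by the
    # completion, so the result ends in something like "Price is $ 99".
    # extract_tagged_price (defined in src.utils.text_utils, not shown here)
    # is assumed to parse the number that follows PREFIX.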

    def _download_models(self) -> None:
        """Snapshot both models from the Hub into the volume cache."""
        from huggingface_hub import snapshot_download

        snapshot_download(BASE_MODEL, local_dir=BASE_MODEL_DIR)
        snapshot_download(
            FINETUNED_MODEL, revision=REVISION, local_dir=FINETUNED_MODEL_DIR
        )

    def _load_tokenizer(self) -> None:
        from transformers import AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_DIR)
        # LLaMA ships without a pad token; reuse EOS and pad on the right.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"
        logging.info("Tokenizer loaded.")

    def _load_models(self) -> None:
        import torch
        from peft import PeftModel
        from transformers import AutoModelForCausalLM, BitsAndBytesConfig

        # Load the base model in 4-bit NF4 with double quantization so it
        # fits on a single GPU; compute runs in bfloat16.
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_DIR,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_quant_type="nf4",
            ),
            device_map="auto",
        )
        # Attach the fine-tuned PEFT adapters on top of the quantized base.
        self.fine_tuned_model = PeftModel.from_pretrained(
            base_model, FINETUNED_MODEL_DIR, revision=REVISION
        )
        self.fine_tuned_model.eval()

        # Align the generation config with the tokenizer's special tokens.
        gen_config = self.fine_tuned_model.generation_config
        gen_config.pad_token_id = self.tokenizer.pad_token_id
        gen_config.eos_token_id = self.tokenizer.eos_token_id
        logging.info("Models loaded.")

    @modal.enter()
    def setup(self) -> None:
        """Load base and fine-tuned models with tokenizer and quantization."""
        try:
            os.makedirs(CACHE_PATH, exist_ok=True)
            self._download_models()
            logging.info("Base and fine-tuned models downloaded.")
            self._load_tokenizer()
            self._load_models()
        except Exception as e:
            logging.error(f"[FTPricer] Setup failed: {e}")
            raise RuntimeError("[FTPricer] Model setup failed") from e

    @modal.method()
    def price(self, description: str) -> float:
        """Generate a price estimate from a product description."""
        from transformers import set_seed

        try:
            # Fix the seed so repeated calls give reproducible estimates.
            set_seed(42)
            logging.info("[FTPricer] Generating price...")
            prompt = self._build_prompt(description)
            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True).to(
                "cuda"
            )
            result = self._generate_output(
                self.fine_tuned_model, inputs, self.tokenizer
            )
            price = extract_tagged_price(result)
            logging.info(f"[FTPricer] Predicted price: {price}")
            return price
        except Exception as e:
            logging.error(f"[FTPricer] Prediction failed: {e}")
            return 0.0
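

# A minimal local-usage sketch, not part of the original module: it assumes
# `modal_class_kwargs` supplies the GPU image, volume, and secrets the class
# needs, and the sample description is made up. Invoke with `modal run` on
# this module.
@app.local_entrypoint()
def main() -> None:
    pricer = FTPricer()
    # .remote() runs price() inside the Modal container configured above.
    estimate = pricer.price.remote("Sony WH-1000XM4 wireless headphones")
    print(f"Estimated price: ${estimate:.2f}")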