Ais committed
Commit 6668ea3 · verified · Parent: d123b85

Update app/main.py

Files changed (1): app/main.py (+35 -41)
app/main.py CHANGED
@@ -1,55 +1,49 @@
- from fastapi import FastAPI, Request
- from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
  from peft import PeftModel
  import torch
- import os
  import gdown
+ import os
+ import zipfile

- app = FastAPI()
+ # Constants
+ BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
+ ADAPTER_FOLDER = "adapter"
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)

- # Auto-download adapter from Google Drive (if not already present)
- ADAPTER_DIR = "adapter"
- ADAPTER_PATH = os.path.join(ADAPTER_DIR, "adapter_model.safetensors")
- DRIVE_FILE_ID = "1wnuE5t_m4ojI7YqxXZ8lBdtDFoHJJ6_H" # version 1 model
+ # Step 1: Download adapter zip from Drive (version 1)
+ zip_url = "https://drive.google.com/uc?id=1z8U98kW9GD29t-3v8LDu0SsdqJ_vzNvQ" # Your .zip file link
+ zip_path = "adapter.zip"

- if not os.path.exists(ADAPTER_PATH):
-     os.makedirs(ADAPTER_DIR, exist_ok=True)
-     gdown.download(f"https://drive.google.com/uc?id={DRIVE_FILE_ID}", ADAPTER_PATH, quiet=False)
+ if not os.path.exists(ADAPTER_FOLDER):
+     print("📥 Downloading adapter...")
+     gdown.download(zip_url, zip_path, quiet=False)

- # Load base model
+     print("📂 Extracting adapter...")
+     with zipfile.ZipFile(zip_path, "r") as zip_ref:
+         zip_ref.extractall(ADAPTER_FOLDER)
+
+ # Step 2: Load base model (non-quantized, CPU-friendly)
+ print("🚀 Loading base model...")
  base_model = AutoModelForCausalLM.from_pretrained(
-     "Qwen/Qwen2-0.5B-Instruct",
+     BASE_MODEL,
+     torch_dtype=torch.float16,
      device_map="auto",
-     torch_dtype=torch.float16
+     token=HF_TOKEN
  )

- # Load tokenizer
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+ # Step 3: Apply LoRA adapter
+ print("🔧 Applying LoRA adapter...")
+ model = PeftModel.from_pretrained(base_model, ADAPTER_FOLDER)

- # Load LoRA adapter
- model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
- model.eval()
+ # Step 4: Load tokenizer
+ print("🧠 Loading tokenizer...")
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

- @app.post("/chat")
- async def chat(request: Request):
-     data = await request.json()
-     prompt = data.get("prompt")
+ # Step 5: Inference pipeline
+ pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

-     if not prompt:
-         return {"error": "No prompt provided."}
-
-     full_prompt = f"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-
-     inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=256,
-             temperature=0.7,
-             do_sample=True,
-             top_p=0.9
-         )
-
-     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     response = response.split("<|im_start|>assistant\n")[-1].strip()
-     return {"response": response}
+ # Step 6: Try a prompt
+ prompt = "What is the capital of India?"
+ print("💬 Prompt:", prompt)
+ output = pipe(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)
+ print("📤 Output:", output[0]["generated_text"])