arya-ai-model committed
Commit 59e3ffd · 1 Parent(s): bb37662

fixing app.py

Files changed (1): app.py (+30, -13)
app.py CHANGED
@@ -1,26 +1,28 @@
  import os
-
- # Set a writable cache directory
- os.environ["HF_HOME"] = "/tmp/huggingface"
- os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
-
- # Now import the required libraries
  import torch
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
  from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

+ # Set a writable cache directory
+ os.environ["HF_HOME"] = "/tmp/huggingface"
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
+
  # Model setup
  MODEL_NAME = "deepseek-ai/deepseek-llm-7b-base"
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ DTYPE = torch.float16 if DEVICE == "cuda" else torch.bfloat16

  # Load model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  model = AutoModelForCausalLM.from_pretrained(
-     MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto"
+     MODEL_NAME, torch_dtype=DTYPE, device_map="auto"
  )
- model.generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
- model.generation_config.pad_token_id = model.generation_config.eos_token_id
+
+ # Set up generation config
+ generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
+ generation_config.pad_token_id = generation_config.eos_token_id
+ generation_config.use_cache = True  # Speed up decoding

  # FastAPI app
  app = FastAPI()
@@ -28,14 +30,29 @@ app = FastAPI()
  # Request payload
  class TextGenerationRequest(BaseModel):
      prompt: str
-     max_tokens: int = 100
+     max_tokens: int = 512  # Default to 512 for better performance

  @app.post("/generate")
  async def generate_text(request: TextGenerationRequest):
      try:
-         inputs = tokenizer(request.prompt, return_tensors="pt").to(DEVICE)
-         outputs = model.generate(**inputs, max_new_tokens=request.max_tokens)
-         result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         # Tokenize input and move tensors to the correct device
+         inputs = tokenizer(request.prompt, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
+
+         # Use no_grad() for faster inference
+         with torch.no_grad():
+             outputs = model.generate(
+                 **inputs,
+                 max_new_tokens=request.max_tokens,
+                 do_sample=True,          # Enables sampling (use False for deterministic results)
+                 temperature=0.7,         # Adjust for creativity (lower = more conservative)
+                 top_k=50,                # Consider top 50 token choices
+                 top_p=0.9,               # Nucleus sampling (reduces unlikely words)
+                 repetition_penalty=1.1,  # Prevents looping responses
+             )
+
+         # Decode generated tokens
+         result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
          return {"generated_text": result}
+
      except Exception as e:
          raise HTTPException(status_code=500, detail=str(e))
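
For context, a minimal client sketch for the /generate endpoint defined above is shown next. It assumes the FastAPI app is being served locally (for example with uvicorn app:app --host 0.0.0.0 --port 8000); the base URL, the example prompt, and the requests dependency are illustrative assumptions, not part of this commit.

# Example request to the /generate endpoint (hypothetical host/port; adjust to the actual deployment)
import requests

BASE_URL = "http://localhost:8000"  # assumed; a Hugging Face Space typically listens on port 7860

payload = {
    "prompt": "Write a haiku about GPUs.",
    "max_tokens": 64,  # optional; overrides the 512-token default in TextGenerationRequest
}

response = requests.post(f"{BASE_URL}/generate", json=payload, timeout=300)
response.raise_for_status()
print(response.json()["generated_text"])

The response body mirrors the handler's return value: a JSON object with a single generated_text field, or an HTTP 500 with the exception message if generation fails.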