	Update main.py
main.py CHANGED
@@ -2,11 +2,11 @@ from fastapi import FastAPI
 from pydantic import BaseModel
 from huggingface_hub import InferenceClient
 import uvicorn
-
+import asyncio
 
 app = FastAPI()
 
-client = InferenceClient("Qwen/Qwen2.5-
+client = InferenceClient(model="Qwen/Qwen2.5-7B")
 
 class Item(BaseModel):
     prompt: str
@@ -26,9 +26,7 @@ def format_prompt(message, history):
     return prompt
 
 def generate(item: Item):
-    temperature = float(item.temperature)
-    if temperature < 1e-2:
-        temperature = 1e-2
+    temperature = max(float(item.temperature), 1e-2)
     top_p = float(item.top_p)
 
     generate_kwargs = dict(
@@ -50,4 +48,16 @@ def generate(item: Item):
 
 @app.post("/generate/")
 async def generate_text(item: Item):
-    return {"response": generate(item)}
+    return {"response": generate(item)}
+
+@app.on_event("startup")
+async def preload_model():
+    # Check if the model is already loaded
+    status = client.get_model_status()
+    if not status.loaded:
+        # Trigger model loading by making a dummy request
+        dummy_prompt = "This is a dummy prompt to load the model."
+        client.text_generation(dummy_prompt, max_new_tokens=1)
+        # Optionally, wait until the model is loaded
+        while not client.get_model_status().loaded:
+            await asyncio.sleep(5)  # Wait for 5 seconds before checking again
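For reference, a minimal sketch of calling the updated /generate/ endpoint once the Space is running. The host and port (uvicorn's default 8000) and the temperature/top_p fields on Item are assumptions: generate() reads them, but their declarations sit outside this diff.

# Hypothetical client-side call; adjust the URL and fields to the actual deployment.
import requests

payload = {
    "prompt": "Write a haiku about model loading.",
    "temperature": 0.7,  # clamped to >= 1e-2 by generate()
    "top_p": 0.95,
}
resp = requests.post("http://localhost:8000/generate/", json=payload)
resp.raise_for_status()
print(resp.json()["response"])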