import json
from os import environ as env
from typing import Union
from llama_cpp import Llama, LlamaGrammar
from pydantic import BaseModel, Field
import runpod


# If your handler runs inference on a model, load the model here.
# Models should be loaded into memory before the serverless worker starts.
from huggingface_hub import hf_hub_download

# A smaller model, presumably kept as a lighter alternative for local testing
# (not used below).
small_repo = "TheBloke/phi-2-GGUF"
small_model = "phi-2.Q2_K.gguf"
big_repo = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
big_model = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"
LLM_MODEL_PATH = hf_hub_download(
    repo_id=big_repo,
    filename=big_model,
)
print(f"Model downloaded to {LLM_MODEL_PATH}")



in_memory_llm = None

# Runtime configuration, overridable via environment variables. Note the int()
# around N_GPU_LAYERS: env vars arrive as strings, and Llama expects an integer.
N_GPU_LAYERS = int(env.get("N_GPU_LAYERS", -1))  # -1 means offload all layers to the GPU if available
CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 2048))
USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
TEMPERATURE = float(env.get("TEMPERATURE", 0.3))
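# Example configuration (illustrative values, not requirements of this worker):
#   export N_GPU_LAYERS=-1
#   export CONTEXT_SIZE=4096
#   export MAX_TOKENS=512
#   export TEMPERATURE=0.2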

class Movie(BaseModel):
    title: str = Field(..., title="The title of the movie")
    year: int = Field(..., title="The year the movie was released")
    director: str = Field(..., title="The director of the movie")
    genre: str = Field(..., title="The genre of the movie")
    plot: str = Field(..., title="Plot summary of the movie")

JSON_EXAMPLE_MOVIE = """
{ "title": "The Matrix", "year": 1999, "director": "The Wachowskis", "genre": "Science Fiction", "plot": "A programmer realises he lives in a simulation and plays a key role." }
"""

# The module-level guard keeps the model loaded across warm invocations.
if in_memory_llm is None:
    print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
    in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)

def llm_stream_sans_network(
    prompt: str, pydantic_model_class=Movie, return_pydantic_object=False
) -> Union[str, BaseModel]:
    """Stream a completion constrained to the JSON schema of the given Pydantic model."""
    schema = pydantic_model_class.model_json_schema()

    # Drop the optional "example" field from the schema; it is not needed for
    # grammar generation.
    if "example" in schema:
        del schema["example"]

    json_schema = json.dumps(schema)
    grammar = LlamaGrammar.from_json_schema(json_schema)

    # The grammar constrains sampling, so the streamed text is valid JSON
    # matching the schema.
    stream = in_memory_llm(
        prompt,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        grammar=grammar,
        stream=True,
    )

    output_text = ""
    for chunk in stream:
        result = chunk["choices"][0]
        print(result["text"], end='', flush=True)
        output_text += result["text"]

    print('\n')

    if return_pydantic_object:
        model_object = pydantic_model_class.model_validate_json(output_text)
        return model_object
    else:
        return output_text
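# Example direct usage (hypothetical call, for illustration only):
#   movie = llm_stream_sans_network(
#       "Instruct: Output the JSON object for the movie: `Inception` Output: ",
#       return_pydantic_object=True,
#   )
#   print(movie.title, movie.year)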


def handler(job):
    """Handler function that will be used to process jobs."""
    job_input = job['input']

    name = job_input.get('name', 'World')

    return llm_stream_sans_network(
        f"""You need to output JSON objects describing movies.
        For example, for the movie called `The Matrix`: Output: {JSON_EXAMPLE_MOVIE}
        Instruct: Output the JSON object for the movie: `{name}` Output: """)

runpod.serverless.start({"handler": handler})
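
# Local smoke test (a sketch; the RunPod Python SDK supports running a handler
# locally with a test payload, though the exact invocation may vary by version):
#   python handler.py --test_input '{"input": {"name": "Inception"}}'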