Update app.py

app.py CHANGED
@@ -5,7 +5,9 @@ from vllm import LLM, SamplingParams
 # Load the model and tokenizer from Hugging Face
 model_name = "facebook/opt-125m"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-
+
+# Initialize vLLM with CPU-only configuration
+vllm_model = LLM(model=model_name, tensor_parallel_size=1, device="cpu")
 
 def generate_response(prompt, max_tokens, temperature, top_p):
     # Tokenize the prompt
@@ -27,8 +29,8 @@ def generate_response(prompt, max_tokens, temperature, top_p):
 
 # Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("# π Hugging Face Integration with vLLM")
-    gr.Markdown("Generate text using the vLLM integration with Hugging Face models.")
+    gr.Markdown("# π Hugging Face Integration with vLLM (CPU)")
+    gr.Markdown("Generate text using the vLLM integration with Hugging Face models on CPU.")
 
     with gr.Row():
         with gr.Column():
@@ -74,4 +76,4 @@ with gr.Blocks() as demo:
     )
 
 # Launch the app
-demo.launch()
+demo.launch()
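For context, here is a minimal sketch of what the full app.py plausibly looks like after this commit. The diff above elides the body of generate_response and the Gradio input controls, so those parts are assumptions: the slider ranges, the component names (prompt, submit, output), and the click wiring are illustrative, not taken from the Space. The original function also appears to tokenize the prompt explicitly (per its "# Tokenize the prompt" comment); this sketch instead passes the raw string to vllm_model.generate, which tokenizes internally.

import gradio as gr
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Load the model and tokenizer from Hugging Face
# (the tokenizer is loaded as in the diff, though vLLM can tokenize on its own)
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Initialize vLLM with CPU-only configuration, as in the commit
vllm_model = LLM(model=model_name, tensor_parallel_size=1, device="cpu")

def generate_response(prompt, max_tokens, temperature, top_p):
    # Map the UI controls onto vLLM sampling parameters (assumed mapping;
    # the real function body is not shown in the diff)
    sampling_params = SamplingParams(
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    # Generate a single completion for the raw prompt string
    outputs = vllm_model.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# π Hugging Face Integration with vLLM (CPU)")
    gr.Markdown("Generate text using the vLLM integration with Hugging Face models on CPU.")

    with gr.Row():
        with gr.Column():
            # Input controls: names, ranges, and defaults are illustrative
            prompt = gr.Textbox(label="Prompt", lines=4)
            max_tokens = gr.Slider(1, 512, value=128, step=1, label="Max tokens")
            temperature = gr.Slider(0.0, 2.0, value=0.7, label="Temperature")
            top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top-p")
            submit = gr.Button("Generate")
        with gr.Column():
            output = gr.Textbox(label="Generated text", lines=10)

    # Wire the button to the generation function (assumed wiring)
    submit.click(
        generate_response,
        inputs=[prompt, max_tokens, temperature, top_p],
        outputs=output,
    )

# Launch the app
demo.launch()

One design note: depending on the vLLM version, device="cpu" is only honored by CPU builds of vLLM (compiled with VLLM_TARGET_DEVICE=cpu), so the Space presumably pins a compatible release in its requirements.txt.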