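{{- /*
Deployment template for the h2oGPT chart. Renders up to three Deployments, each
gated by a values flag: the h2oGPT web app (h2ogpt.enabled), a TGI inference
backend (tgi.enabled), and a vLLM inference backend (vllm.enabled). TGI and
vLLM are mutually exclusive; the guard below fails the render if both are set.
*/}}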
{{- if and .Values.vllm.enabled .Values.tgi.enabled }}
{{- fail "Both TGI and vLLM cannot be enabled at the same time. Enable only one and try again" }}
{{- end }}
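{{- /*
Illustrative install command (release name and flags are examples only):
  helm install h2ogpt . --set vllm.enabled=true --set tgi.enabled=false
*/}}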
---
{{- if .Values.h2ogpt.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}
  labels:
    app: {{ include "h2ogpt.fullname" . }}
spec:
  {{- if not .Values.h2ogpt.autoscaling.enabled }}
  replicas: {{ .Values.h2ogpt.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}
  {{- if .Values.h2ogpt.updateStrategy }}
  strategy: {{- toYaml .Values.h2ogpt.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.h2ogpt.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}
    spec:
      {{- with .Values.h2ogpt.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.h2ogpt.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.h2ogpt.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
                topologyKey: topology.kubernetes.io/zone
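      {{- /*
      When an inference backend is enabled, an init container polls its Service
      with wget and blocks h2oGPT startup until the backend responds, so the
      app never comes up pointing at an unreachable inference server.
      */}}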
      {{- if .Values.tgi.enabled }}
      initContainers:
        - name: tgi-check
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: busybox:1.36
          command: ["/bin/sh", "-c"]
          args:
            - >
              until wget -O- http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}/ >/dev/null 2>&1;
              do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done
      {{- end }}
      {{- if .Values.vllm.enabled }}
      initContainers:
        - name: vllm-check
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: busybox:1.36
          command: ["/bin/sh", "-c"]
          args:
            - >
              until wget -O- http://{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}/v1/models >/dev/null 2>&1;
              do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done
      {{- end }}
      {{- with .Values.vllm.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: {{ include "h2ogpt.fullname" . }}
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: "{{ .Values.h2ogpt.image.repository }}:{{ .Values.h2ogpt.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.h2ogpt.image.pullPolicy }}
          command: []
          args:
            - /workspace/generate.py
          ports:
            - name: http
              containerPort: 7860
              protocol: TCP
            - name: gpt
              containerPort: 8888
              protocol: TCP
          {{- if .Values.h2ogpt.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.h2ogpt.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.h2ogpt.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.h2ogpt.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.h2ogpt.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-config
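          {{- /*
          h2ogpt reads the backend address from h2ogpt_inference_server: a plain
          http://host:port URL for TGI, or a "vllm:host:port" triple for vLLM.
          Exactly one of the two blocks below renders, thanks to the guard at
          the top of this file.
          */}}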
          {{- if .Values.tgi.enabled }}
          env:
            - name: h2ogpt_inference_server
              value: "http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}"
          {{- end }}
          {{- if .Values.vllm.enabled }}
          env:
            - name: h2ogpt_inference_server
              value: "vllm:{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}"
          {{- end }}
          volumeMounts:
            - name: h2ogpt-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: h2ogpt-volume
              mountPath: /workspace/save
              subPath: save
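      {{- /*
      A generic ephemeral volume backs both mounts above: the model cache and
      saved outputs share one PVC that lives and dies with the pod and is
      re-provisioned from the storage class on reschedule.
      */}}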
      volumes:
        - name: h2ogpt-volume
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.h2ogpt.storage.size | quote }}
                storageClassName: {{ .Values.h2ogpt.storage.class | quote }}
{{- end }}
---
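{{- /*
Optional TGI (Hugging Face text-generation-inference) backend. It listens on
container port 80; the tgi-check init container and the h2ogpt_inference_server
URL above both target its Service.
*/}}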
{{- if .Values.tgi.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-tgi-inference
  labels:
    app: {{ include "h2ogpt.fullname" . }}-tgi-inference
spec:
  {{- if not .Values.tgi.autoscaling.enabled }}
  replicas: {{ .Values.tgi.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-tgi-inference
  {{- if .Values.tgi.updateStrategy }}
  strategy: {{- toYaml .Values.tgi.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.tgi.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-tgi-inference
    spec:
      {{- with .Values.tgi.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tgi.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.tgi.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}-tgi-inference
                topologyKey: topology.kubernetes.io/zone
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-tgi-inference
          securityContext:
            {{- toYaml .Values.tgi.securityContext | nindent 12 }}
          image: "{{ .Values.tgi.image.repository }}:{{ .Values.tgi.image.tag }}"
          imagePullPolicy: {{ .Values.tgi.image.pullPolicy }}
          command: []
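          {{- /*
          All TGI launcher flags come verbatim from values. An illustrative
          values.yaml snippet (model name is an example, not a chart default):
            tgi:
              containerArgs:
                - "--model-id"
                - "h2oai/h2ogpt-4096-llama2-7b-chat"
          */}}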
          args:
            {{- range $arg := .Values.tgi.containerArgs }}
            - "{{ $arg }}"
            {{- end }}
          ports:
            - name: http
              containerPort: 80
              protocol: TCP
          {{- if .Values.tgi.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.tgi.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.tgi.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-tgi-inference-config
            - secretRef:
                name: {{ .Values.tgi.hfSecret }}
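          {{- /*
          One ephemeral volume serves the TGI cache (/app/cache), downloaded
          model weights (/data), and /dev/shm; mounting over /dev/shm replaces
          the container runtime's small default shared-memory segment, which
          TGI uses for inter-process communication.
          */}}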
          volumeMounts:
            - name: h2ogpt-tgi-inference-volume
              mountPath: /app/cache
              subPath: cache
            - name: h2ogpt-tgi-inference-volume
              mountPath: /data
              subPath: data
            - name: h2ogpt-tgi-inference-volume
              mountPath: /dev/shm
              subPath: shm
      volumes:
        - name: h2ogpt-tgi-inference-volume
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.tgi.storage.size | quote }}
                storageClassName: {{ .Values.tgi.storage.class | quote }}
{{- end }}
---
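{{- /*
Optional vLLM backend, serving an OpenAI-compatible API on container port 5000.
*/}}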
{{- if .Values.vllm.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-vllm-inference
  labels:
    app: {{ include "h2ogpt.fullname" . }}-vllm-inference
spec:
  {{- if not .Values.vllm.autoscaling.enabled }}
  replicas: {{ .Values.vllm.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-vllm-inference
  {{- if .Values.vllm.updateStrategy }}
  strategy: {{- toYaml .Values.vllm.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.vllm.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-vllm-inference
    spec:
      {{- with .Values.vllm.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.vllm.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.vllm.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}-vllm-inference
                topologyKey: topology.kubernetes.io/zone
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
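          {{- /*
          Launches vLLM's OpenAI-compatible API server from the Python
          environment baked into the image. Port, host, and download dir are
          fixed below; model selection comes from values, e.g. (illustrative
          model name, not a chart default):
            vllm:
              containerArgs:
                - "--model"
                - "h2oai/h2ogpt-4096-llama2-7b-chat"
          */}}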
command: ["/h2ogpt_conda/vllm_env/bin/python3.10"]
args:
- "-m"
- "vllm.entrypoints.openai.api_server"
- "--port"
- "5000"
- "--host"
- "0.0.0.0"
- "--download-dir"
- "/workspace/.cache/huggingface/hub"
{{- range $arg := .Values.vllm.containerArgs }}
- "{{ $arg }}"
{{- end }}
ports:
- name: http
containerPort: 5000
protocol: TCP
{{- if .Values.vllm.livenessProbe }}
livenessProbe:
httpGet:
path: /
scheme: HTTP
port: http
{{- toYaml .Values.vllm.livenessProbe | nindent 12 }}
{{- end }}
{{- if .Values.vllm.readinessProbe }}
readinessProbe:
httpGet:
path: /
scheme: HTTP
port: http
{{- toYaml .Values.vllm.readinessProbe | nindent 12 }}
{{- end }}
resources:
{{- toYaml .Values.vllm.resources | nindent 12 }}
envFrom:
- configMapRef:
name: {{ include "h2ogpt.fullname" . }}-vllm-inference-config
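          {{- /*
          NCCL_IGNORE_DISABLED_P2P=1 lets multi-GPU (tensor-parallel) runs
          proceed on nodes where GPU peer-to-peer access is disabled, instead
          of NCCL aborting at startup.
          */}}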
          env:
            - name: NCCL_IGNORE_DISABLED_P2P
              value: "1"
          volumeMounts:
            - name: h2ogpt-vllm-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: h2ogpt-vllm-inference-volume
              mountPath: /dev/shm
              subPath: shm
      volumes:
        - name: h2ogpt-vllm-inference-volume
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.vllm.storage.size | quote }}
                storageClassName: {{ .Values.vllm.storage.class | quote }}
{{- end }}