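{{- /*
Deployment template for the h2oGPT chart. Renders up to three Deployments, each
gated by a values flag: the h2oGPT web app (h2ogpt.enabled), a TGI inference
backend (tgi.enabled), and a vLLM inference backend (vllm.enabled). TGI and
vLLM are mutually exclusive; the guard below fails the render if both are set.
*/}}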
{{- if and .Values.vllm.enabled .Values.tgi.enabled }}
{{- fail "Both TGI and vLLM cannot be enabled at the same time. Enable only one and try again" }}
{{- end }}
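{{- /*
Illustrative install command (release name and flags are examples only):
  helm install h2ogpt . --set vllm.enabled=true --set tgi.enabled=false
*/}}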
---
{{- if .Values.h2ogpt.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}
  labels:
    app: {{ include "h2ogpt.fullname" . }}
spec:
  {{- if not .Values.h2ogpt.autoscaling.enabled }}
  replicas: {{ .Values.h2ogpt.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}
  {{- if .Values.h2ogpt.updateStrategy }}
  strategy: {{- toYaml .Values.h2ogpt.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.h2ogpt.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}
    spec:
      {{- with .Values.h2ogpt.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.h2ogpt.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.h2ogpt.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
                topologyKey: topology.kubernetes.io/zone
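      {{- /*
      When an inference backend is enabled, an init container polls its Service
      with wget and blocks h2oGPT startup until the backend responds, so the
      app never comes up pointing at an unreachable inference server.
      */}}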
      {{- if .Values.tgi.enabled }}
      initContainers:
        - name: tgi-check
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: busybox:1.36
          command: ["/bin/sh", "-c"]
          args:
            - >
              until wget -O- http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}/ >/dev/null 2>&1;
              do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done
      {{- end }}
      {{- if .Values.vllm.enabled }}
      initContainers:
        - name: vllm-check
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: busybox:1.36
          command: ["/bin/sh", "-c"]
          args:
            - >
              until wget -O- http://{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}/v1/models >/dev/null 2>&1;
              do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done
      {{- end }}
      {{- with .Values.vllm.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: {{ include "h2ogpt.fullname" . }}
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: "{{ .Values.h2ogpt.image.repository }}:{{ .Values.h2ogpt.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.h2ogpt.image.pullPolicy }}
          command: []
          args:
            - /workspace/generate.py
          ports:
            - name: http
              containerPort: 7860
              protocol: TCP
            - name: gpt
              containerPort: 8888
              protocol: TCP
          {{- if .Values.h2ogpt.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.h2ogpt.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.h2ogpt.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.h2ogpt.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.h2ogpt.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-config
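          {{- /*
          h2ogpt reads the backend address from h2ogpt_inference_server: a plain
          http://host:port URL for TGI, or a "vllm:host:port" triple for vLLM.
          Exactly one of the two blocks below renders, thanks to the guard at
          the top of this file.
          */}}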
          {{- if .Values.tgi.enabled }}
          env:
            - name: h2ogpt_inference_server
              value: "http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}"
          {{- end }}
          {{- if .Values.vllm.enabled }}
          env:
            - name: h2ogpt_inference_server
              value: "vllm:{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}"
          {{- end }}
          volumeMounts:
            - name: h2ogpt-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: h2ogpt-volume
              mountPath: /workspace/save
              subPath: save
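      {{- /*
      A generic ephemeral volume backs both mounts above: the model cache and
      saved outputs share one PVC that lives and dies with the pod and is
      re-provisioned from the storage class on reschedule.
      */}}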
      volumes:
        - name: h2ogpt-volume
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.h2ogpt.storage.size | quote }}
                storageClassName: {{ .Values.h2ogpt.storage.class | quote }}
{{- end }}
---
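{{- /*
Optional TGI (Hugging Face text-generation-inference) backend. It listens on
container port 80; the tgi-check init container and the h2ogpt_inference_server
URL above both target its Service.
*/}}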
{{- if .Values.tgi.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-tgi-inference
  labels:
    app: {{ include "h2ogpt.fullname" . }}-tgi-inference
spec:
  {{- if not .Values.tgi.autoscaling.enabled }}
  replicas: {{ .Values.tgi.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-tgi-inference
  {{- if .Values.tgi.updateStrategy }}
  strategy: {{- toYaml .Values.tgi.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.tgi.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-tgi-inference
    spec:
      {{- with .Values.tgi.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tgi.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.tgi.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}-tgi-inference
                topologyKey: topology.kubernetes.io/zone
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-tgi-inference
          securityContext:
            {{- toYaml .Values.tgi.securityContext | nindent 12 }}
          image: "{{ .Values.tgi.image.repository }}:{{ .Values.tgi.image.tag }}"
          imagePullPolicy: {{ .Values.tgi.image.pullPolicy }}
          command: []
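          {{- /*
          All TGI launcher flags come verbatim from values. An illustrative
          values.yaml snippet (model name is an example, not a chart default):
            tgi:
              containerArgs:
                - "--model-id"
                - "h2oai/h2ogpt-4096-llama2-7b-chat"
          */}}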
          args:
            {{- range $arg := .Values.tgi.containerArgs }}
            - "{{ $arg }}"
            {{- end }}
          ports:
            - name: http
              containerPort: 80
              protocol: TCP
          {{- if .Values.tgi.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.tgi.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.tgi.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-tgi-inference-config
            - secretRef:
                name: {{ .Values.tgi.hfSecret }}
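          {{- /*
          One ephemeral volume serves the TGI cache (/app/cache), downloaded
          model weights (/data), and /dev/shm; mounting over /dev/shm replaces
          the container runtime's small default shared-memory segment, which
          TGI uses for inter-process communication.
          */}}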
          volumeMounts:
            - name: h2ogpt-tgi-inference-volume
              mountPath: /app/cache
              subPath: cache
            - name: h2ogpt-tgi-inference-volume
              mountPath: /data
              subPath: data
            - name: h2ogpt-tgi-inference-volume
              mountPath: /dev/shm
              subPath: shm
      volumes:
        - name: h2ogpt-tgi-inference-volume
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.tgi.storage.size | quote }}
                storageClassName: {{ .Values.tgi.storage.class | quote }}
{{- end }}
---
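{{- /*
Optional vLLM backend, serving an OpenAI-compatible API on container port 5000.
*/}}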
{{- if .Values.vllm.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-vllm-inference
  labels:
    app: {{ include "h2ogpt.fullname" . }}-vllm-inference
spec:
  {{- if not .Values.vllm.autoscaling.enabled }}
  replicas: {{ .Values.vllm.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-vllm-inference
  {{- if .Values.vllm.updateStrategy }}
  strategy: {{- toYaml .Values.vllm.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.vllm.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-vllm-inference
    spec:
      {{- with .Values.vllm.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.vllm.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.vllm.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}-vllm-inference
                topologyKey: topology.kubernetes.io/zone
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
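          {{- /*
          Launches vLLM's OpenAI-compatible API server from the Python
          environment baked into the image. Port, host, and download dir are
          fixed below; model selection comes from values, e.g. (illustrative
          model name, not a chart default):
            vllm:
              containerArgs:
                - "--model"
                - "h2oai/h2ogpt-4096-llama2-7b-chat"
          */}}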
command: ["/h2ogpt_conda/vllm_env/bin/python3.10"]
args:
- "-m"
- "vllm.entrypoints.openai.api_server"
- "--port"
- "5000"
- "--host"
- "0.0.0.0"
- "--download-dir"
- "/workspace/.cache/huggingface/hub"
{{- range $arg := .Values.vllm.containerArgs }}
- "{{ $arg }}"
{{- end }}
ports:
- name: http
containerPort: 5000
protocol: TCP
{{- if .Values.vllm.livenessProbe }}
livenessProbe:
httpGet:
path: /
scheme: HTTP
port: http
{{- toYaml .Values.vllm.livenessProbe | nindent 12 }}
{{- end }}
{{- if .Values.vllm.readinessProbe }}
readinessProbe:
httpGet:
path: /
scheme: HTTP
port: http
{{- toYaml .Values.vllm.readinessProbe | nindent 12 }}
{{- end }}
resources:
{{- toYaml .Values.vllm.resources | nindent 12 }}
envFrom:
- configMapRef:
name: {{ include "h2ogpt.fullname" . }}-vllm-inference-config
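          {{- /*
          NCCL_IGNORE_DISABLED_P2P=1 lets multi-GPU (tensor-parallel) runs
          proceed on nodes where GPU peer-to-peer access is disabled, instead
          of NCCL aborting at startup.
          */}}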
          env:
            - name: NCCL_IGNORE_DISABLED_P2P
              value: "1"
          volumeMounts:
            - name: h2ogpt-vllm-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: h2ogpt-vllm-inference-volume
              mountPath: /dev/shm
              subPath: shm
      volumes:
        - name: h2ogpt-vllm-inference-volume
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.vllm.storage.size | quote }}
                storageClassName: {{ .Values.vllm.storage.class | quote }}
{{- end }}