This repository has been archived by the owner on Mar 30, 2024. It is now read-only.

Add pythia matching, push base image to ghcr.io, remove `modelMountPath`/`cacheMountPath`, merge volumes, update README (#53)

Signed-off-by: Hung-Han (Henry) Chen <[email protected]>
chenhunghan authored Aug 13, 2023
1 parent afb5db7 commit 45524c2
Showing 33 changed files with 121 additions and 270 deletions.
46 changes: 46 additions & 0 deletions .github/workflows/cpu_image.yaml
@@ -0,0 +1,46 @@
name: Build and Push Image to GitHub Container Registry

on:
  push:
    branches:
      - main
    paths:
      - '**.py'
      - 'requirements.txt'
      - 'Dockerfile'
      - '.github/workflows/cpu_image.yaml'

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ialacol
jobs:
  image_to_gcr:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          file: ./Dockerfile
          push: true
          tags: |
            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:latest
          labels: ${{ steps.meta.outputs.labels }}
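Once this workflow runs on `main`, the image is published to GHCR under the repository owner; a quick pull to verify (owner `chenhunghan` assumed here, matching the README):

```sh
docker pull ghcr.io/chenhunghan/ialacol:latest
```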
57 changes: 18 additions & 39 deletions .github/workflows/smoke_test.yaml
@@ -56,20 +56,22 @@ jobs:
llama-smoke-test:
runs-on: ubuntu-latest
needs: build-image
steps:
- name: Create k8s Kind Cluster
uses: helm/[email protected]

- name: Set up Helm
uses: azure/setup-helm@v3
with:
version: v3.12.0


- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Install ialacol with LLaMa based model and wait for pods to be ready
run: |
helm repo add ialacol https://chenhunghan.github.io/ialacol
helm repo update
cat > values.yaml <<EOF
replicas: 1
deployment:
@@ -81,18 +83,11 @@ jobs:
TOP_K: 0
resources:
{}
cache:
persistence:
size: 0.5Gi
accessModes:
- ReadWriteOnce
cacheMountPath: /app/cache
model:
persistence:
size: 2Gi
accessModes:
- ReadWriteOnce
modelMountPath: /app/models
service:
type: ClusterIP
port: $LLAMA_SVC_PORT
@@ -101,7 +96,7 @@
tolerations: []
affinity: {}
EOF
helm install $LLAMA_HELM_RELEASE_NAME ialacol/ialacol -f values.yaml --namespace $HELM_NAMESPACE
helm install $LLAMA_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
sleep 40
@@ -151,23 +146,22 @@ jobs:
steps:
- name: Create k8s Kind Cluster
uses: helm/[email protected]

- name: Set up Helm
uses: azure/setup-helm@v3
with:
version: v3.12.0

- uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install OpenAI CLI
run: |
pip install --upgrade openai --quiet
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Install ialacol with gpt-neox based model and wait for pods to be ready
run: |
helm repo add ialacol https://chenhunghan.github.io/ialacol
helm repo update
cat > values.yaml <<EOF
replicas: 1
deployment:
@@ -180,18 +174,11 @@
REPETITION_PENALTY: 1.176
resources:
{}
cache:
persistence:
size: 0.5Gi
accessModes:
- ReadWriteOnce
cacheMountPath: /app/cache
model:
persistence:
size: 0.5Gi
accessModes:
- ReadWriteOnce
modelMountPath: /app/models
service:
type: ClusterIP
port: $GPT_NEOX_SVC_PORT
@@ -200,7 +187,7 @@
tolerations: []
affinity: {}
EOF
helm install $GPT_NEOX_HELM_RELEASE_NAME ialacol/ialacol -f values.yaml --namespace $HELM_NAMESPACE
helm install $GPT_NEOX_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
sleep 40
@@ -227,23 +214,22 @@
steps:
- name: Create k8s Kind Cluster
uses: helm/[email protected]

- name: Set up Helm
uses: azure/setup-helm@v3
with:
version: v3.12.0

- uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install OpenAI CLI
run: |
pip install --upgrade openai --quiet
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Install ialacol with starcoder based model and wait for pods to be ready
run: |
helm repo add ialacol https://chenhunghan.github.io/ialacol
helm repo update
cat > values.yaml <<EOF
replicas: 1
deployment:
@@ -256,18 +242,11 @@
REPETITION_PENALTY: 1.176
resources:
{}
cache:
persistence:
size: 0.5Gi
accessModes:
- ReadWriteOnce
cacheMountPath: /app/cache
model:
persistence:
size: 2Gi
accessModes:
- ReadWriteOnce
modelMountPath: /app/models
service:
type: ClusterIP
port: $STARCODER_SVC_PORT
@@ -276,7 +255,7 @@
tolerations: []
affinity: {}
EOF
helm install $STARCODER_HELM_RELEASE_NAME ialacol/ialacol -f values.yaml --namespace $HELM_NAMESPACE
helm install $STARCODER_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
echo "Wait for the pod to be ready"
sleep 20
41 changes: 40 additions & 1 deletion README.md
@@ -41,7 +41,11 @@ And all LLMs supported by [ctransformers](https://github.com/marella/ctransformers)

## Quick Start

To quickly get started with ialacol, follow the steps below:
### Kubernetes

`ialacol` offers first-class support for Kubernetes, which means you can automate and configure everything, compared to running it without.

To quickly get started with ialacol on Kubernetes, follow the steps below:

```sh
helm repo add ialacol https://chenhunghan.github.io/ialacol
@@ -72,6 +76,41 @@ Alternatively, using OpenAI's client library (see more examples in the `examples` folder)
openai -k "sk-fake" -b http://localhost:8000/v1 -vvvvv api chat_completions.create -m llama-2-7b-chat.ggmlv3.q4_0.bin -g user "Hello world!"
```
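For the clients above to reach the in-cluster `ClusterIP` service from localhost, a port-forward along these lines is assumed (the release/service name `llama-2-7b-chat` is illustrative):

```sh
kubectl port-forward svc/llama-2-7b-chat 8000:8000
```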

### Run in Container

#### Image from Github Registry

There is an [image](https://github.com/chenhunghan/ialacol/pkgs/container/ialacol) hosted on ghcr.io (alternatively, [CUDA11](https://github.com/chenhunghan/ialacol/pkgs/container/ialacol-cuda11), [CUDA12](https://github.com/chenhunghan/ialacol/pkgs/container/ialacol-cuda12), [METAL](https://github.com/chenhunghan/ialacol/pkgs/container/ialacol-metal) and [GPTQ](https://github.com/chenhunghan/ialacol/pkgs/container/ialacol-gptq) variants).

```sh
export DEFAULT_MODEL_HG_REPO_ID="TheBloke/Llama-2-7B-Chat-GGML"
export DEFAULT_MODEL_FILE="llama-2-7b-chat.ggmlv3.q4_0.bin"
# Metal: ghcr.io/chenhunghan/ialacol-metal:latest
# CUDA11: ghcr.io/chenhunghan/ialacol-cuda11:latest
# CUDA12: ghcr.io/chenhunghan/ialacol-cuda12:latest
export IMAGE="ghcr.io/chenhunghan/ialacol:latest"
docker run --rm -it -p 8000:8000 -e DEFAULT_MODEL_HG_REPO_ID=$DEFAULT_MODEL_HG_REPO_ID -e DEFAULT_MODEL_FILE=$DEFAULT_MODEL_FILE $IMAGE
```
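Once the container is up, the OpenAI-compatible endpoint can be exercised directly. A minimal smoke test with `curl` (the model name matches `DEFAULT_MODEL_FILE` above):

```sh
curl -X POST http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "llama-2-7b-chat.ggmlv3.q4_0.bin", "messages": [{"role": "user", "content": "Hello world!"}]}'
```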

#### From Source

For developers and contributors:

Build image

```sh
docker build --file ./Dockerfile -t ialacol .
```

Run container

```sh
export DEFAULT_MODEL_HG_REPO_ID="rustformers/pythia-ggml"
export DEFAULT_MODEL_FILE="pythia-70m-q4_0.bin"
docker run --rm -it -p 8000:8000 -e DEFAULT_MODEL_HG_REPO_ID=$DEFAULT_MODEL_HG_REPO_ID -e DEFAULT_MODEL_FILE=$DEFAULT_MODEL_FILE ialacol
```

## GPU Acceleration

To enable GPU/CUDA acceleration, you need to use a container image built for GPU and set the `GPU_LAYERS` environment variable. The right `GPU_LAYERS` value is determined by the size of your GPU memory. See the PR/discussion in [llama.cpp](https://github.com/ggerganov/llama.cpp/pull/1412) to find the best value.
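For example, a sketch of running the CUDA 12 variant with GPU access (the value `GPU_LAYERS=40` is illustrative; tune it to your GPU memory):

```sh
docker run --rm -it --gpus all -p 8000:8000 \
  -e DEFAULT_MODEL_HG_REPO_ID="TheBloke/Llama-2-7B-Chat-GGML" \
  -e DEFAULT_MODEL_FILE="llama-2-7b-chat.ggmlv3.q4_0.bin" \
  -e GPU_LAYERS=40 \
  ghcr.io/chenhunghan/ialacol-cuda12:latest
```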
4 changes: 2 additions & 2 deletions charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
apiVersion: v2
appVersion: 0.10.1
appVersion: 0.10.2
description: A Helm chart for ialacol
name: ialacol
type: application
version: 0.10.1
version: 0.10.2
11 changes: 1 addition & 10 deletions charts/ialacol/templates/deployment.yaml
@@ -33,10 +33,6 @@ spec:
value: {{ (.Values.deployment.env).DOWNLOAD_DEFAULT_MODEL | quote }}
- name: LOGGING_LEVEL
value: {{ (.Values.deployment.env).LOGGING_LEVEL | quote }}
- name: MODELS_FOLDER
value: {{ (.Values.deployment.env).MODELS_FOLDER | quote }}
- name: CACHE_FOLDER
value: {{ (.Values.deployment.env).CACHE_FOLDER | quote }}
- name: TOP_K
value: {{ (.Values.deployment.env).TOP_K | quote }}
- name: TOP_P
@@ -64,14 +60,9 @@ spec:
- name: MODE_TYPE
value: {{ (.Values.deployment.env).MODE_TYPE | quote }}
volumeMounts:
- mountPath: {{ .Values.cacheMountPath }}
name: cache
- mountPath: {{ .Values.modelMountPath }}
- mountPath: /app/models
name: model
volumes:
- name: cache
persistentVolumeClaim:
claimName: {{ .Release.Name }}-cache
- name: model
persistentVolumeClaim:
claimName: {{ .Release.Name }}-model
13 changes: 0 additions & 13 deletions charts/ialacol/templates/pvc-cache.yaml

This file was deleted.

18 changes: 1 addition & 17 deletions charts/ialacol/values.yaml
@@ -7,11 +7,6 @@ deployment:
# DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
# DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
# LOGGING_LEVEL: DEBUG
# MODELS_FOLDER: models
# CACHE_FOLDER: cache
# THREADS: 8
# BATCH_SIZE: 8
# CONTEXT_LENGTH: 1024
resources:
{}
# limits:
@@ -21,25 +16,14 @@
# cpu: 100m
# memory: 128Mi

cache:
persistence:
size: 10Gi
accessModes:
- ReadWriteOnce
# Optional. e.g. "gp2-unencrypted"
storageClassName: ~
# the path to mount the cache volume on the container
cacheMountPath: /app/cache

# The volume where we store the models and downloaded file cache
model:
persistence:
size: 24Gi
accessModes:
- ReadWriteOnce
# Optional. e.g. "gp2-unencrypted"
storageClassName: ~
# the path to mount the model volume on the container
modelMountPath: /app/models

service:
type: ClusterIP
9 changes: 0 additions & 9 deletions devspace.yaml
@@ -21,8 +21,6 @@ deployments:
deployment:
image: python:3.11-slim
command: ["sleep", "999999"]
# the path to mount the cache volume on the container
cachePath: /app/cache
tolerations:
- key: "ai"
operator: "Exists"
@@ -36,13 +34,6 @@
operator: In
values:
- "true"
# cache for artifacts downloaded like embedding models
cacheVolume:
pvc:
storageClassName: "gp2-unencrypted"
size: 15Gi
accessModes:
- ReadWriteOnce
dev:
ialacol:
labelSelector:
2 changes: 1 addition & 1 deletion examples/openai/simple.py
@@ -5,7 +5,7 @@

# create a chat completion
chat_completion = openai.ChatCompletion.create(
model="pythia-70m-q4_0.bin", # the model filename in the env.MODELS_FOLDER directory
model="pythia-70m-q4_0.bin",
messages=[{"role": "user", "content": "Hello world! I am using OpenAI's python client library!"}]
)

2 changes: 1 addition & 1 deletion examples/openai/stream.py
@@ -13,7 +13,7 @@
start_time = time.time()

response = openai.ChatCompletion.create(
model="pythia-70m-q4_0.bin", # the model filename in the env.MODELS_FOLDER directory
model="pythia-70m-q4_0.bin",
messages=[
{'role': 'user', 'content': 'Hello, I am a human.'},
],