This repository has been archived by the owner on Mar 30, 2024. It is now read-only.

Add pythia matching, push base image to ghcr.io, remove `modelMountPath`/`cacheMountPath`, merge volumes, update README (#53)

Signed-off-by: Hung-Han (Henry) Chen <[email protected]>
chenhunghan authored Aug 13, 2023
1 parent afb5db7 commit 45524c2
Showing 33 changed files with 121 additions and 270 deletions.
46 changes: 46 additions & 0 deletions .github/workflows/cpu_image.yaml
@@ -0,0 +1,46 @@
name: Build and Push Image to GitHub Container Registry

on:
  push:
    branches:
      - main
    paths:
      - '**.py'
      - 'requirements.txt'
      - 'Dockerfile'
      - '.github/workflows/cpu_image.yaml'

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ialacol
jobs:
  image_to_gcr:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          file: ./Dockerfile
          push: true
          tags: |
            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:latest
          labels: ${{ steps.meta.outputs.labels }}
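Once this workflow runs on `main`, the image is published to GHCR under the repository owner; a quick pull to verify (owner `chenhunghan` assumed here, matching the README):

```sh
docker pull ghcr.io/chenhunghan/ialacol:latest
```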
57 changes: 18 additions & 39 deletions .github/workflows/smoke_test.yaml
@@ -56,20 +56,22 @@ jobs:
llama-smoke-test:
runs-on: ubuntu-latest
needs: build-image
steps:
- name: Create k8s Kind Cluster
uses: helm/[email protected]

- name: Set up Helm
uses: azure/setup-helm@v3
with:
version: v3.12.0


- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Install ialacol with LLaMa based model and wait for pods to be ready
run: |
helm repo add ialacol https://chenhunghan.github.io/ialacol
helm repo update
cat > values.yaml <<EOF
replicas: 1
deployment:
@@ -81,18 +83,11 @@ jobs:
TOP_K: 0
resources:
{}
cache:
persistence:
size: 0.5Gi
accessModes:
- ReadWriteOnce
cacheMountPath: /app/cache
model:
persistence:
size: 2Gi
accessModes:
- ReadWriteOnce
modelMountPath: /app/models
service:
type: ClusterIP
port: $LLAMA_SVC_PORT
@@ -101,7 +96,7 @@
tolerations: []
affinity: {}
EOF
helm install $LLAMA_HELM_RELEASE_NAME ialacol/ialacol -f values.yaml --namespace $HELM_NAMESPACE
helm install $LLAMA_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
sleep 40
@@ -151,23 +146,22 @@ jobs:
steps:
- name: Create k8s Kind Cluster
uses: helm/[email protected]

- name: Set up Helm
uses: azure/setup-helm@v3
with:
version: v3.12.0

- uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install OpenAI CLI
run: |
pip install --upgrade openai --quiet
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Install ialacol with gpt-neox based model and wait for pods to be ready
run: |
helm repo add ialacol https://chenhunghan.github.io/ialacol
helm repo update
cat > values.yaml <<EOF
replicas: 1
deployment:
@@ -180,18 +174,11 @@
REPETITION_PENALTY: 1.176
resources:
{}
cache:
persistence:
size: 0.5Gi
accessModes:
- ReadWriteOnce
cacheMountPath: /app/cache
model:
persistence:
size: 0.5Gi
accessModes:
- ReadWriteOnce
modelMountPath: /app/models
service:
type: ClusterIP
port: $GPT_NEOX_SVC_PORT
@@ -200,7 +187,7 @@
tolerations: []
affinity: {}
EOF
helm install $GPT_NEOX_HELM_RELEASE_NAME ialacol/ialacol -f values.yaml --namespace $HELM_NAMESPACE
helm install $GPT_NEOX_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
sleep 40
@@ -227,23 +214,22 @@
steps:
- name: Create k8s Kind Cluster
uses: helm/[email protected]

- name: Set up Helm
uses: azure/setup-helm@v3
with:
version: v3.12.0

- uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install OpenAI CLI
run: |
pip install --upgrade openai --quiet
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Install ialacol with starcoder based model and wait for pods to be ready
run: |
helm repo add ialacol https://chenhunghan.github.io/ialacol
helm repo update
cat > values.yaml <<EOF
replicas: 1
deployment:
@@ -256,18 +242,11 @@
REPETITION_PENALTY: 1.176
resources:
{}
cache:
persistence:
size: 0.5Gi
accessModes:
- ReadWriteOnce
cacheMountPath: /app/cache
model:
persistence:
size: 2Gi
accessModes:
- ReadWriteOnce
modelMountPath: /app/models
service:
type: ClusterIP
port: $STARCODER_SVC_PORT
@@ -276,7 +255,7 @@
tolerations: []
affinity: {}
EOF
helm install $STARCODER_HELM_RELEASE_NAME ialacol/ialacol -f values.yaml --namespace $HELM_NAMESPACE
helm install $STARCODER_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
echo "Wait for the pod to be ready"
sleep 20
41 changes: 40 additions & 1 deletion README.md
@@ -41,7 +41,11 @@ And all LLMs supported by [ctransformers](https://github.com/marella/ctransformers)

## Quick Start

To quickly get started with ialacol, follow the steps below:
### Kubernetes

`ialacol` offers first-class support for Kubernetes, which means you can automate and configure everything, compared to running it without.

To quickly get started with ialacol on Kubernetes, follow the steps below:

```sh
helm repo add ialacol https://chenhunghan.github.io/ialacol
@@ -72,6 +76,41 @@ Alternatively, using OpenAI's client library (see more examples in the `examples` folder)
openai -k "sk-fake" -b http://localhost:8000/v1 -vvvvv api chat_completions.create -m llama-2-7b-chat.ggmlv3.q4_0.bin -g user "Hello world!"
```
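For the clients above to reach the in-cluster `ClusterIP` service from localhost, a port-forward along these lines is assumed (the release/service name `llama-2-7b-chat` is illustrative):

```sh
kubectl port-forward svc/llama-2-7b-chat 8000:8000
```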

### Run in Container

#### Image from Github Registry

There is an [image](https://github.com/chenhunghan/ialacol/pkgs/container/ialacol) hosted on ghcr.io (alternatively, [CUDA11](https://github.com/chenhunghan/ialacol/pkgs/container/ialacol-cuda11), [CUDA12](https://github.com/chenhunghan/ialacol/pkgs/container/ialacol-cuda12), [METAL](https://github.com/chenhunghan/ialacol/pkgs/container/ialacol-metal) and [GPTQ](https://github.com/chenhunghan/ialacol/pkgs/container/ialacol-gptq) variants).

```sh
export DEFAULT_MODEL_HG_REPO_ID="TheBloke/Llama-2-7B-Chat-GGML"
export DEFAULT_MODEL_FILE="llama-2-7b-chat.ggmlv3.q4_0.bin"
# Metal: ghcr.io/chenhunghan/ialacol-metal:latest
# CUDA11: ghcr.io/chenhunghan/ialacol-cuda11:latest
# CUDA12: ghcr.io/chenhunghan/ialacol-cuda12:latest
export IMAGE="ghcr.io/chenhunghan/ialacol:latest"
docker run --rm -it -p 8000:8000 -e DEFAULT_MODEL_HG_REPO_ID=$DEFAULT_MODEL_HG_REPO_ID -e DEFAULT_MODEL_FILE=$DEFAULT_MODEL_FILE $IMAGE
```
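Once the container is up, the OpenAI-compatible endpoint can be exercised directly. A minimal smoke test with `curl` (the model name matches `DEFAULT_MODEL_FILE` above):

```sh
curl -X POST http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "llama-2-7b-chat.ggmlv3.q4_0.bin", "messages": [{"role": "user", "content": "Hello world!"}]}'
```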

#### From Source

For developers and contributors:

Build image

```sh
docker build --file ./Dockerfile -t ialacol .
```

Run container

```sh
export DEFAULT_MODEL_HG_REPO_ID="rustformers/pythia-ggml"
export DEFAULT_MODEL_FILE="pythia-70m-q4_0.bin"
docker run --rm -it -p 8000:8000 -e DEFAULT_MODEL_HG_REPO_ID=$DEFAULT_MODEL_HG_REPO_ID -e DEFAULT_MODEL_FILE=$DEFAULT_MODEL_FILE ialacol
```

## GPU Acceleration

To enable GPU/CUDA acceleration, you need to use a container image built for GPU and set the `GPU_LAYERS` environment variable. The right `GPU_LAYERS` value is determined by the size of your GPU memory. See the PR/discussion in [llama.cpp](https://github.com/ggerganov/llama.cpp/pull/1412) to find the best value.
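For example, a sketch of running the CUDA 12 variant with GPU access (the value `GPU_LAYERS=40` is illustrative; tune it to your GPU memory):

```sh
docker run --rm -it --gpus all -p 8000:8000 \
  -e DEFAULT_MODEL_HG_REPO_ID="TheBloke/Llama-2-7B-Chat-GGML" \
  -e DEFAULT_MODEL_FILE="llama-2-7b-chat.ggmlv3.q4_0.bin" \
  -e GPU_LAYERS=40 \
  ghcr.io/chenhunghan/ialacol-cuda12:latest
```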
4 changes: 2 additions & 2 deletions charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
apiVersion: v2
appVersion: 0.10.1
appVersion: 0.10.2
description: A Helm chart for ialacol
name: ialacol
type: application
version: 0.10.1
version: 0.10.2
11 changes: 1 addition & 10 deletions charts/ialacol/templates/deployment.yaml
@@ -33,10 +33,6 @@ spec:
value: {{ (.Values.deployment.env).DOWNLOAD_DEFAULT_MODEL | quote }}
- name: LOGGING_LEVEL
value: {{ (.Values.deployment.env).LOGGING_LEVEL | quote }}
- name: MODELS_FOLDER
value: {{ (.Values.deployment.env).MODELS_FOLDER | quote }}
- name: CACHE_FOLDER
value: {{ (.Values.deployment.env).CACHE_FOLDER | quote }}
- name: TOP_K
value: {{ (.Values.deployment.env).TOP_K | quote }}
- name: TOP_P
@@ -64,14 +60,9 @@ spec:
- name: MODE_TYPE
value: {{ (.Values.deployment.env).MODE_TYPE | quote }}
volumeMounts:
- mountPath: {{ .Values.cacheMountPath }}
name: cache
- mountPath: {{ .Values.modelMountPath }}
- mountPath: /app/models
name: model
volumes:
- name: cache
persistentVolumeClaim:
claimName: {{ .Release.Name }}-cache
- name: model
persistentVolumeClaim:
claimName: {{ .Release.Name }}-model
13 changes: 0 additions & 13 deletions charts/ialacol/templates/pvc-cache.yaml

This file was deleted.

18 changes: 1 addition & 17 deletions charts/ialacol/values.yaml
@@ -7,11 +7,6 @@ deployment:
# DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
# DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
# LOGGING_LEVEL: DEBUG
# MODELS_FOLDER: models
# CACHE_FOLDER: cache
# THREADS: 8
# BATCH_SIZE: 8
# CONTEXT_LENGTH: 1024
resources:
{}
# limits:
@@ -21,25 +16,14 @@
# cpu: 100m
# memory: 128Mi

cache:
persistence:
size: 10Gi
accessModes:
- ReadWriteOnce
# Optional. e.g. "gp2-unencrypted"
storageClassName: ~
# the path to mount the cache volume on the container
cacheMountPath: /app/cache

# The volume where we store the models and downloaded file cache
model:
persistence:
size: 24Gi
accessModes:
- ReadWriteOnce
# Optional. e.g. "gp2-unencrypted"
storageClassName: ~
# the path to mount the model volume on the container
modelMountPath: /app/models

service:
type: ClusterIP
9 changes: 0 additions & 9 deletions devspace.yaml
@@ -21,8 +21,6 @@ deployments:
deployment:
image: python:3.11-slim
command: ["sleep", "999999"]
# the path to mount the cache volume on the container
cachePath: /app/cache
tolerations:
- key: "ai"
operator: "Exists"
@@ -36,13 +34,6 @@
operator: In
values:
- "true"
# cache for artifacts downloaded like embedding models
cacheVolume:
pvc:
storageClassName: "gp2-unencrypted"
size: 15Gi
accessModes:
- ReadWriteOnce
dev:
ialacol:
labelSelector:
2 changes: 1 addition & 1 deletion examples/openai/simple.py
@@ -5,7 +5,7 @@

# create a chat completion
chat_completion = openai.ChatCompletion.create(
model="pythia-70m-q4_0.bin", # the model filename in the env.MODELS_FOLDER directory
model="pythia-70m-q4_0.bin",
messages=[{"role": "user", "content": "Hello world! I am using OpenAI's python client library!"}]
)

2 changes: 1 addition & 1 deletion examples/openai/stream.py
@@ -13,7 +13,7 @@
start_time = time.time()

response = openai.ChatCompletion.create(
model="pythia-70m-q4_0.bin", # the model filename in the env.MODELS_FOLDER directory
model="pythia-70m-q4_0.bin",
messages=[
{'role': 'user', 'content': 'Hello, I am a human.'},
],