diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index e471c1b..0000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "image": "mcr.microsoft.com/devcontainers/universal:2", - "features": { - "ghcr.io/devcontainers/features/git:1": {}, - "ghcr.io/devcontainers/features/github-cli:1": {}, - "ghcr.io/devcontainers/features/python:1": { - "version": "3.11" - } - } -} diff --git a/.github/workflows/cuda_image.yaml b/.github/workflows/cuda_image.yaml deleted file mode 100644 index ce93656..0000000 --- a/.github/workflows/cuda_image.yaml +++ /dev/null @@ -1,83 +0,0 @@ -name: Build and Push CUDA Image to Github Container Registry - -on: - push: - branches: - - main - paths: - - '**.py' - - 'requirements.txt' - - 'Dockerfile.cuda11' - - 'Dockerfile.cuda12' - - '.github/workflows/cuda_image.yaml' - -env: - REGISTRY: ghcr.io - CUDA_11_IMAGE_NAME: ialacol-cuda11 - CUDA_12_IMAGE_NAME: ialacol-cuda12 -jobs: - cuda11_image_to_gcr: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.REGISTRY }}/${{ env.CUDA_11_IMAGE_NAME }} - - uses: docker/setup-qemu-action@v3 - - uses: docker/setup-buildx-action@v3 - - name: Build and push Docker image - uses: docker/build-push-action@v5 - with: - context: . - file: ./Dockerfile.cuda11 - push: true - tags: | - ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.CUDA_11_IMAGE_NAME }}:${{ github.sha }} - ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.CUDA_11_IMAGE_NAME }}:latest - labels: ${{ steps.meta.outputs.labels }} - cuda12_image_to_gcr: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.REGISTRY }}/${{ env.CUDA_12_IMAGE_NAME }} - - uses: docker/setup-qemu-action@v3 - - uses: docker/setup-buildx-action@v3 - - name: Build and push Docker image - uses: docker/build-push-action@v5 - with: - context: . 
- file: ./Dockerfile.cuda12 - platforms: linux/amd64, linux/arm64 - push: true - tags: | - ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.CUDA_12_IMAGE_NAME }}:${{ github.sha }} - ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.CUDA_12_IMAGE_NAME }}:latest - labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/gptq_image.yaml b/.github/workflows/gptq_image.yaml deleted file mode 100644 index f41069a..0000000 --- a/.github/workflows/gptq_image.yaml +++ /dev/null @@ -1,58 +0,0 @@ -name: Build and Push GPTQ Image to Github Container Registry - -on: - push: - branches: - - main - paths: - - '**.py' - - 'requirements.txt' - - 'Dockerfile.gptq' - - '.github/workflows/gptq_image.yaml' - -env: - REGISTRY: ghcr.io - GPTQ_IMAGE_NAME: ialacol-gptq -jobs: - gptq_image_to_gcr: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.REGISTRY }}/${{ env.GPTQ_IMAGE_NAME }} - - uses: docker/setup-qemu-action@v3 - - uses: docker/setup-buildx-action@v3 - # Workaround to provide additional free space. - # https://github.com/actions/virtual-environments/issues/2840 - # https://github.com/actions/runner-images/issues/2606#issuecomment-772683150 - - run: | - sudo rm -rf /usr/share/dotnet - sudo rm -rf /usr/local/lib/android - sudo rm -rf /opt/ghc - sudo rm -rf /usr/local/share/boost - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - name: Build and push Docker image - uses: docker/build-push-action@v5 - with: - context: . - file: ./Dockerfile.gptq - platforms: linux/amd64, linux/arm64 - push: true - tags: | - ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.GPTQ_IMAGE_NAME }}:${{ github.sha }} - ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.GPTQ_IMAGE_NAME }}:latest - labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/metal_image.yaml b/.github/workflows/metal_image.yaml deleted file mode 100644 index 7e8e959..0000000 --- a/.github/workflows/metal_image.yaml +++ /dev/null @@ -1,49 +0,0 @@ -name: Build and Push Metal Image to Github Container Registry - -on: - push: - branches: - - main - paths: - - '**.py' - - 'requirements.txt' - - 'Dockerfile.metal' - - '.github/workflows/metal_image.yaml' - -env: - REGISTRY: ghcr.io - METAL_IMAGE_NAME: ialacol-metal -jobs: - metal_image_to_gcr: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.REGISTRY }}/${{ env.METAL_IMAGE_NAME }} - - uses: docker/setup-qemu-action@v3 - - uses: docker/setup-buildx-action@v3 - - name: Build and push Docker image - uses: docker/build-push-action@v5 - with: - context: . 
- file: ./Dockerfile.metal - platforms: linux/amd64, linux/arm64 - push: true - tags: | - ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.METAL_IMAGE_NAME }}:${{ github.sha }} - ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.METAL_IMAGE_NAME }}:latest - labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/smoke_test.yaml b/.github/workflows/smoke_test.yaml deleted file mode 100644 index 076374a..0000000 --- a/.github/workflows/smoke_test.yaml +++ /dev/null @@ -1,379 +0,0 @@ -name: Smoke Test - -on: pull_request - -env: - REGISTRY: quay.io - REPO_ORG_NAME: ialacol - IMAGE_NAME: ialacol-smoke-test - GPTQ_IMAGE_TAG: gptq - HELM_NAMESPACE: default - LOGGING_LEVEL: DEBUG - # for testing llama base models - LLAMA_HELM_RELEASE_NAME: orca-mini-3b - LLAMA_MODEL_HG_REPO_ID: TheBloke/orca_mini_3B-GGML - LLAMA_MODEL_FILE: orca-mini-3b.ggmlv3.q4_0.bin - LLAMA_SVC_PORT: 8000 - # for testing gpt-neox base models - GPT_NEOX_HELM_RELEASE_NAME: stablecode-instruct-alpha-3b - GPT_NEOX_MODEL_HG_REPO_ID: TheBloke/stablecode-instruct-alpha-3b-GGML - GPT_NEOX_MODEL_FILE: stablecode-instruct-alpha-3b.ggmlv1.q4_0.bin - GPT_NEOX_SVC_PORT: 8001 - # for testing starcoder base models - STARCODER_HELM_RELEASE_NAME: tiny-starcoder-py - STARCODER_MODEL_HG_REPO_ID: mike-ravkine/tiny_starcoder_py-GGML - STARCODER_MODEL_FILE: tiny_starcoder_py-q8_0.bin - STARCODER_SVC_PORT: 8002 - # for testing gptq models - GPTQ_HELM_RELEASE_NAME: stablecode-instruct-alpha-3b-gptq - GPTQ_MODEL_HG_REPO_ID: TheBloke/stablecode-instruct-alpha-3b-GPTQ - GPTQ_MODEL_HG_REVISION: gptq-4bit-32g-actorder_True - GPTQ_MODEL_FILE: model.safetensors - GPTQ_SVC_PORT: 8003 - -jobs: - build-image: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Login to Registry - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ secrets.QUAY_ROBOT_USERNAME }} - password: ${{ secrets.QUAY_ROBOT_PASSWORD }} - - - name: Build and push Docker image - uses: docker/build-push-action@v4 - with: - context: . - file: ./Dockerfile - platforms: linux/amd64, linux/arm64 - push: true - tags: | - ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }} - build-gptq-image: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Login to Registry - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ secrets.QUAY_ROBOT_USERNAME }} - password: ${{ secrets.QUAY_ROBOT_PASSWORD }} - - - name: Build and push Docker image - uses: docker/build-push-action@v4 - with: - context: . 
- file: ./Dockerfile.gptq - push: true - tags: | - ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }} - llama-smoke-test: - runs-on: ubuntu-latest - needs: build-image - steps: - - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.7.0 - - - name: Set up Helm - uses: azure/setup-helm@v3 - with: - version: v3.12.0 - - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Install ialacol with LLaMa based model and wait for pods to be ready - run: | - cat > values.yaml <> $GITHUB_ENV - - if: always() - run: | - kubectl logs --tail=200 --selector app.kubernetes.io/name=$LLAMA_HELM_RELEASE_NAME -n $HELM_NAMESPACE - gpt-neox-smoke-test: - runs-on: ubuntu-latest - needs: build-image - steps: - - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.7.0 - - name: Set up Helm - uses: azure/setup-helm@v3 - with: - version: v3.12.0 - - uses: actions/setup-python@v4 - with: - python-version: 3.11 - - name: Install OpenAI CLI - run: | - pip install --upgrade openai --quiet - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Install ialacol with gpt-neox based model and wait for pods to be ready - run: | - cat > values.yaml < values.yaml < values.yaml <?$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=100 - -# Maximum number of lines in a module. -max-module-lines=1000 - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - - -[IMPORTS] - -# List of modules that can be imported at any level, not just the top level -# one. -allow-any-import-level= - -# Allow explicit reexports by alias from a package __init__. -allow-reexport-from-package=no - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Deprecated modules which should not be used, separated by a comma. -deprecated-modules= - -# Output a graph (.gv or any supported image format) of external dependencies -# to the given file (report RP0402 must not be disabled). -ext-import-graph= - -# Output a graph (.gv or any supported image format) of all (i.e. internal and -# external) dependencies to the given file (report RP0402 must not be -# disabled). -import-graph= - -# Output a graph (.gv or any supported image format) of internal dependencies -# to the given file (report RP0402 must not be disabled). -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - -# Couples of modules and preferred modules, separated by a comma. -preferred-modules= - - -[LOGGING] - -# The type of string formatting that logging methods do. `old` means using % -# formatting, `new` is for `{}` formatting. -logging-format-style=old - -# Logging modules to check that the string format arguments are in logging -# function parameter format. -logging-modules=logging - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. 
Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, -# UNDEFINED. -confidence=HIGH, - CONTROL_FLOW, - INFERENCE, - INFERENCE_FAILURE, - UNDEFINED - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then re-enable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". -disable=raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - use-symbolic-message-instead - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member - - -[METHOD_ARGS] - -# List of qualified names (i.e., library.method) which require a timeout -# parameter e.g. 'requests.api.get,requests.api.post' -timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME, - XXX, - TODO - -# Regular expression of note tags to take in consideration. -notes-rgx= - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit,argparse.parse_error - - -[REPORTS] - -# Python expression which should return a score less than or equal to 10. You -# have access to the variables 'fatal', 'error', 'warning', 'refactor', -# 'convention', and 'info' which contain the number of messages in each -# category, as well as 'statement' which is the total number of statements -# analyzed. This score is used by the global evaluation report (RP0004). -evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details. -msg-template= - -# Set the output format. Available formats are text, parseable, colorized, json -# and msvs (visual studio). You can also give a reporter class, e.g. -# mypackage.mymodule.MyReporterClass. -#output-format= - -# Tells whether to display a full report or only the messages. -reports=no - -# Activate the evaluation score. 
-score=yes - - -[SIMILARITIES] - -# Comments are removed from the similarity computation -ignore-comments=yes - -# Docstrings are removed from the similarity computation -ignore-docstrings=yes - -# Imports are removed from the similarity computation -ignore-imports=yes - -# Signatures are removed from the similarity computation -ignore-signatures=yes - -# Minimum lines number of a similarity. -min-similarity-lines=4 - - -[SPELLING] - -# Limits count of emitted suggestions for spelling mistakes. -max-spelling-suggestions=4 - -# Spelling dictionary name. No available dictionaries : You need to install -# both the python package and the system dependency for enchant to work.. -spelling-dict= - -# List of comma separated words that should be considered directives if they -# appear at the beginning of a comment and should not be checked. -spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains the private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to the private dictionary (see the -# --spelling-private-dict-file option) instead of raising a message. -spelling-store-unknown-words=no - - -[STRING] - -# This flag controls whether inconsistent-quotes generates a warning when the -# character used as a quote delimiter is used inconsistently within a module. -check-quote-consistency=no - -# This flag controls whether the implicit-str-concat should generate a warning -# on implicit string concatenation in sequences defined over several lines. -check-str-concat-over-line-jumps=no - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. -ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of symbolic message names to ignore for Mixin members. -ignored-checks-for-mixins=no-member, - not-async-context-manager, - not-context-manager, - attribute-defined-outside-init - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. 
-missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - -# Regex pattern to define which classes are considered mixins. -mixin-class-rgx=.*[Mm]ixin - -# List of decorators that change the signature of a decorated function. -signature-mutators= - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid defining new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of names allowed to shadow builtins -allowed-redefined-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_, - _cb - -# A regular expression matching the name of dummy variables (i.e. expected to -# not be used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io diff --git a/.vscode/settings.json b/.vscode/settings.json index 7ddbe37..0967ef4 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,14 +1 @@ -{ - "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter" - }, - "python.formatting.provider": "none", - "python.linting.pylintEnabled": true, - "python.linting.enabled": true, - "python.linting.pylintArgs": [ - "--rcfile", - ".pylintrc" - ], - "python.linting.maxNumberOfProblems": 50, - "python.linting.lintOnSave": true -} +{} diff --git a/Dockerfile.cuda11 b/Dockerfile.cuda11 deleted file mode 100644 index 62612b2..0000000 --- a/Dockerfile.cuda11 +++ /dev/null @@ -1,15 +0,0 @@ -# syntax=docker/dockerfile:1 - -# 11.7.1 https://github.com/ggerganov/llama.cpp/blob/master/.devops/main-cuda.Dockerfile -FROM nvidia/cuda:11.7.1-base-ubuntu22.04 -RUN apt-get update && apt-get install -y -q python3 python3-pip curl -WORKDIR /app -RUN curl https://sh.rustup.rs -sSf | sh -s -- -y -ENV PATH="/root/.cargo/bin:${PATH}" -COPY requirements.txt requirements.txt -RUN pip3 install -r requirements.txt -# https://github.com/marella/ctransformers#cuda -RUN pip3 install ctransformers[cuda] -COPY . . -EXPOSE 8000 -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Dockerfile.cuda12 b/Dockerfile.cuda12 deleted file mode 100644 index 6a85ec8..0000000 --- a/Dockerfile.cuda12 +++ /dev/null @@ -1,19 +0,0 @@ -# syntax=docker/dockerfile:1 - -FROM nvidia/cuda:12.2.0-base-ubuntu22.04 -RUN apt-get update && apt-get install -y -q python3 python3-pip curl -WORKDIR /app -RUN curl https://sh.rustup.rs -sSf | sh -s -- -y -ENV PATH="/root/.cargo/bin:${PATH}" -COPY requirements.txt requirements.txt -# Fixes No such file or directory: 'maturin' -RUN pip3 install maturin -RUN pip3 install -r requirements.txt -# Fixes The package you are trying to install is only a placeholder project on PyPI.org repository. -# This package is hosted on NVIDIA Python Package Index. 
-RUN pip3 install --extra-index-url=https://pypi.ngc.nvidia.com --trusted-host pypi.ngc.nvidia.com nvidia-cublas-cu12 -# https://github.com/marella/ctransformers#cuda -RUN pip3 install ctransformers[cuda] -COPY . . -EXPOSE 8000 -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Dockerfile.gptq b/Dockerfile.gptq deleted file mode 100644 index a623f96..0000000 --- a/Dockerfile.gptq +++ /dev/null @@ -1,20 +0,0 @@ -# syntax=docker/dockerfile:1 - -FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 -RUN apt-get update \ - && apt-get install -y --no-install-recommends g++ python3-dev python3-pip curl \ - && rm -rf /var/lib/apt/lists/* -WORKDIR /app -RUN curl https://sh.rustup.rs -sSf | sh -s -- -y -ENV PATH="/root/.cargo/bin:${PATH}" -COPY requirements.txt requirements.txt -# Fixes No such file or directory: 'maturin' -RUN pip3 install maturin -RUN pip3 install -r requirements.txt -# Fixes exllama/cuda_ext.py:82: UserWarning: Failed to initialize NumPy: No module named 'numpy' -RUN pip3 install numpy -# https://github.com/marella/ctransformers#gptq -RUN pip3 install ctransformers[gptq] -COPY . . -EXPOSE 8000 -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Dockerfile.metal b/Dockerfile.metal deleted file mode 100644 index f944e86..0000000 --- a/Dockerfile.metal +++ /dev/null @@ -1,14 +0,0 @@ -# syntax=docker/dockerfile:1 - -FROM python:3.11-slim -WORKDIR /app -RUN apt-get update && apt-get install -y -q curl build-essential -COPY requirements.txt requirements.txt -RUN curl https://sh.rustup.rs -sSf | sh -s -- -y -ENV PATH="/root/.cargo/bin:${PATH}" -RUN pip3 install -r requirements.txt -# https://github.com/marella/ctransformers#metal -RUN CT_METAL=1 pip3 install ctransformers --no-binary ctransformers -COPY . . 
-EXPOSE 8000 -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/charts/ialacol/values.yaml b/charts/ialacol/values.yaml index 2a06c51..0d02b5d 100644 --- a/charts/ialacol/values.yaml +++ b/charts/ialacol/values.yaml @@ -2,7 +2,6 @@ replicas: 1 deployment: image: ghcr.io/chenhunghan/ialacol:latest - # or use CUDA image `ghcr.io/chenhunghan/ialacol-cuda12:latest` # env: # DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML # DEFAULT_MODEL_HG_REPO_REVISION: main diff --git a/const.py b/const.py deleted file mode 100644 index 13b561f..0000000 --- a/const.py +++ /dev/null @@ -1,3 +0,0 @@ -DEFAULT_MAX_TOKENS = "512" -DEFAULT_CONTEXT_LENGTH = "4096" -DEFAULT_LOG_LEVEL = "INFO" \ No newline at end of file diff --git a/examples/openai/README.md b/examples/openai/README.md deleted file mode 100644 index d630587..0000000 --- a/examples/openai/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Examples for using openai python library - -```sh -python3 -m venv .venv -source .venv/bin/activate -python3 -m pip install -r requirements.txt - -python3 simple.py -python3 stream.py -``` diff --git a/examples/openai/requirements.txt b/examples/openai/requirements.txt deleted file mode 100644 index c10d277..0000000 --- a/examples/openai/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -aiohttp==3.9.0 -aiosignal==1.3.1 -async-timeout==4.0.2 -attrs==23.1.0 -certifi==2023.7.22 -charset-normalizer==3.1.0 -frozenlist==1.3.3 -idna==3.4 -multidict==6.0.4 -openai==0.27.7 -requests==2.31.0 -tqdm==4.65.0 -urllib3==2.0.7 -yarl==1.9.2 diff --git a/examples/openai/simple.py b/examples/openai/simple.py deleted file mode 100644 index 472054a..0000000 --- a/examples/openai/simple.py +++ /dev/null @@ -1,13 +0,0 @@ -import openai - -openai.api_key = "placeholder_to_avoid_exception" # needed to avoid an exception -openai.api_base = "http://localhost:8000/v1" # this is the public address of the ialacol server - -# create a chat completion -chat_completion = openai.ChatCompletion.create( - model="pythia-70m-q4_0.bin", - messages=[{"role": "user", "content": "Hello world! 
I am using OpenAI's python client library!"}] -) - -# print the chat completion -print(chat_completion.choices[0].message.content) diff --git a/examples/openai/stream.py b/examples/openai/stream.py deleted file mode 100644 index d7cc9b7..0000000 --- a/examples/openai/stream.py +++ /dev/null @@ -1,37 +0,0 @@ -import openai -import time - -openai.api_key = "placeholder_to_avoid_exception" # needed to avoid an exception -openai.api_base = "http://localhost:8000/v1" # this is the public address of the ialacol server - -# Rest are from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_stream_completions.ipynb -# -# Example of an OpenAI ChatCompletion request with stream=True -# https://platform.openai.com/docs/guides/chat - -# record the time before the request is sent -start_time = time.time() - -response = openai.ChatCompletion.create( - model="pythia-70m-q4_0.bin", - messages=[ - {'role': 'user', 'content': 'Hello, I am a human.'}, - ], - stream=True # we set stream=True -) - -# create variables to collect the stream of chunks -collected_chunks = [] -collected_messages = [] -# iterate through the stream of events -for chunk in response: - chunk_time = time.time() - start_time # calculate the time delay of the chunk - collected_chunks.append(chunk) # save the event response - chunk_message = chunk['choices'][0]['delta'] # extract the message - collected_messages.append(chunk_message) # save the message - print(f"Message received {chunk_time:.2f} seconds after request: {chunk_message}") # print the delay and text - -# print the time delay and text received -print(f"Full response received {chunk_time:.2f} seconds after request") -full_reply_content = ''.join([m.get('content', '') for m in collected_messages]) -print(f"Full conversation received: {full_reply_content}") diff --git a/get_config.py b/get_config.py deleted file mode 100644 index 02c1cc3..0000000 --- a/get_config.py +++ /dev/null @@ -1,80 +0,0 @@ -from ctransformers import Config - -from request_body import ChatCompletionRequestBody, CompletionRequestBody -from get_env import get_env, get_env_or_none -from get_default_thread import get_default_thread -from log import log -from const import DEFAULT_MAX_TOKENS, DEFAULT_CONTEXT_LENGTH - -THREADS = int(get_env("THREADS", str(get_default_thread()))) - - -def get_config( - body: CompletionRequestBody | ChatCompletionRequestBody, -) -> Config: - # ggml only, follow ctransformers defaults - TOP_K = int(get_env("TOP_K", "40")) - # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-top_p - TOP_P = float(get_env("TOP_P", "1.0")) - # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-temperature - TEMPERATURE = float(get_env("TEMPERATURE", "1")) - # ggml only, follow ctransformers defaults - REPETITION_PENALTY = float(get_env("REPETITION_PENALTY", "1.1")) - # ggml only, follow ctransformers defaults - LAST_N_TOKENS = int(get_env("LAST_N_TOKENS", "64")) - # ggml only, follow ctransformers defaults - SEED = int(get_env("SEED", "-1")) - # ggml only, follow ctransformers defaults - BATCH_SIZE = int(get_env("BATCH_SIZE", "8")) - # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-max_tokens - MAX_TOKENS = int(get_env("MAX_TOKENS", DEFAULT_MAX_TOKENS)) - CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH)) - if MAX_TOKENS > CONTEXT_LENGTH: - log.warning( - "MAX_TOKENS is greater than CONTEXT_LENGTH, setting MAX_TOKENS < CONTEXT_LENGTH" - ) - # 
OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-stop - STOP = get_env_or_none("STOP") - - log.debug("TOP_K: %s", TOP_K) - log.debug("TOP_P: %s", TOP_P) - log.debug("TEMPERATURE: %s", TEMPERATURE) - log.debug("REPETITION_PENALTY: %s", REPETITION_PENALTY) - log.debug("LAST_N_TOKENS: %s", LAST_N_TOKENS) - log.debug("SEED: %s", SEED) - log.debug("BATCH_SIZE: %s", BATCH_SIZE) - log.debug("THREADS: %s", THREADS) - log.debug("MAX_TOKENS: %s", MAX_TOKENS) - log.debug("STOP: %s", STOP) - - top_k = body.top_k if body.top_k else TOP_K - top_p = body.top_p if body.top_p else TOP_P - temperature = body.temperature if body.temperature else TEMPERATURE - repetition_penalty = ( - body.frequency_penalty - if body.frequency_penalty - else ( - body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY - ) - ) - last_n_tokens = body.last_n_tokens if body.last_n_tokens else LAST_N_TOKENS - seed = body.seed if body.seed else SEED - batch_size = body.batch_size if body.batch_size else BATCH_SIZE - threads = body.threads if body.threads else THREADS - max_new_tokens = body.max_tokens if body.max_tokens else MAX_TOKENS - stop = body.stop if body.stop else STOP - - config = Config( - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - last_n_tokens=last_n_tokens, - seed=seed, - batch_size=batch_size, - threads=threads, - max_new_tokens=max_new_tokens, - stop=stop, - ) - - return config diff --git a/get_default_thread.py b/get_default_thread.py deleted file mode 100644 index ce93ca0..0000000 --- a/get_default_thread.py +++ /dev/null @@ -1,12 +0,0 @@ -import os - - -def get_default_thread() -> int: - """_summary_ - Automatically get the default number of threads to use for generation - """ - count = os.cpu_count() - if count is not None: - return int(count / 2) - else: - return 8 diff --git a/get_env.py b/get_env.py deleted file mode 100644 index 31a725c..0000000 --- a/get_env.py +++ /dev/null @@ -1,22 +0,0 @@ -import os - - -def get_env(key: str, default_value: str): - """_summary_ - Fallback to default of the env get by key is not set or is empty string - """ - env = os.environ.get(key) - if env is None or len(env) == 0: - return default_value - else: - return env - -def get_env_or_none(key: str): - """_summary_ - Fallback to None of the env get by key is not set or is empty string - """ - env = os.environ.get(key) - if env is None or len(env) == 0: - return None - else: - return env diff --git a/get_model_type.py b/get_model_type.py deleted file mode 100644 index f064e45..0000000 --- a/get_model_type.py +++ /dev/null @@ -1,50 +0,0 @@ -from get_env import get_env - - -def get_model_type( - filename: str, -) -> str: - ctransformer_model_type = "llama" - filename = filename.lower() - # These are also in "starcoder" format - # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML - # https://huggingface.co/TheBloke/minotaur-15B-GGML - if ( - "star" in filename - or "starchat" in filename - or "WizardCoder" in filename - or "minotaur-15" in filename - ): - ctransformer_model_type = "gpt_bigcode" - if "llama" in filename: - ctransformer_model_type = "llama" - if "mpt" in filename: - ctransformer_model_type = "mpt" - if "replit" in filename: - ctransformer_model_type = "replit" - if "falcon" in filename: - ctransformer_model_type = "falcon" - if "dolly" in filename: - ctransformer_model_type = "dolly-v2" - if "stablelm" in filename: - ctransformer_model_type = "gpt_neox" - # matching 
https://huggingface.co/stabilityai/stablecode-completion-alpha-3b - if "stablecode" in filename: - ctransformer_model_type = "gpt_neox" - # matching https://huggingface.co/EleutherAI/pythia-70m - if "pythia" in filename: - ctransformer_model_type = "gpt_neox" - # codegen family are in gptj, codegen2 isn't but not supported by ggml/ctransformer yet - # https://huggingface.co/Salesforce/codegen-2B-multi - # https://huggingface.co/ravenscroftj/CodeGen-2B-multi-ggml-quant - if "codegen" in filename: - ctransformer_model_type = "gptj" - - DEFAULT_MODEL_HG_REPO_ID = get_env("DEFAULT_MODEL_HG_REPO_ID", "") - if "gptq" in str(DEFAULT_MODEL_HG_REPO_ID).lower() or "gptq" in filename: - ctransformer_model_type = "gptq" - - MODE_TYPE = get_env("MODE_TYPE", "") - if len(MODE_TYPE) > 0: - ctransformer_model_type = MODE_TYPE - return ctransformer_model_type diff --git a/log.py b/log.py deleted file mode 100644 index f16ad6a..0000000 --- a/log.py +++ /dev/null @@ -1,12 +0,0 @@ -import logging - -from get_env import get_env -from const import DEFAULT_LOG_LEVEL - -LOGGING_LEVEL = get_env("LOGGING_LEVEL", DEFAULT_LOG_LEVEL) - -log = logging.getLogger("uvicorn") -try: - log.setLevel(LOGGING_LEVEL) -except ValueError: - log.setLevel(DEFAULT_LOG_LEVEL) diff --git a/main.py b/main.py deleted file mode 100644 index d69b3ec..0000000 --- a/main.py +++ /dev/null @@ -1,444 +0,0 @@ -"""_summary_ - -This module contains the main FastAPI application. -""" -import os - -from typing import ( - Awaitable, - Callable, - Union, - Annotated, -) -from fastapi import FastAPI, Depends, HTTPException, Body, Request, status -from fastapi.exceptions import RequestValidationError -from fastapi.responses import JSONResponse -from fastapi.responses import StreamingResponse -from ctransformers import LLM, AutoModelForCausalLM, Config -from huggingface_hub import hf_hub_download, snapshot_download -from get_config import get_config -from get_model_type import get_model_type - -from request_body import ChatCompletionRequestBody, CompletionRequestBody -from response_body import ChatCompletionResponseBody, CompletionResponseBody -from streamers import chat_completions_streamer, completions_streamer -from model_generate import chat_model_generate, model_generate -from get_env import get_env -from log import log -from truncate import truncate -from const import DEFAULT_CONTEXT_LENGTH - -DEFAULT_MODEL_HG_REPO_ID = get_env( - "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML" -) -DEFAULT_MODEL_HG_REPO_REVISION = get_env("DEFAULT_MODEL_HG_REPO_REVISION", "main") -DEFAULT_MODEL_FILE = get_env("DEFAULT_MODEL_FILE", "llama-2-7b-chat.ggmlv3.q4_0.bin") - -log.info("DEFAULT_MODEL_HG_REPO_ID: %s", DEFAULT_MODEL_HG_REPO_ID) -log.info("DEFAULT_MODEL_HG_REPO_REVISION: %s", DEFAULT_MODEL_HG_REPO_REVISION) -log.info("DEFAULT_MODEL_FILE: %s", DEFAULT_MODEL_FILE) - -DOWNLOADING_MODEL = False -LOADING_MODEL = False - - -def set_downloading_model(boolean: bool): - """_summary_ - - Args: - boolean (bool): the boolean value to set DOWNLOADING_MODEL to - """ - globals()["DOWNLOADING_MODEL"] = boolean - log.debug("DOWNLOADING_MODEL set to %s", globals()["DOWNLOADING_MODEL"]) - - -def set_loading_model(boolean: bool): - """_summary_ - - Args: - boolean (bool): the boolean value to set LOADING_MODEL to - """ - globals()["LOADING_MODEL"] = boolean - log.debug("LOADING_MODEL set to %s", globals()["LOADING_MODEL"]) - - -Sender = Callable[[Union[str, bytes]], Awaitable[None]] -Generate = Callable[[Sender], Awaitable[None]] - - -app = FastAPI() - - -# 
https://github.com/tiangolo/fastapi/issues/3361 -@app.exception_handler(RequestValidationError) -async def validation_exception_handler(request: Request, exc: RequestValidationError): - exc_str = f"{exc}".replace("\n", " ").replace(" ", " ") - log.error("%s: %s", request, exc_str) - content = {"status_code": 10422, "message": exc_str, "data": None} - return JSONResponse( - content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY - ) - - -@app.on_event("startup") -async def startup_event(): - """_summary_ - Starts up the server, setting log level, downloading the default model if necessary. - """ - log.info("Starting up...") - model_type = get_model_type(DEFAULT_MODEL_FILE) - if DEFAULT_MODEL_HG_REPO_ID: - set_downloading_model(True) - - try: - if model_type == "gptq": - log.info( - "Downloading repo %s to %s/models", - DEFAULT_MODEL_HG_REPO_ID, - os.getcwd(), - ) - snapshot_download( - repo_id=DEFAULT_MODEL_HG_REPO_ID, - revision=DEFAULT_MODEL_HG_REPO_REVISION, - cache_dir="models/.cache", - local_dir="models", - resume_download=True, - ) - elif DEFAULT_MODEL_FILE: - log.info( - "Downloading model... %s/%s to %s/models", - DEFAULT_MODEL_HG_REPO_ID, - DEFAULT_MODEL_FILE, - os.getcwd(), - ) - hf_hub_download( - repo_id=DEFAULT_MODEL_HG_REPO_ID, - revision=DEFAULT_MODEL_HG_REPO_REVISION, - cache_dir="models/.cache", - local_dir="models", - filename=DEFAULT_MODEL_FILE, - resume_download=True, - ) - except Exception as exception: - log.error("Error downloading model: %s", exception) - finally: - set_downloading_model(False) - - # ggml only, follow ctransformers defaults - CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH)) - # the layers to offloading to the GPU - GPU_LAYERS = int(get_env("GPU_LAYERS", "0")) - - log.debug("CONTEXT_LENGTH: %s", CONTEXT_LENGTH) - log.debug("GPU_LAYERS: %s", GPU_LAYERS) - - config = Config( - context_length=CONTEXT_LENGTH, - gpu_layers=GPU_LAYERS, - ) - - log.info( - "Creating llm singleton with model_type: %s", - model_type, - ) - set_loading_model(True) - if model_type == "gptq": - log.debug("Creating llm/gptq instance...") - llm = AutoModelForCausalLM.from_pretrained( - model_path_or_repo_id=f"{os.getcwd()}/models", - model_type="gptq", - local_files_only=True, - ) - app.state.llm = llm - else: - log.debug("Creating llm/ggml instance...") - llm = LLM( - model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}", - config=config, - model_type=model_type, - ) - app.state.llm = llm - log.info("llm singleton created.") - set_loading_model(False) - - -@app.get("/v1/models") -async def models(): - """_summary_ - - Returns: - _type_: a list of models - """ - if DOWNLOADING_MODEL is True: - raise HTTPException(status_code=503, detail="Downloading model") - if LOADING_MODEL is True: - raise HTTPException(status_code=503, detail="Loading model in memory") - return { - "data": [ - { - "id": DEFAULT_MODEL_FILE, - "object": "model", - "owned_by": "community", - "permission": [], - } - ], - "object": "list", - } - - -@app.post("/v1/completions", response_model=CompletionResponseBody) -async def completions( - body: Annotated[CompletionRequestBody, Body()], - config: Annotated[Config, Depends(get_config)], - request: Request, -): - """_summary_ - Compatible with https://platform.openai.com/docs/api-reference/completions - Args: - body (CompletionRequestBody): parsed request body - - Returns: - StreamingResponse: streaming response - """ - if DOWNLOADING_MODEL is True: - raise HTTPException(status_code=503, detail="Downloading model") - 
log.debug("Body:%s", str(body)) - if ( - (body.n is not None) - or (body.logit_bias is not None) - or (body.user is not None) - or (body.presence_penalty is not None) - or (body.frequency_penalty is not None) - ): - log.warning( - "n, logit_bias, user, presence_penalty and frequency_penalty are not supporte." - ) - prompt = body.prompt - - model_name = body.model - llm = request.app.state.llm - if body.stream is True: - log.debug("Streaming response from %s", model_name) - return StreamingResponse( - completions_streamer(prompt, model_name, llm, config), - media_type="text/event-stream", - ) - return model_generate(prompt, model_name, llm, config) - - -@app.post("/v1/engines/{engine}/completions") -async def engine_completions( - # Can't use body as FastAPI require corrent context-type header - # But copilot client maybe not send such header - request: Request, - # copilot client ONLY request param - engine: str, -): - """_summary_ - Similar to https://platform.openai.com/docs/api-reference/completions - but with engine param and with /v1/engines - Args: - body (CompletionRequestBody): parsed request body - Returns: - StreamingResponse: streaming response - """ - if DOWNLOADING_MODEL is True: - raise HTTPException(status_code=503, detail="Downloading model") - json = await request.json() - log.debug("Body:%s", str(json)) - - body = CompletionRequestBody(**json, model=engine) - prompt = truncate(body.prompt) - - config = get_config(body) - llm = request.app.state.llm - if body.stream is True: - log.debug("Streaming response from %s", engine) - return StreamingResponse( - completions_streamer(prompt, engine, llm, config), - media_type="text/event-stream", - ) - return model_generate(prompt, engine, llm, config) - - -@app.post("/v1/chat/completions", response_model=ChatCompletionResponseBody) -async def chat_completions( - body: Annotated[ChatCompletionRequestBody, Body()], - config: Annotated[Config, Depends(get_config)], - request: Request, -): - """_summary_ - Compatible with https://platform.openai.com/docs/api-reference/chat - Args: - body (ChatCompletionRequestBody): parsed request body - - Returns: - StreamingResponse: streaming response - """ - if DOWNLOADING_MODEL is True: - raise HTTPException(status_code=503, detail="Downloading model") - log.debug("Body:%s", str(body)) - if ( - (body.n is not None) - or (body.logit_bias is not None) - or (body.user is not None) - or (body.presence_penalty is not None) - or (body.frequency_penalty is not None) - ): - log.warning( - "n, logit_bias, user, presence_penalty and frequency_penalty are not supporte." - ) - system_start = "" - system = "You are a helpful assistant." - system_end = "" - user_start = "" - user_end = "" - assistant_start = "" - assistant_end = "" - - # https://huggingface.co/blog/llama2#how-to-prompt-llama-2 - # https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/discussions/3 - if "llama-2" in body.model.lower() and "chat" in body.model.lower(): - system_start = "[INST] <>\n" - system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.\n" - system_end = "<>\n\n" - assistant_start = " " - assistant_end = " [INST] " - user_start = "" - user_end = " [/INST]" - # For most instruct fine-tuned models using Alpaca prompt template - # Although instruct fine-tuned models are not tuned for chat, they can be to generate response as if chatting, using Alpaca - # prompt template likely gives better results than using the default prompt template - # See https://github.com/tatsu-lab/stanford_alpaca#data-release - if "instruct" in body.model.lower(): - system_start = "" - system = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n" - system_end = "" - assistant_start = "### Response:" - assistant_end = "" - user_start = "### Instruction:\n" - user_end = "\n\n" - # For instruct fine-tuned models using mistral prompt template - # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 - if "mistral" in body.model.lower() and "instruct" in body.model.lower(): - system_start = "" - system = "" - system_end = "" - assistant_start = "" - assistant_end = " " - user_start = "[INST] " - user_end = " [/INST]" - if "starchat" in body.model.lower(): - # See https://huggingface.co/blog/starchat-alpha and https://huggingface.co/TheBloke/starchat-beta-GGML#prompt-template - system_start = "<|system|>" - system = ( - "Below is a dialogue between a human and an AI assistant called StarChat." - ) - system_end = " <|end|>\n" - user_start = "<|user|>" - user_end = " <|end|>\n" - assistant_start = "<|assistant|>\n" - assistant_end = " <|end|>\n" - if "airoboros" in body.model.lower(): - # e.g. A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. USER: [prompt] ASSISTANT: - # see https://huggingface.co/jondurbin/airoboros-mpt-30b-gpt4-1p4-five-epochs - system_start = "" - system = "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input." - system_end = "" - user_start = "USER: " - user_end = "" - assistant_start = "ASSISTANT: " - assistant_end = "" - # If it's a mpt-chat model, we need to add the default prompt - # from https://huggingface.co/TheBloke/mpt-30B-chat-GGML#prompt-template - # and https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py#L17 - if "mpt" in body.model.lower() and "chat" in body.model.lower(): - system_start = "<|im_start|>system\n" - system = "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers." - system_end = "<|im_end|>\n" - assistant_start = "<|im_start|>assistant\n" - assistant_end = "<|im_end|>\n" - user_start = "<|im_start|>user\n" - user_end = "<|im_end|>\n" - # orca mini https://huggingface.co/pankajmathur/orca_mini_3b - if "orca" in body.model.lower() and "mini" in body.model.lower(): - system_start = "### System:\n" - system = "You are an AI assistant that follows instruction extremely well. Help as much as you can." - system_end = "\n\n" - assistant_start = "### Response:\n" - assistant_end = "" - # v3 e.g. 
https://huggingface.co/pankajmathur/orca_mini_v3_13b - if "v3" in body.model.lower(): - assistant_start = "### Assistant:\n" - user_start = "### User:\n" - user_end = "\n\n" - # openchat_3.5 https://huggingface.co/openchat/openchat_3.5 - if "openchat" in body.model.lower(): - system_start = "" - system = "" - system_end = "" - assistant_start = "GPT4 Assistant: " - assistant_end = "<|end_of_turn|>" - user_start = "GPT4 User: " - user_end = "<|end_of_turn|>" - # HG's zephyr https://huggingface.co/HuggingFaceH4/zephyr-7b-beta - if "zephyr" in body.model.lower(): - system_start = "<|system|>\n" - system = "" - system_end = "\n" - assistant_start = "<|assistant|>" - assistant_end = "\n" - user_start = "<|user|>\n" - user_end = "\n" - - prompt = "" - for message in body.messages: - # Check for system message - if message.role == "system": - system_message_content = message.content if message else "" - - # avoid duplicate system_start token in prompt if system_message_content already includes it - if len(system_start) > 0 and system_start in system_message_content: - system_start = "" - # avoid duplicate system_end token in prompt if system_message_content already includes it - if len(system_end) > 0 and system_end in system_message_content: - system_end = "" - prompt = f"{system_start}{system_message_content}{system_end}" - elif message.role == "user": - user_message_content = message.content if message else "" - - # avoid duplicate user start token in prompt if user_message_content already includes it - if len(user_start) > 0 and user_start in user_message_content: - user_start = "" - # avoid duplicate user end token in prompt if user_message_content already includes it - if len(user_end) > 0 and user_end in user_message_content: - user_end = "" - - prompt = f"{prompt}{user_start}{user_message_content}{user_end}" - elif message.role == "assistant": - assistant_message_content = message.content if message else "" - - # avoid duplicate assistant start token in prompt if user message already includes it - if ( - len(assistant_start) > 0 - and assistant_start in assistant_message_content - ): - assistant_start = "" - # avoid duplicate assistant start token in prompt if user message already includes it - if len(assistant_end) > 0 and assistant_end in assistant_message_content: - assistant_end = "" - - prompt = ( - f"{prompt}{assistant_start}{assistant_message_content}{assistant_end}" - ) - - prompt = f"{prompt}{assistant_start}" - model_name = body.model - llm = request.app.state.llm - if body.stream is True: - log.debug("Streaming response from %s", model_name) - return StreamingResponse( - chat_completions_streamer(prompt, model_name, llm, config), - media_type="text/event-stream", - ) - return chat_model_generate(prompt, model_name, llm, config) diff --git a/model_generate.py b/model_generate.py deleted file mode 100644 index b623917..0000000 --- a/model_generate.py +++ /dev/null @@ -1,142 +0,0 @@ -from time import time -from ctransformers import LLM, Config - -from log import log - - -def model_generate( - prompt: str, - model_name: str, - llm: LLM, - config: Config, -): - """_summary_ - returns the response body for /chat/completions - """ - created = time() - - top_k = config.top_k - log.debug("top_k: %s", top_k) - top_p = config.top_p - log.debug("top_p: %s", top_p) - temperature = config.temperature - log.debug("temperature: %s", temperature) - repetition_penalty = config.repetition_penalty - log.debug("repetition_penalty: %s", repetition_penalty) - last_n_tokens = config.last_n_tokens - 
log.debug("last_n_tokens: %s", last_n_tokens) - seed = config.seed - log.debug("seed: %s", seed) - batch_size = config.batch_size - log.debug("batch_size: %s", batch_size) - threads = config.threads - log.debug("threads: %s", threads) - max_new_tokens = config.max_new_tokens - log.debug("max_new_tokens: %s", max_new_tokens) - stop = config.stop - log.debug("stop: %s", stop) - log.debug("prompt: %s", prompt) - - log.debug("Getting from ctransformer instance") - result: str = llm( # pyright: ignore [reportGeneralTypeIssues] - prompt=prompt, - stream=False, - reset=True, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - last_n_tokens=last_n_tokens, - seed=seed, - batch_size=batch_size, - threads=threads, - max_new_tokens=max_new_tokens, - stop=stop, - ) - http_response = { - "id": "id", - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "index": 0, - "text": result, - "logprobs": None, - "finish_reason": "end_of_token", - } - ], - "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, - } - log.debug("http_response:%s ", http_response) - return http_response - - -def chat_model_generate( - prompt: str, - model_name: str, - llm: LLM, - config: Config, -): - """_summary_ - returns the response body for /chat/completions - """ - created = time() - - top_k = config.top_k - log.debug("top_k: %s", top_k) - top_p = config.top_p - log.debug("top_p: %s", top_p) - temperature = config.temperature - log.debug("temperature: %s", temperature) - repetition_penalty = config.repetition_penalty - log.debug("repetition_penalty: %s", repetition_penalty) - last_n_tokens = config.last_n_tokens - log.debug("last_n_tokens: %s", last_n_tokens) - seed = config.seed - log.debug("seed: %s", seed) - batch_size = config.batch_size - log.debug("batch_size: %s", batch_size) - threads = config.threads - log.debug("threads: %s", threads) - max_new_tokens = config.max_new_tokens - log.debug("max_new_tokens: %s", max_new_tokens) - stop = config.stop - log.debug("stop: %s", stop) - log.debug("prompt: %s", prompt) - - log.debug("Getting from ctransformer instance") - result: str = llm( # pyright: ignore [reportGeneralTypeIssues] - prompt=prompt, - stream=False, - reset=True, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - last_n_tokens=last_n_tokens, - seed=seed, - batch_size=batch_size, - threads=threads, - max_new_tokens=max_new_tokens, - stop=stop, - ) - http_response = { - "id": "id", - "object": "chat.completion", - "created": created, - "model": model_name, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": result, - }, - "finish_reason": "end_of_token", - } - ], - "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, - } - log.debug("http_response:%s ", http_response) - return http_response diff --git a/request_body.py b/request_body.py deleted file mode 100644 index ce2999e..0000000 --- a/request_body.py +++ /dev/null @@ -1,89 +0,0 @@ -from typing import ( - Any, - Literal, - List, - Optional, -) -from pydantic import BaseModel, Field - - -class CompletionRequestBody(BaseModel): - """_summary_ - from from https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py - """ - - prompt: str = Field( - default="", description="The prompt to generate completions for." 
- ) - max_tokens: Optional[int] - temperature: Optional[float] - top_p: Optional[float] - stop: Optional[List[str] | str] - stream: Optional[bool] = Field() - model: str = Field() - # llama.cpp specific parameters - top_k: Optional[int] - repetition_penalty: Optional[float] - frequency_penalty: Optional[float] - last_n_tokens: Optional[int] - seed: Optional[int] - batch_size: Optional[int] - threads: Optional[int] - - # ignored or currently unsupported - suffix: Any - presence_penalty: Any - echo: Any - n: Any - logprobs: Any - best_of: Any - logit_bias: Any - user: Any - - class Config: - arbitrary_types_allowed = True - - -class ChatCompletionRequestMessage(BaseModel): - """_summary_ - from from https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py - """ - - role: Literal["system", "user", "assistant"] = Field( - default="user", description="The role of the message." - ) - content: str = Field(default="", description="The content of the message.") - - -class ChatCompletionRequestBody(BaseModel): - """_summary_ - Request body for /chat/completions. - from from https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py - """ - - messages: List[ChatCompletionRequestMessage] = Field( - default=[], description="A list of messages to generate completions for." - ) - max_tokens: Optional[int] - temperature: Optional[float] - top_p: Optional[float] - stop: Optional[List[str] | str] - stream: Optional[bool] = Field() - model: str = Field() - # llama.cpp specific parameters - top_k: Optional[int] - repetition_penalty: Optional[float] - frequency_penalty: Optional[float] - last_n_tokens: Optional[int] - seed: Optional[int] - batch_size: Optional[int] - threads: Optional[int] - - # ignored or currently unsupported - n: Any - logit_bias: Any - user: Any - presence_penalty: Any - - class Config: - arbitrary_types_allowed = True diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index be7fa66..0000000 --- a/requirements.txt +++ /dev/null @@ -1,22 +0,0 @@ -anyio==3.6.2 -blake3==0.3.3 -certifi==2023.7.22 -charset-normalizer==3.1.0 -click==8.1.3 -ctransformers==0.2.27 -fastapi==0.95.2 -filelock==3.12.0 -fsspec==2023.5.0 -h11==0.14.0 -huggingface-hub==0.14.1 -idna==3.4 -packaging==23.1 -pydantic==1.10.7 -PyYAML==6.0 -requests==2.31.0 -sniffio==1.3.0 -starlette==0.27.0 -tqdm==4.65.0 -typing_extensions==4.6.0 -urllib3==2.0.7 -uvicorn==0.22.0 diff --git a/response_body.py b/response_body.py deleted file mode 100644 index aff3360..0000000 --- a/response_body.py +++ /dev/null @@ -1,66 +0,0 @@ -from typing import Literal -from pydantic import BaseModel - - -class Message(BaseModel): - """_summary_ - - Args: - BaseModel (_type_): message in choice - """ - - role: Literal["system", "user", "assistant"] - content: str - - -class Choice(BaseModel): - """_summary_ - - Args: - BaseModel (_type_): choice in completion response - """ - - index: int - text: str - logprobs: None - finish_reason: str - - -class CompletionResponseBody(BaseModel): - """_summary_ - - Args: - BaseModel (_type_): response body for /chat/completions - """ - - id: str - object: str - created: int - choices: list[Choice] - usage: dict[str, int] - - -class ChatChoice(BaseModel): - """_summary_ - - Args: - BaseModel (_type_): choice in completion response - """ - - index: int - message: Message - finish_reason: str - - -class ChatCompletionResponseBody(BaseModel): - """_summary_ - - Args: - BaseModel (_type_): response body for /chat/completions - """ - - id: str - object: str - 
created: int - choices: list[ChatChoice] - usage: dict[str, int]
diff --git a/streamers.py b/streamers.py
deleted file mode 100644
index f4a4815..0000000
--- a/streamers.py
+++ /dev/null
@@ -1,229 +0,0 @@
-import json
-from time import time
-from ctransformers import LLM, Config
-
-from log import log
-from get_env import get_env
-from const import DEFAULT_CONTEXT_LENGTH, DEFAULT_LOG_LEVEL
-
-
-def completions_streamer(
-    prompt: str,
-    model_name: str,
-    llm: LLM,
-    config: Config,
-):
-    """Stream /completions responses.
-    Returns a generator that yields a stream of responses.
-    """
-    created = time()
-
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("threads: %s", threads)
-    max_new_tokens = config.max_new_tokens
-    log.debug("max_new_tokens: %s", max_new_tokens)
-    stop = config.stop
-    log.debug("stop: %s", stop)
-    log.debug("prompt: %s", prompt)
-    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
-    LOGGING_LEVEL = get_env("LOGGING_LEVEL", DEFAULT_LOG_LEVEL)
-
-    log.debug("Streaming from ctransformers instance")
-    total_tokens = 0
-    for token in llm(
-        prompt,
-        stream=True,
-        reset=True,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        batch_size=batch_size,
-        threads=threads,
-        max_new_tokens=max_new_tokens,
-        stop=stop,
-    ):
-        if LOGGING_LEVEL == "DEBUG":
-            # Only track token length if we're in debug mode to avoid overhead
-            total_tokens = total_tokens + len(token)
-            # tokens are not necessarily characters, but this is a good enough approximation
-            if total_tokens > CONTEXT_LENGTH:
-                log.debug(
-                    "Total token length %s exceeded context length %s",
-                    total_tokens,
-                    CONTEXT_LENGTH,
-                )
-                log.debug(
-                    "Try to increase CONTEXT_LENGTH that is currently set to %s to your model's context length",
-                    CONTEXT_LENGTH,
-                )
-                log.debug(
-                    "Alternatively, increase REPETITION_PENALTY %s and LAST_N_TOKENS %s AND/OR adjust temperature %s top_k %s top_p %s",
-                    repetition_penalty,
-                    last_n_tokens,
-                    temperature,
-                    top_k,
-                    top_p,
-                )
-        log.debug("Streaming token %s", token)
-        data = json.dumps(
-            {
-                "id": "id",
-                "object": "text_completion.chunk",
-                "created": created,
-                "model": model_name,
-                "choices": [
-                    {
-                        "text": token,
-                        "index": 0,
-                        "finish_reason": None,
-                    }
-                ],
-            }
-        )
-        yield f"data: {data}" + "\n\n"
-
-    stop_data = json.dumps(
-        {
-            "id": "id",
-            "object": "text_completion.chunk",
-            "created": created,
-            "model": model_name,
-            "choices": [
-                {
-                    "text": "",
-                    "index": 0,
-                    "finish_reason": "stop",
-                }
-            ],
-        }
-    )
-    yield f"data: {stop_data}" + "\n\n"
-    log.debug("Streaming ended")
-
-
-def chat_completions_streamer(
-    prompt: str,
-    model_name: str,
-    llm: LLM,
-    config: Config,
-):
-    """Stream /chat/completions responses.
-    Returns a generator that yields a stream of responses.
-    """
-    created = time()
-
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("threads: %s", threads)
-    max_new_tokens = config.max_new_tokens
-    log.debug("max_new_tokens: %s", max_new_tokens)
-    stop = config.stop
-    log.debug("stop: %s", stop)
-    log.debug("prompt: %s", prompt)
-    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
-    LOGGING_LEVEL = get_env("LOGGING_LEVEL", DEFAULT_LOG_LEVEL)
-
-    log.debug("Streaming from ctransformers instance")
-    total_tokens = 0
-    for token in llm(
-        prompt,
-        stream=True,
-        reset=True,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        batch_size=batch_size,
-        threads=threads,
-        max_new_tokens=max_new_tokens,
-        stop=stop,
-    ):
-        if LOGGING_LEVEL == "DEBUG":
-            # Only track token length if we're in debug mode to avoid overhead
-            total_tokens = total_tokens + len(token)
-            # tokens are not necessarily characters, but this is a good enough approximation
-            if total_tokens > CONTEXT_LENGTH:
-                log.debug(
-                    "Total token length %s exceeded context length %s",
-                    total_tokens,
-                    CONTEXT_LENGTH,
-                )
-                log.debug(
-                    "Try to increase CONTEXT_LENGTH that is currently set to %s to your model's context length",
-                    CONTEXT_LENGTH,
-                )
-                log.debug(
-                    "Alternatively, increase REPETITION_PENALTY %s and LAST_N_TOKENS %s AND/OR adjust temperature %s top_k %s top_p %s",
-                    repetition_penalty,
-                    last_n_tokens,
-                    temperature,
-                    top_k,
-                    top_p,
-                )
-        log.debug("Streaming token %s", token)
-        data = json.dumps(
-            {
-                "id": "id",
-                "object": "chat.completion.chunk",
-                "created": created,
-                "model": model_name,
-                "choices": [
-                    {
-                        "delta": {"role": "assistant", "content": token},
-                        "index": 0,
-                        "finish_reason": None,
-                    }
-                ],
-            }
-        )
-        yield f"data: {data}" + "\n\n"
-
-    stop_data = json.dumps(
-        {
-            "id": "id",
-            "object": "chat.completion.chunk",
-            "created": created,
-            "model": model_name,
-            "choices": [
-                {
-                    "delta": {},
-                    "index": 0,
-                    "finish_reason": "stop",
-                }
-            ],
-        }
-    )
-    yield f"data: {stop_data}" + "\n\n"
-    log.debug("Streaming ended")
diff --git a/truncate.py b/truncate.py
deleted file mode 100644
index 49013a5..0000000
--- a/truncate.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from get_env import get_env_or_none
-
-def truncate(string, beginning=True):
-    """Shorten the given string to at most TRUNCATE_PROMPT_LENGTH characters.
-
-    :Parameters:
-        string (str) = The string to truncate.
-        beginning (bool) = Trim starting chars, else trim ending chars.
-
-    :Return:
-        (str)
-
-    ex. with TRUNCATE_PROMPT_LENGTH=4, truncate('12345678')
-    returns: '5678'
-    """
-    TRUNCATE_PROMPT_LENGTH = get_env_or_none("TRUNCATE_PROMPT_LENGTH")
-    if TRUNCATE_PROMPT_LENGTH is None:
-        return string
-    length = int(TRUNCATE_PROMPT_LENGTH)
-    if len(string) > length:
-        # trim starting chars.
-        if beginning:
-            string = string[-length:]
-        # trim ending chars.
-        else:
-            string = string[:length]
-    return string
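
Note on the removed streaming format: both streamers above emit OpenAI-style server-sent events, one "data: {json}" line per token (object "text_completion.chunk" or "chat.completion.chunk"), followed by a final chunk whose finish_reason is "stop". The sketch below shows one way a client could consume the chat stream using the requests library pinned in the removed requirements.txt; the base URL, endpoint path, and model name are assumptions for illustration only and are not taken from this diff.

# Minimal client sketch for the SSE format produced by the removed
# chat_completions_streamer. The base URL, endpoint path ("/v1/chat/completions"),
# and model name below are assumptions, not values confirmed by this diff.
import json

import requests


def stream_chat(prompt: str, base_url: str = "http://localhost:8000") -> str:
    """Collect streamed tokens until the chunk whose finish_reason is "stop"."""
    payload = {
        "model": "example-model",  # hypothetical model name
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }
    pieces: list[str] = []
    # The streamer yields lines of the form 'data: {json}' separated by blank lines.
    with requests.post(
        f"{base_url}/v1/chat/completions", json=payload, stream=True, timeout=600
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue
            chunk = json.loads(line[len("data: "):])
            choice = chunk["choices"][0]
            if choice.get("finish_reason") == "stop":
                break
            pieces.append(choice.get("delta", {}).get("content", ""))
    return "".join(pieces)

Because the removed streamer never sends an OpenAI-style "data: [DONE]" sentinel, a client has to stop on the chunk whose finish_reason is "stop", as in the sketch above.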