♻️ big refactor, use llama server and openai python library

Laureηt 2023-10-11 10:31:47 +00:00
parent 541007380a
commit 904dde744f
Signed by: Laurent
SSH key fingerprint: SHA256:kZEpW8cMJ54PDeCvOhzreNr4FSh6R13CMGH/POoO8DI
16 changed files with 1215 additions and 914 deletions
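
In short, the bot no longer loads a GGML model in-process through llama-cpp-python; it now talks to a standalone llama.cpp server (started from docker-compose.yml) over an OpenAI-compatible API using the openai python library. A minimal sketch of the new request path, assuming the server from the compose file is reachable at http://localhost:8000/v1 and the pre-1.0 openai python API that client.py uses; the example prompt is purely illustrative:

import openai

# point the openai library at the local llama-cpp-python server
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "osftw"  # placeholder key, matching the default in main.py

response = openai.ChatCompletion.create(
    model="local-model",  # MODEL_ALIAS from the server Dockerfile
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    stop=["<|im_end|>"],  # ChatML end-of-turn token (CHAT_FORMAT=chatml)
    temperature=0,
    max_tokens=256,
)
print(response["choices"][0]["message"]["content"])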

1
.envrc

@@ -1 +0,0 @@
use flake

1
.gitignore vendored

@@ -1,4 +1,5 @@
.direnv
.venv
result
# https://github.com/github/gitignore/blob/main/Python.gitignore

.vscode/extensions.json

@@ -1,7 +1,9 @@
{
"recommendations": [
"editorconfig.editorconfig",
"charliermarsh.ruff",
"editorconfig.editorconfig",
"ms-python.black-formatter",
"ms-python.python",
"tamasfe.even-better-toml",
]
}

40
.vscode/settings.json vendored

@@ -1,30 +1,38 @@
{
"python.analysis.typeCheckingMode": "basic",
"python.formatting.provider": "black",
// nice editor settings
"editor.formatOnSave": true,
"python.linting.enabled": true,
"python.linting.lintOnSave": true,
"python.linting.mypyEnabled": true,
"python.linting.banditEnabled": true,
"python.languageServer": "Pylance",
"[python]": {
"editor.codeActionsOnSave": {
"source.organizeImports": true
}
},
"terminal.integrated.env.linux": {
"PYTHONPATH": "${workspaceFolder}"
},
"editor.formatOnPaste": true,
"editor.rulers": [
120
],
// editorconfig redundancy
"files.insertFinalNewline": true,
"files.trimTrailingWhitespace": true,
// hide unimportant files/folders
"files.exclude": {
// defaults
"**/.git": true,
"**/.svn": true,
"**/.hg": true,
"**/CVS": true,
"**/.DS_Store": true,
"**/Thumbs.db": true,
// annoying
"**/__pycache__": true,
"**/.mypy_cache": true,
"**/.direnv": true,
"**/.ruff_cache": true,
"**/*.tmp": true,
},
// python settings
"python.analysis.typeCheckingMode": "basic", // get ready to be annoyed
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.codeActionsOnSave": {
"source.organizeImports.ruff": true,
"source.fixAll": true,
}
},
"terminal.integrated.env.linux": {
"PYTHONPATH": "${workspaceFolder}/src/",
},
}

29
docker-compose.yml Normal file

@@ -0,0 +1,29 @@
version: "3.8"
services:
server:
build:
context: src/nio_llm_server/
dockerfile: Dockerfile
ports:
- 8000:8000
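# reuse the host's HuggingFace cache so the model is not re-downloaded on every container start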
volumes:
- /home/laurent/.cache/huggingface/hub/:/root/.cache/huggingface/hub/
healthcheck:
test: ["CMD", "nc", "-z", "-v", "localhost", "8000"]
restart: unless-stopped
client:
build:
context: src/nio_llm/
dockerfile: Dockerfile
environment:
- NIOLLM_HOMESERVER=$NIOLLM_HOMESERVER
- NIOLLM_USERNAME=$NIOLLM_USERNAME
- NIOLLM_DEVICE_ID=$NIOLLM_DEVICE_ID
- NIOLLM_ROOM=$NIOLLM_ROOM
- NIOLLM_PASSWORD=$NIOLLM_PASSWORD
- NIOLLM_OPENAI_API_ENDPOINT=$NIOLLM_OPENAI_API_ENDPOINT
depends_on:
server:
condition: service_healthy
restart: unless-stopped

flake.lock

@@ -1,85 +0,0 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1685518550,
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1686501370,
"narHash": "sha256-G0WuM9fqTPRc2URKP9Lgi5nhZMqsfHGrdEbrLvAPJcg=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "75a5ebf473cd60148ba9aec0d219f72e5cf52519",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"poetry2nix": {
"inputs": {
"flake-utils": [
"flake-utils"
],
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1686140708,
"narHash": "sha256-CKTahDFlhx07OQb4Afj+4/cNaxIWfxb8VGUlllUgoPY=",
"owner": "nix-community",
"repo": "poetry2nix",
"rev": "d91e2dd14caf4d09240bedf69a778c88f356ebda",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "poetry2nix",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs",
"poetry2nix": "poetry2nix"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

flake.nix

@@ -1,40 +0,0 @@
{
description = "nio-llm";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
flake-utils.url = "github:numtide/flake-utils";
poetry2nix = {
url = "github:nix-community/poetry2nix";
inputs = {
nixpkgs.follows = "nixpkgs";
flake-utils.follows = "flake-utils";
};
};
};
outputs = { nixpkgs, flake-utils, poetry2nix, ... }:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = import nixpkgs {
inherit system;
overlays = [ poetry2nix.overlay ];
};
pythonEnv = pkgs.poetry2nix.mkPoetryEnv {
projectDir = ./.;
preferWheels = true;
python = pkgs.python311;
};
in {
packages.default = pkgs.poetry2nix.mkPoetryApplication {
projectDir = ./.;
preferWheels = true;
python = pkgs.python311;
};
devShells.default =
pkgs.mkShell { buildInputs = [ pythonEnv pkgs.poetry ]; };
});
}

1633
poetry.lock generated

File diff suppressed because it is too large

3
poetry.toml Normal file

@@ -0,0 +1,3 @@
[virtualenvs]
create = true
in-project = true

pyproject.toml

@@ -6,7 +6,7 @@ license = "MIT"
name = "nio-llm"
readme = "README.md"
repository = "https://github.com/Laurent2916/nio-llm.git"
version = "0.1.0"
version = "1.0.0"
[tool.poetry.scripts]
nio-llm = "nio_llm.main:main"
@@ -29,9 +29,21 @@ mypy = "^1.3.0"
ruff = "^0.0.267"
[tool.ruff]
target-version = "py311"
line-length = 120
ignore-init-module-imports = true
include = [
"*.py", # regular python files
"*.pyi", # python stub files
"*.ipynb", # jupyter notebooks
"**/pyproject.toml", # python config files
]
ignore = [
"G004", # Logging statement uses f-string
"EM102", # Exception must not use an f-string literal, assign to variable first
"D100", # Missing docstring in public module
"D104", # Missing docstring in public package
"N812", # Lowercase imported as non lowercase
]
select = [
"A", # flake8-builtins
@@ -47,6 +59,8 @@ select = [
"N", # pep8-naming
"PIE", # flake8-pie
"PTH", # flake8-use-pathlib
"TD", # flake8-todo
"FIX", # flake8-fixme
"RET", # flake8-return
"RUF", # ruff
"S", # flake8-bandit
@@ -59,26 +73,28 @@ select = [
[tool.ruff.pydocstyle]
convention = "google"
[tool.ruff.isort]
known-first-party = ["nio_llm"]
[tool.ruff.per-file-ignores]
"__init__.py" = [
"F401", # Imported but unused
]
"src/aube/main.py" = [
"F401", # Imported but unused
"E402", # Module level import not at top of file
]
[tool.ruff.mccabe]
max-complexity = 5 # C901
[tool.black]
include = '\.pyi?$'
target-version = ["py311"]
line-length = 120
exclude = '''
/(
\.git
\.venv
)/
'''
[tool.isort]
multi_line_output = 3
profile = "black"
[tool.mypy]
python_version = "3.11"
warn_return_any = true
warn_unused_configs = true
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
include = '\.pyi?$'
line-length = 120
target-version = ["py311"]

21
src/nio_llm/Dockerfile Normal file

@@ -0,0 +1,21 @@
FROM python:3.9
# Update and upgrade the existing packages
RUN apt-get update && \
apt-get upgrade -y
RUN mkdir /app
WORKDIR /app
# install python dependencies
RUN pip install --upgrade pip
RUN pip install \
jsonargparse[signatures] \
git+https://github.com/abetlen/llama-cpp-python.git \
matrix-nio \
openai \
rich
COPY *.py /app/nio_llm/
ENV PYTHONPATH=/app
CMD ["/usr/local/bin/python", "/app/nio_llm"]

src/nio_llm/main.py

@@ -2,9 +2,7 @@
import asyncio
import logging
from pathlib import Path
from huggingface_hub import hf_hub_download
from jsonargparse import CLI
from rich.logging import RichHandler
@@ -15,53 +13,62 @@ logger = logging.getLogger("nio-llm.main")
def main(
room: str,
password: str,
username: str,
preprompt: str,
password: str,
preprompt: str = "You are a helpful assistant in a multi-agent [matrix] conversation.",
device_id: str = "nio-llm",
homeserver: str = "https://matrix.org",
ggml_repoid: str = "TheBloke/stable-vicuna-13B-GGML",
ggml_filename: str = "stable-vicuna-13B.ggmlv3.q5_1.bin",
sync_timeout: int = 30000,
openai_api_key: str = "osftw",
openai_api_endpoint: str = "http://localhost:8000/v1",
openai_temperature: float = 0,
openai_max_tokens: int = 256,
) -> None:
"""Download llama model from HuggingFace and start the client.
"""Instantiate and start the client.
Args:
room (`str`):
The room to join.
password (`str`):
The password to log in with.
username (`str`):
The username to log in as.
device_id (`str`):
The device ID to use.
password (`str`):
The password to log in with.
preprompt (`str`):
The preprompt to use.
ggml_repoid (`str`, default `"TheBloke/stable-vicuna-13B-GGML"`):
The HuggingFace Hub repo ID to download the model from.
ggml_filename (`str`, default `"stable-vicuna-13B.ggmlv3.q5_1.bin"`):
The HuggingFace Hub filename to download the model from.
homeserver (`str`, default `"matrix.org"`):
The homeserver to connect to.
sync_timeout (`int`, default `30000`):
Defaults to `"You are a helpful assistant."`.
device_id (`str`):
The device ID to use.
Defaults to `"nio-llm"`.
homeserver (`str`):
The matrix homeserver to connect to.
Defaults to `"https://matrix.org"`.
sync_timeout (`int`):
The timeout to use when syncing with the homeserver.
Defaults to `30000`.
openai_api_key (`str`):
The OpenAI API key to use.
Defaults to `"osftw"`.
openai_api_endpoint (`str`):
The OpenAI API endpoint to use.
Defaults to `"http://localhost:8000/v1"`.
openai_temperature (`float`):
The OpenAI temperature to use.
Defaults to `0`.
openai_max_tokens (`int`):
The OpenAI max tokens to use.
Defaults to `256`.
"""
# download the model
ggml_path = Path(
hf_hub_download(
repo_id=ggml_repoid,
filename=ggml_filename,
),
)
# create the client
client = LLMClient(
room=room,
username=username,
device_id=device_id,
ggml_path=ggml_path,
preprompt=preprompt,
homeserver=homeserver,
openai_api_key=openai_api_key,
openai_api_endpoint=openai_api_endpoint,
openai_temperature=openai_temperature,
openai_max_tokens=openai_max_tokens,
)
# start the client
@@ -86,6 +93,6 @@ if __name__ == "__main__":
CLI(
components=main,
as_positional=False,
env_prefix="NIO_LLM",
env_prefix="NIOLLM",
default_env=True,
)

src/nio_llm/client.py

@@ -1,18 +1,15 @@
"""A Matrix client that uses Llama to respond to messages."""
import logging
import time
from collections import deque
from pathlib import Path
from llama_cpp import Llama
import openai
from nio import AsyncClient, MatrixRoom, RoomMessageText
logger = logging.getLogger("nio-llm.client")
class LLMClient(AsyncClient):
"""A Matrix client that uses Llama to respond to messages."""
"""A Matrix client that uses llama.cpp to respond to messages."""
def __init__(
self,
@@ -20,18 +17,33 @@ class LLMClient(AsyncClient):
homeserver: str,
device_id: str,
preprompt: str,
ggml_path: Path,
room: str,
):
openai_api_key: str,
openai_api_endpoint: str,
openai_temperature: float,
openai_max_tokens: int,
) -> None:
"""Create a new LLMClient instance.
Args:
username (`str`): The username to log in as.
homeserver (`str`): The homeserver to connect to.
device_id (`str`): The device ID to use.
preprompt (`str`): The preprompt to use.
ggml_path (`Path`): The path to the GGML model.
room (`str`): The room to join.
username (`str`):
The username to log in as.
homeserver (`str`):
The homeserver to connect to.
device_id (`str`):
The device ID to use.
preprompt (`str`):
The preprompt to use.
room (`str`):
The room to join.
openai_api_key (`str`):
The OpenAI API key to use.
openai_api_endpoint (`str`):
The OpenAI API endpoint to use.
openai_temperature (`float`):
The OpenAI temperature to use.
openai_max_tokens (`int`):
The OpenAI max tokens to use.
"""
self.uid = f"@{username}:{homeserver.removeprefix('https://')}"
self.spawn_time = time.time() * 1000
@@ -39,20 +51,19 @@ class LLMClient(AsyncClient):
self.preprompt = preprompt
self.room = room
# create the AsyncClient instance
# setup openai settings
openai.api_base = openai_api_endpoint
openai.api_key = openai_api_key
self.openai_temperature = openai_temperature
self.openai_max_tokens = openai_max_tokens
# create nio AsyncClient instance
super().__init__(
user=self.uid,
homeserver=homeserver,
device_id=device_id,
)
# create the Llama instance
self.llm = Llama(
model_path=str(ggml_path),
n_threads=12,
n_ctx=512 + 128,
)
# create message history queue
self.history: deque[RoomMessageText] = deque(maxlen=10)
@@ -63,8 +74,10 @@ class LLMClient(AsyncClient):
"""Process new messages as they come in.
Args:
room (`MatrixRoom`): The room the message was sent in.
event (`RoomMessageText`): The message event.
room (`MatrixRoom`):
The room the message was sent in.
event (`RoomMessageText`):
The message event.
"""
logger.debug(f"New RoomMessageText: {event.source}")
@@ -93,6 +106,7 @@ class LLMClient(AsyncClient):
# update history
self.history.append(event)
logger.debug(f"Updated history: {self.history}")
# ignore our own messages
if event.sender == self.user:
@@ -107,51 +121,46 @@ class LLMClient(AsyncClient):
and f'<a href="https://matrix.to/#/{self.uid}">{self.username}</a>'
in event.source["content"]["formatted_body"]
):
logger.debug("Ignoring message not directed at us.")
return
# generate prompt from message and history
history = "\n".join(f"<{message.sender}>: {message.body}" for message in self.history)
prompt = "\n".join([self.preprompt, history, f"<{self.uid}>:"])
tokens = self.llm.tokenize(str.encode(prompt))
logger.debug(f"Prompt:\n{prompt}")
logger.debug(f"Tokens: {len(tokens)}")
# ignore prompts that are too long
if len(tokens) > 512:
logger.debug("Prompt too long, skipping.")
await self.room_send(
room_id=self.room,
message_type="m.room.message",
content={
"msgtype": "m.emote",
"body": "reached prompt token limit",
},
)
logger.debug("Ignoring message not mentioning us.")
return
# enable typing indicator
await self.room_typing(
self.room,
typing_state=True,
timeout=100000000,
timeout=30000,
)
logger.debug("Enabled typing indicator.")
# generate response using llama.cpp
senders = [f"<{message.sender}>" for message in self.history]
output = self.llm(
prompt,
max_tokens=128,
stop=[f"<{self.uid}>", "### Human", "### Assistant", *senders],
echo=True,
response = openai.ChatCompletion.create(
model="local-model",
messages=[
{
"content": self.preprompt,
"role": "system",
},
*[
{
"content": f"{message.sender}: {message.body}",
"role": "assistant" if message.sender == self.uid else "user",
}
for message in self.history
],
],
stop=["<|im_end|>"],
temperature=self.openai_temperature,
max_tokens=self.openai_max_tokens,
)
logger.debug(f"Generated response: {response}")
# retrieve the response
output = output["choices"][0]["text"] # type: ignore
output = output.removeprefix(prompt).strip()
output = response["choices"][0]["message"]["content"] # type: ignore
output = output.strip().removeprefix(f"{self.uid}:").strip()
# disable typing indicator
await self.room_typing(self.room, typing_state=False)
logger.debug("Disabled typing indicator.")
# send the response
await self.room_send(
@@ -162,8 +171,9 @@ class LLMClient(AsyncClient):
"body": output,
},
)
logger.debug(f"Sent response: {output}")
async def start(self, password, sync_timeout=30000):
async def start(self, password, sync_timeout=30000) -> None:
"""Start the client.
Args:

src/nio_llm_server/Dockerfile Normal file

@@ -0,0 +1,33 @@
FROM python:3
# Update and upgrade the existing packages
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install -y --no-install-recommends \
ninja-build \
libopenblas-dev \
build-essential
RUN mkdir /app
WORKDIR /app
# install python dependencies
RUN pip install --upgrade pip
RUN pip install huggingface_hub
RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" \
pip install llama-cpp-python[server]@git+https://github.com/abetlen/llama-cpp-python.git --verbose
# Set environment variable for the host
ENV HOST=0.0.0.0
ENV PORT=8000
ENV HF_REPO=TheBloke/Mistral-7B-OpenOrca-GGUF
ENV HF_FILE=mistral-7b-openorca.Q4_K_M.gguf
ENV MODEL_ALIAS=local-model
ENV CHAT_FORMAT=chatml
# Expose a port for the server
EXPOSE 8000
COPY run.sh /app
# Run the server start script
CMD ["/bin/sh", "/app/run.sh"]

src/nio_llm_server/run.sh Normal file

@@ -0,0 +1,6 @@
#!/bin/bash
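# download the model into the local HuggingFace cache, then re-run quietly to capture the cached file path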
huggingface-cli download $HF_REPO $HF_FILE
MODEL_PATH=`huggingface-cli download --quiet $HF_REPO $HF_FILE`
python3 -m llama_cpp.server --host $HOST --port $PORT --model $MODEL_PATH --model_alias $MODEL_ALIAS --chat_format $CHAT_FORMAT