♻️ big refactor, use llama server and openai python library

2024-11-21 13:48:48 +00:00 · 2023-10-11 10:31:47 +00:00 · 2023-10-11 10:31:47 +00:00 · 904dde744f
parent 541007380a
commit 904dde744f
16 changed files with 1215 additions and 914 deletions
--- a/.envrc
+++ b/.envrc
@ -1 +0,0 @@
 use flake
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,5 @@
 .direnv
 .venv
 result
 # https://github.com/github/gitignore/blob/main/Python.gitignore
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@ -1,7 +1,9 @@
 {
  "recommendations": [
    "editorconfig.editorconfig",
    "charliermarsh.ruff",
    "editorconfig.editorconfig",
    "ms-python.black-formatter",
    "ms-python.python",
    "tamasfe.even-better-toml",
  ]
 }
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -1,30 +1,38 @@
 {
-  "python.analysis.typeCheckingMode": "basic",
+  // nice editor settings
  "python.formatting.provider": "black",
  "editor.formatOnSave": true,
-  "python.linting.enabled": true,
+  "editor.formatOnPaste": true,
-  "python.linting.lintOnSave": true,
+  "editor.rulers": [
-  "python.linting.mypyEnabled": true,
+    120
-  "python.linting.banditEnabled": true,
+  ],
-  "python.languageServer": "Pylance",
+  // editorconfig redundancy
-  "[python]": {
+  "files.insertFinalNewline": true,
-    "editor.codeActionsOnSave": {
+  "files.trimTrailingWhitespace": true,
-      "source.organizeImports": true
+  // hide unimportant files/folders
    }
  },
  "terminal.integrated.env.linux": {
    "PYTHONPATH": "${workspaceFolder}"
  },
  "files.exclude": {
    // defaults
    "**/.git": true,
    "**/.svn": true,
    "**/.hg": true,
    "**/CVS": true,
    "**/.DS_Store": true,
    "**/Thumbs.db": true,
    // annoying
    "**/__pycache__": true,
    "**/.mypy_cache": true,
    "**/.direnv": true,
    "**/.ruff_cache": true,
    "**/*.tmp": true,
  },
  // python settings
  "python.analysis.typeCheckingMode": "basic", // get ready to be annoyed
  "[python]": {
    "editor.defaultFormatter": "ms-python.black-formatter",
    "editor.codeActionsOnSave": {
      "source.organizeImports.ruff": true,
      "source.fixAll": true,
    }
  },
  "terminal.integrated.env.linux": {
    "PYTHONPATH": "${workspaceFolder}/src/",
  },
 }
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,29 @@
 # Two-service stack: `server` is built from src/nio_llm_server/ and `client`
 # from src/nio_llm/. The client is only started once the server's healthcheck
 # passes (see `depends_on` below).
 version: "3.8"
 services:
  # Model server; publishes container port 8000 on host port 8000.
  server:
    build:
      context: src/nio_llm_server/
      dockerfile: Dockerfile
    ports:
      - 8000:8000
    volumes:
      # Bind-mounts a host Hugging Face hub cache directory onto the root user's
      # cache path inside the container.
      # NOTE(review): the host path is tied to one user's home directory; adjust per machine.
      - /home/laurent/.cache/huggingface/hub/:/root/.cache/huggingface/hub/
    healthcheck:
      # TCP probe of localhost:8000 from inside the container.
      # NOTE(review): assumes an `nc` binary is available in the server image; confirm.
      test: ["CMD", "nc", "-z", "-v", "localhost", "8000"]
    restart: unless-stopped
  # Matrix bot; all of its settings are forwarded as NIOLLM_* environment variables.
  client:
    build:
      context: src/nio_llm/
      dockerfile: Dockerfile
    environment:
      # Each value is substituted by compose from the invoking environment.
      - NIOLLM_HOMESERVER=$NIOLLM_HOMESERVER
      - NIOLLM_USERNAME=$NIOLLM_USERNAME
      - NIOLLM_DEVICE_ID=$NIOLLM_DEVICE_ID
      - NIOLLM_ROOM=$NIOLLM_ROOM
      - NIOLLM_PASSWORD=$NIOLLM_PASSWORD
      # NOTE(review): presumably has to point at the `server` service rather than
      # localhost when run inside this stack; confirm.
      - NIOLLM_OPENAI_API_ENDPOINT=$NIOLLM_OPENAI_API_ENDPOINT
    depends_on:
      # Wait for the server healthcheck to report healthy before starting.
      server:
        condition: service_healthy
    restart: unless-stopped
--- a/flake.lock
+++ b/flake.lock
@ -1,85 +0,0 @@
 {
  "nodes": {
    "flake-utils": {
      "inputs": {
        "systems": "systems"
      },
      "locked": {
        "lastModified": 1685518550,
        "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1686501370,
        "narHash": "sha256-G0WuM9fqTPRc2URKP9Lgi5nhZMqsfHGrdEbrLvAPJcg=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "75a5ebf473cd60148ba9aec0d219f72e5cf52519",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "poetry2nix": {
      "inputs": {
        "flake-utils": [
          "flake-utils"
        ],
        "nixpkgs": [
          "nixpkgs"
        ]
      },
      "locked": {
        "lastModified": 1686140708,
        "narHash": "sha256-CKTahDFlhx07OQb4Afj+4/cNaxIWfxb8VGUlllUgoPY=",
        "owner": "nix-community",
        "repo": "poetry2nix",
        "rev": "d91e2dd14caf4d09240bedf69a778c88f356ebda",
        "type": "github"
      },
      "original": {
        "owner": "nix-community",
        "repo": "poetry2nix",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs",
        "poetry2nix": "poetry2nix"
      }
    },
    "systems": {
      "locked": {
        "lastModified": 1681028828,
        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
        "owner": "nix-systems",
        "repo": "default",
        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
        "type": "github"
      },
      "original": {
        "owner": "nix-systems",
        "repo": "default",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
 }
--- a/flake.nix
+++ b/flake.nix
@ -1,40 +0,0 @@
 {
  description = "nio-llm";
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
    flake-utils.url = "github:numtide/flake-utils";
    poetry2nix = {
      url = "github:nix-community/poetry2nix";
      inputs = {
        nixpkgs.follows = "nixpkgs";
        flake-utils.follows = "flake-utils";
      };
    };
  };
  outputs = { nixpkgs, flake-utils, poetry2nix, ... }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs {
          inherit system;
          overlays = [ poetry2nix.overlay ];
        };
        pythonEnv = pkgs.poetry2nix.mkPoetryEnv {
          projectDir = ./.;
          preferWheels = true;
          python = pkgs.python311;
        };
      in {
        packages.default = pkgs.poetry2nix.mkPoetryApplication {
          projectDir = ./.;
          preferWheels = true;
          python = pkgs.python311;
        };
        devShells.default =
          pkgs.mkShell { buildInputs = [ pythonEnv pkgs.poetry ]; };
      });
 }
--- a/poetry.lock
+++ b/poetry.lock
--- a/poetry.toml
+++ b/poetry.toml
@ -0,0 +1,3 @@
 [virtualenvs]
 create = true
 in-project = true
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ license = "MIT"
 name = "nio-llm"
 readme = "README.md"
 repository = "https://github.com/Laurent2916/nio-llm.git"
-version = "0.1.0"
+version = "1.0.0"
 [tool.poetry.scripts]
 nio-llm = "nio_llm.main:main"
@ -29,9 +29,21 @@ mypy = "^1.3.0"
 ruff = "^0.0.267"
 [tool.ruff]
 target-version = "py311"
 line-length = 120
 ignore-init-module-imports = true
 include = [
  "*.py",              # regular python files
  "*.pyi",             # python stub files
  "*.ipynb",           # jupyter notebooks
  "**/pyproject.toml", # python config files
 ]
 ignore = [
  "G004",  # Logging statement uses f-string
  "EM102", # Exception must not use an f-string literal, assign to variable first
  "D100",  # Missing docstring in public module
  "D104",  # Missing docstring in public package
  "N812",  # Lowercase imported as non lowercase
 ]
 select = [
  "A",   # flake8-builtins
@ -47,6 +59,8 @@ select = [
  "N",   # pep8-naming
  "PIE", # flake8-pie
  "PTH", # flake8-use-pathlib
  "TD",  # flake8-todo
  "FIX", # flake8-fixme
  "RET", # flake8-return
  "RUF", # ruff
  "S",   # flake8-bandit
@ -59,26 +73,28 @@ select = [
 [tool.ruff.pydocstyle]
 convention = "google"
 [tool.ruff.isort]
 known-first-party = ["nio_llm"]
 [tool.ruff.per-file-ignores]
 "__init__.py" = [
  "F401", # Imported but unused
 ]
 "src/aube/main.py" = [
  "F401", # Imported but unused
  "E402", # Module level import not at top of file
 ]
 [tool.ruff.mccabe]
 max-complexity = 5 # C901
 [tool.black]
 include = '\.pyi?$'
 target-version = ["py311"]
 line-length = 120
 exclude = '''
 /(
  \.git
  \.venv
 )/
 '''
-
+include = '\.pyi?$'
-[tool.isort]
+line-length = 120
-multi_line_output = 3
+target-version = ["py311"]
 profile = "black"
 [tool.mypy]
 python_version = "3.11"
 warn_return_any = true
 warn_unused_configs = true
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
--- a/src/nio_llm/Dockerfile
+++ b/src/nio_llm/Dockerfile
@ -0,0 +1,21 @@
 # Image for the Matrix client (nio_llm package).
 FROM python:3.9
 # Update and upgrade the existing packages
 RUN apt-get update && \
    apt-get upgrade -y
 RUN mkdir /app
 WORKDIR /app
 # install python dependencies
 RUN pip install --upgrade pip
 # `openai` is pinned below 1.0: the client code relies on the module-level
 # `openai.api_base` setting and on `openai.ChatCompletion.create`, both of
 # which were removed in openai 1.0.
 # The extras requirement is quoted so the shell never glob-expands the brackets.
 RUN pip install \
    "jsonargparse[signatures]" \
    git+https://github.com/abetlen/llama-cpp-python.git \
    matrix-nio \
    "openai<1" \
    rich
 # The package sources are copied flat into /app/nio_llm and made importable
 # through PYTHONPATH.
 COPY *.py /app/nio_llm/
 ENV PYTHONPATH=/app
 # Runs the package directory directly.
 # NOTE(review): this requires a __main__.py among the copied files; confirm.
 CMD ["/usr/local/bin/python", "/app/nio_llm"]
--- a/src/nio_llm/init.py
+++ b/src/nio_llm/init.py
--- a/src/nio_llm/main.py
+++ b/src/nio_llm/main.py
@ -2,9 +2,7 @@
 import asyncio
 import logging
 from pathlib import Path
 from huggingface_hub import hf_hub_download
 from jsonargparse import CLI
 from rich.logging import RichHandler
@ -15,53 +13,62 @@ logger = logging.getLogger("nio-llm.main")
 def main(
    room: str,
    password: str,
    username: str,
-    preprompt: str,
+    password: str,
    preprompt: str = "You are a helpful assistant in a multi-agent [matrix] conversation.",
    device_id: str = "nio-llm",
    homeserver: str = "https://matrix.org",
    ggml_repoid: str = "TheBloke/stable-vicuna-13B-GGML",
    ggml_filename: str = "stable-vicuna-13B.ggmlv3.q5_1.bin",
    sync_timeout: int = 30000,
    openai_api_key: str = "osftw",
    openai_api_endpoint: str = "http://localhost:8000/v1",
    openai_temperature: float = 0,
    openai_max_tokens: int = 256,
 ) -> None:
-    """Download llama model from HuggingFace and start the client.
+    """Instantiate and start the client.
    Args:
        room (`str`):
            The room to join.
        password (`str`):
            The password to log in with.
        username (`str`):
            The username to log in as.
-        device_id (`str`):
+        password (`str`):
-            The device ID to use.
+            The password to log in with.
        preprompt (`str`):
            The preprompt to use.
-        ggml_repoid (`str`, default `"TheBloke/stable-vicuna-13B-GGML"`):
+            Defaults to `"You are a helpful assistant in a multi-agent [matrix] conversation."`.
-            The HuggingFace Hub repo ID to download the model from.
+        device_id (`str`):
-        ggml_filename (`str`, default `"stable-vicuna-13B.ggmlv3.q5_1.bin"`):
+            The device ID to use.
-            The HuggingFace Hub filename to download the model from.
+            Defaults to `"nio-llm"`.
-        homeserver (`str`, default `"matrix.org"`):
+        homeserver (`str`):
-            The homeserver to connect to.
+            The matrix homeserver to connect to.
-        sync_timeout (`int`, default `30000`):
+            Defaults to `"https://matrix.org"`.
        sync_timeout (`int`):
            The timeout to use when syncing with the homeserver.
            Defaults to `30000`.
        openai_api_key (`str`):
            The OpenAI API key to use.
            Defaults to `"osftw"`.
        openai_api_endpoint (`str`):
            The OpenAI API endpoint to use.
            Defaults to `"http://localhost:8000/v1"`.
        openai_temperature (`float`):
            The OpenAI temperature to use.
            Defaults to `0`.
        openai_max_tokens (`int`):
            The OpenAI max tokens to use.
            Defaults to `256`.
    """
    # download the model
    ggml_path = Path(
        hf_hub_download(
            repo_id=ggml_repoid,
            filename=ggml_filename,
        ),
    )
    # create the client
    client = LLMClient(
        room=room,
        username=username,
        device_id=device_id,
        ggml_path=ggml_path,
        preprompt=preprompt,
        homeserver=homeserver,
        openai_api_key=openai_api_key,
        openai_api_endpoint=openai_api_endpoint,
        openai_temperature=openai_temperature,
        openai_max_tokens=openai_max_tokens,
    )
    # start the client
@ -86,6 +93,6 @@ if __name__ == "__main__":
    CLI(
        components=main,
        as_positional=False,
-        env_prefix="NIO_LLM",
+        env_prefix="NIOLLM",
        default_env=True,
    )
--- a/src/nio_llm/client.py
+++ b/src/nio_llm/client.py
@ -1,18 +1,15 @@
 """A Matrix client that uses Llama to respond to messages."""
 import logging
 import time
 from collections import deque
 from pathlib import Path
-from llama_cpp import Llama
+import openai
 from nio import AsyncClient, MatrixRoom, RoomMessageText
 logger = logging.getLogger("nio-llm.client")
 class LLMClient(AsyncClient):
-    """A Matrix client that uses Llama to respond to messages."""
+    """A Matrix client that uses llama.cpp to respond to messages."""
    def __init__(
        self,
@ -20,18 +17,33 @@ class LLMClient(AsyncClient):
        homeserver: str,
        device_id: str,
        preprompt: str,
        ggml_path: Path,
        room: str,
-    ):
+        openai_api_key: str,
        openai_api_endpoint: str,
        openai_temperature: float,
        openai_max_tokens: int,
    ) -> None:
        """Create a new LLMClient instance.
        Args:
-            username (`str`): The username to log in as.
+            username (`str`):
-            homeserver (`str`): The homeserver to connect to.
+                The username to log in as.
-            device_id (`str`): The device ID to use.
+            homeserver (`str`):
-            preprompt (`str`): The preprompt to use.
+                The homeserver to connect to.
-            ggml_path (`Path`): The path to the GGML model.
+            device_id (`str`):
-            room (`str`): The room to join.
+                The device ID to use.
            preprompt (`str`):
                The preprompt to use.
            room (`str`):
                The room to join.
            openai_api_key (`str`):
                The OpenAI API key to use.
            openai_api_endpoint (`str`):
                The OpenAI API endpoint to use.
            openai_temperature (`float`):
                The OpenAI temperature to use.
            openai_max_tokens (`int`):
                The OpenAI max tokens to use.
        """
        self.uid = f"@{username}:{homeserver.removeprefix('https://')}"
        self.spawn_time = time.time() * 1000
@ -39,20 +51,19 @@ class LLMClient(AsyncClient):
        self.preprompt = preprompt
        self.room = room
-        # create the AsyncClient instance
+        # setup openai settings
        openai.api_base = openai_api_endpoint
        openai.api_key = openai_api_key
        self.openai_temperature = openai_temperature
        self.openai_max_tokens = openai_max_tokens
        # create nio AsyncClient instance
        super().__init__(
            user=self.uid,
            homeserver=homeserver,
            device_id=device_id,
        )
        # create the Llama instance
        self.llm = Llama(
            model_path=str(ggml_path),
            n_threads=12,
            n_ctx=512 + 128,
        )
        # create message history queue
        self.history: deque[RoomMessageText] = deque(maxlen=10)
@ -63,8 +74,10 @@ class LLMClient(AsyncClient):
        """Process new messages as they come in.
        Args:
-            room (`MatrixRoom`): The room the message was sent in.
+            room (`MatrixRoom`):
-            event (`RoomMessageText`): The message event.
+                The room the message was sent in.
            event (`RoomMessageText`):
                The message event.
        """
        logger.debug(f"New RoomMessageText: {event.source}")
@ -93,6 +106,7 @@ class LLMClient(AsyncClient):
        # update history
        self.history.append(event)
        logger.debug(f"Updated history: {self.history}")
        # ignore our own messages
        if event.sender == self.user:
@ -107,51 +121,46 @@ class LLMClient(AsyncClient):
            and f'<a href="https://matrix.to/#/{self.uid}">{self.username}</a>'
            in event.source["content"]["formatted_body"]
        ):
-            logger.debug("Ignoring message not directed at us.")
+            logger.debug("Ignoring message not mentioning us.")
            return
        # generate prompt from message and history
        history = "\n".join(f"<{message.sender}>: {message.body}" for message in self.history)
        prompt = "\n".join([self.preprompt, history, f"<{self.uid}>:"])
        tokens = self.llm.tokenize(str.encode(prompt))
        logger.debug(f"Prompt:\n{prompt}")
        logger.debug(f"Tokens: {len(tokens)}")
        # ignore prompts that are too long
        if len(tokens) > 512:
            logger.debug("Prompt too long, skipping.")
            await self.room_send(
                room_id=self.room,
                message_type="m.room.message",
                content={
                    "msgtype": "m.emote",
                    "body": "reached prompt token limit",
                },
            )
            return
        # enable typing indicator
        await self.room_typing(
            self.room,
            typing_state=True,
-            timeout=100000000,
+            timeout=30000,
        )
        logger.debug("Enabled typing indicator.")
        # generate response using llama.cpp
-        senders = [f"<{message.sender}>" for message in self.history]
+        response = openai.ChatCompletion.create(
-        output = self.llm(
+            model="local-model",
-            prompt,
+            messages=[
-            max_tokens=128,
+                {
-            stop=[f"<{self.uid}>", "### Human", "### Assistant", *senders],
+                    "content": self.preprompt,
-            echo=True,
+                    "role": "system",
                },
                *[
                    {
                        "content": f"{message.sender}: {message.body}",
                        "role": "assistant" if message.sender == self.uid else "user",
                    }
                    for message in self.history
                ],
            ],
            stop=["<|im_end|>"],
            temperature=self.openai_temperature,
            max_tokens=self.openai_max_tokens,
        )
        logger.debug(f"Generated response: {response}")
        # retrieve the response
-        output = output["choices"][0]["text"]  # type: ignore
+        output = response["choices"][0]["message"]["content"]  # type: ignore
-        output = output.removeprefix(prompt).strip()
+        output = output.strip().removeprefix(f"{self.uid}:").strip()
        # disable typing indicator
        await self.room_typing(self.room, typing_state=False)
        logger.debug("Disabled typing indicator.")
        # send the response
        await self.room_send(
@ -162,8 +171,9 @@ class LLMClient(AsyncClient):
                "body": output,
            },
        )
        logger.debug(f"Sent response: {output}")
-    async def start(self, password, sync_timeout=30000):
+    async def start(self, password, sync_timeout=30000) -> None:
        """Start the client.
        Args:
--- a/src/nio_llm_server/Dockerfile
+++ b/src/nio_llm_server/Dockerfile
@ -0,0 +1,33 @@
 # Image for the model server: builds llama-cpp-python with its server extra
 # and starts it through run.sh.
 FROM python:3
 # Update and upgrade the existing packages, then install the native build
 # toolchain and the OpenBLAS headers used by the llama-cpp-python build below
 RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
        ninja-build \
        libopenblas-dev \
        build-essential
 RUN mkdir /app
 WORKDIR /app
 # install python dependencies
 RUN pip install --upgrade pip
 # provides the `huggingface-cli` command that run.sh uses to fetch the model
 RUN pip install huggingface_hub
 # build llama-cpp-python from git with BLAS enabled (OpenBLAS vendor)
 RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" \
    pip install llama-cpp-python[server]@git+https://github.com/abetlen/llama-cpp-python.git --verbose
 # Default settings read by run.sh: bind address and port, the Hugging Face
 # repo and file to download, the model alias, and the chat format
 ENV HOST=0.0.0.0
 ENV PORT=8000
 ENV HF_REPO=TheBloke/Mistral-7B-OpenOrca-GGUF
 ENV HF_FILE=mistral-7b-openorca.Q4_K_M.gguf
 ENV MODEL_ALIAS=local-model
 ENV CHAT_FORMAT=chatml
 # Expose a port for the server
 EXPOSE 8000
 COPY run.sh /app
 # Run the server start script (invoked with /bin/sh, so the script's own
 # shebang line is not used)
 CMD ["/bin/sh", "/app/run.sh"]
--- a/src/nio_llm_server/run.sh
+++ b/src/nio_llm_server/run.sh
@ -0,0 +1,6 @@
 #!/bin/bash
 # Download the configured model from the Hugging Face hub, then serve it with
 # the llama-cpp-python server.
 #
 # Reads from the environment: HF_REPO, HF_FILE, HOST, PORT, MODEL_ALIAS,
 # CHAT_FORMAT.

 # Abort on the first failing command or unset variable, so a failed download
 # can never start the server with an empty model path. Only POSIX options are
 # used because the image launches this script with /bin/sh.
 set -eu

 # First call fetches the file with normal progress output.
 huggingface-cli download "$HF_REPO" "$HF_FILE"
 # Second call is quiet, so only the local file path is captured.
 MODEL_PATH="$(huggingface-cli download --quiet "$HF_REPO" "$HF_FILE")"

 # exec replaces the shell, so the server receives container stop signals directly.
 exec python3 -m llama_cpp.server \
    --host "$HOST" \
    --port "$PORT" \
    --model "$MODEL_PATH" \
    --model_alias "$MODEL_ALIAS" \
    --chat_format "$CHAT_FORMAT"