♻️ big refactor, use llama server and openai python library

Laureηt 2023-10-11 10:31:47 +00:00
parent 541007380a
commit 904dde744f
Signed by: Laurent
SSH key fingerprint: SHA256:kZEpW8cMJ54PDeCvOhzreNr4FSh6R13CMGH/POoO8DI
16 changed files with 1215 additions and 914 deletions
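
In short, the bot no longer loads a GGML model in-process through llama-cpp-python; it now talks to a standalone llama.cpp server (started from docker-compose.yml) over an OpenAI-compatible API using the openai python library. A minimal sketch of the new request path, assuming the server from the compose file is reachable at http://localhost:8000/v1 and the pre-1.0 openai python API that client.py uses; the example prompt is purely illustrative:

import openai

# point the openai library at the local llama-cpp-python server
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "osftw"  # placeholder key, matching the default in main.py

response = openai.ChatCompletion.create(
    model="local-model",  # MODEL_ALIAS from the server Dockerfile
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    stop=["<|im_end|>"],  # ChatML end-of-turn token (CHAT_FORMAT=chatml)
    temperature=0,
    max_tokens=256,
)
print(response["choices"][0]["message"]["content"])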

1
.envrc

@@ -1 +0,0 @@
use flake

1
.gitignore vendored

@@ -1,4 +1,5 @@
.direnv
.venv
result
# https://github.com/github/gitignore/blob/main/Python.gitignore

.vscode/extensions.json

@@ -1,7 +1,9 @@
{
"recommendations": [
"editorconfig.editorconfig",
"charliermarsh.ruff",
"editorconfig.editorconfig",
"ms-python.black-formatter",
"ms-python.python",
"tamasfe.even-better-toml",
]
}

40
.vscode/settings.json vendored

@@ -1,30 +1,38 @@
{
"python.analysis.typeCheckingMode": "basic",
"python.formatting.provider": "black",
// nice editor settings
"editor.formatOnSave": true,
"python.linting.enabled": true,
"python.linting.lintOnSave": true,
"python.linting.mypyEnabled": true,
"python.linting.banditEnabled": true,
"python.languageServer": "Pylance",
"[python]": {
"editor.codeActionsOnSave": {
"source.organizeImports": true
}
},
"terminal.integrated.env.linux": {
"PYTHONPATH": "${workspaceFolder}"
},
"editor.formatOnPaste": true,
"editor.rulers": [
120
],
// editorconfig redundancy
"files.insertFinalNewline": true,
"files.trimTrailingWhitespace": true,
// hide unimportant files/folders
"files.exclude": {
// defaults
"**/.git": true,
"**/.svn": true,
"**/.hg": true,
"**/CVS": true,
"**/.DS_Store": true,
"**/Thumbs.db": true,
// annoying
"**/__pycache__": true,
"**/.mypy_cache": true,
"**/.direnv": true,
"**/.ruff_cache": true,
"**/*.tmp": true,
},
// python settings
"python.analysis.typeCheckingMode": "basic", // get ready to be annoyed
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.codeActionsOnSave": {
"source.organizeImports.ruff": true,
"source.fixAll": true,
}
},
"terminal.integrated.env.linux": {
"PYTHONPATH": "${workspaceFolder}/src/",
},
}

29
docker-compose.yml Normal file

@@ -0,0 +1,29 @@
version: "3.8"
services:
server:
build:
context: src/nio_llm_server/
dockerfile: Dockerfile
ports:
- 8000:8000
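# reuse the host's HuggingFace cache so the model is not re-downloaded on every container start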
volumes:
- /home/laurent/.cache/huggingface/hub/:/root/.cache/huggingface/hub/
healthcheck:
test: ["CMD", "nc", "-z", "-v", "localhost", "8000"]
restart: unless-stopped
client:
build:
context: src/nio_llm/
dockerfile: Dockerfile
environment:
- NIOLLM_HOMESERVER=$NIOLLM_HOMESERVER
- NIOLLM_USERNAME=$NIOLLM_USERNAME
- NIOLLM_DEVICE_ID=$NIOLLM_DEVICE_ID
- NIOLLM_ROOM=$NIOLLM_ROOM
- NIOLLM_PASSWORD=$NIOLLM_PASSWORD
- NIOLLM_OPENAI_API_ENDPOINT=$NIOLLM_OPENAI_API_ENDPOINT
depends_on:
server:
condition: service_healthy
restart: unless-stopped

flake.lock

@@ -1,85 +0,0 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1685518550,
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1686501370,
"narHash": "sha256-G0WuM9fqTPRc2URKP9Lgi5nhZMqsfHGrdEbrLvAPJcg=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "75a5ebf473cd60148ba9aec0d219f72e5cf52519",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"poetry2nix": {
"inputs": {
"flake-utils": [
"flake-utils"
],
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1686140708,
"narHash": "sha256-CKTahDFlhx07OQb4Afj+4/cNaxIWfxb8VGUlllUgoPY=",
"owner": "nix-community",
"repo": "poetry2nix",
"rev": "d91e2dd14caf4d09240bedf69a778c88f356ebda",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "poetry2nix",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs",
"poetry2nix": "poetry2nix"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

flake.nix

@@ -1,40 +0,0 @@
{
description = "nio-llm";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
flake-utils.url = "github:numtide/flake-utils";
poetry2nix = {
url = "github:nix-community/poetry2nix";
inputs = {
nixpkgs.follows = "nixpkgs";
flake-utils.follows = "flake-utils";
};
};
};
outputs = { nixpkgs, flake-utils, poetry2nix, ... }:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = import nixpkgs {
inherit system;
overlays = [ poetry2nix.overlay ];
};
pythonEnv = pkgs.poetry2nix.mkPoetryEnv {
projectDir = ./.;
preferWheels = true;
python = pkgs.python311;
};
in {
packages.default = pkgs.poetry2nix.mkPoetryApplication {
projectDir = ./.;
preferWheels = true;
python = pkgs.python311;
};
devShells.default =
pkgs.mkShell { buildInputs = [ pythonEnv pkgs.poetry ]; };
});
}

1633
poetry.lock generated

File diff suppressed because it is too large

3
poetry.toml Normal file

@@ -0,0 +1,3 @@
[virtualenvs]
create = true
in-project = true

pyproject.toml

@@ -6,7 +6,7 @@ license = "MIT"
name = "nio-llm"
readme = "README.md"
repository = "https://github.com/Laurent2916/nio-llm.git"
version = "0.1.0"
version = "1.0.0"
[tool.poetry.scripts]
nio-llm = "nio_llm.main:main"
@@ -29,9 +29,21 @@ mypy = "^1.3.0"
ruff = "^0.0.267"
[tool.ruff]
target-version = "py311"
line-length = 120
ignore-init-module-imports = true
include = [
"*.py", # regular python files
"*.pyi", # python stub files
"*.ipynb", # jupyter notebooks
"**/pyproject.toml", # python config files
]
ignore = [
"G004", # Logging statement uses f-string
"EM102", # Exception must not use an f-string literal, assign to variable first
"D100", # Missing docstring in public module
"D104", # Missing docstring in public package
"N812", # Lowercase imported as non lowercase
]
select = [
"A", # flake8-builtins
@@ -47,6 +59,8 @@ select = [
"N", # pep8-naming
"PIE", # flake8-pie
"PTH", # flake8-use-pathlib
"TD", # flake8-todo
"FIX", # flake8-fixme
"RET", # flake8-return
"RUF", # ruff
"S", # flake8-bandit
@@ -59,26 +73,28 @@ select = [
[tool.ruff.pydocstyle]
convention = "google"
[tool.ruff.isort]
known-first-party = ["nio_llm"]
[tool.ruff.per-file-ignores]
"__init__.py" = [
"F401", # Imported but unused
]
"src/aube/main.py" = [
"F401", # Imported but unused
"E402", # Module level import not at top of file
]
[tool.ruff.mccabe]
max-complexity = 5 # C901
[tool.black]
include = '\.pyi?$'
target-version = ["py311"]
line-length = 120
exclude = '''
/(
\.git
\.venv
)/
'''
[tool.isort]
multi_line_output = 3
profile = "black"
[tool.mypy]
python_version = "3.11"
warn_return_any = true
warn_unused_configs = true
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
include = '\.pyi?$'
line-length = 120
target-version = ["py311"]

21
src/nio_llm/Dockerfile Normal file

@@ -0,0 +1,21 @@
FROM python:3.9
# Update and upgrade the existing packages
RUN apt-get update && \
apt-get upgrade -y
RUN mkdir /app
WORKDIR /app
# install python dependencies
RUN pip install --upgrade pip
RUN pip install \
jsonargparse[signatures] \
git+https://github.com/abetlen/llama-cpp-python.git \
matrix-nio \
openai \
rich
COPY *.py /app/nio_llm/
ENV PYTHONPATH=/app
CMD ["/usr/local/bin/python", "/app/nio_llm"]

src/nio_llm/main.py

@@ -2,9 +2,7 @@
import asyncio
import logging
from pathlib import Path
from huggingface_hub import hf_hub_download
from jsonargparse import CLI
from rich.logging import RichHandler
@@ -15,53 +13,62 @@ logger = logging.getLogger("nio-llm.main")
def main(
room: str,
password: str,
username: str,
preprompt: str,
password: str,
preprompt: str = "You are a helpful assistant in a multi-agent [matrix] conversation.",
device_id: str = "nio-llm",
homeserver: str = "https://matrix.org",
ggml_repoid: str = "TheBloke/stable-vicuna-13B-GGML",
ggml_filename: str = "stable-vicuna-13B.ggmlv3.q5_1.bin",
sync_timeout: int = 30000,
openai_api_key: str = "osftw",
openai_api_endpoint: str = "http://localhost:8000/v1",
openai_temperature: float = 0,
openai_max_tokens: int = 256,
) -> None:
"""Download llama model from HuggingFace and start the client.
"""Instantiate and start the client.
Args:
room (`str`):
The room to join.
password (`str`):
The password to log in with.
username (`str`):
The username to log in as.
device_id (`str`):
The device ID to use.
password (`str`):
The password to log in with.
preprompt (`str`):
The preprompt to use.
ggml_repoid (`str`, default `"TheBloke/stable-vicuna-13B-GGML"`):
The HuggingFace Hub repo ID to download the model from.
ggml_filename (`str`, default `"stable-vicuna-13B.ggmlv3.q5_1.bin"`):
The HuggingFace Hub filename to download the model from.
homeserver (`str`, default `"matrix.org"`):
The homeserver to connect to.
sync_timeout (`int`, default `30000`):
Defaults to `"You are a helpful assistant."`.
device_id (`str`):
The device ID to use.
Defaults to `"nio-llm"`.
homeserver (`str`):
The matrix homeserver to connect to.
Defaults to `"https://matrix.org"`.
sync_timeout (`int`):
The timeout to use when syncing with the homeserver.
Defaults to `30000`.
openai_api_key (`str`):
The OpenAI API key to use.
Defaults to `"osftw"`.
openai_api_endpoint (`str`):
The OpenAI API endpoint to use.
Defaults to `"http://localhost:8000/v1"`.
openai_temperature (`float`):
The OpenAI temperature to use.
Defaults to `0`.
openai_max_tokens (`int`):
The OpenAI max tokens to use.
Defaults to `256`.
"""
# download the model
ggml_path = Path(
hf_hub_download(
repo_id=ggml_repoid,
filename=ggml_filename,
),
)
# create the client
client = LLMClient(
room=room,
username=username,
device_id=device_id,
ggml_path=ggml_path,
preprompt=preprompt,
homeserver=homeserver,
openai_api_key=openai_api_key,
openai_api_endpoint=openai_api_endpoint,
openai_temperature=openai_temperature,
openai_max_tokens=openai_max_tokens,
)
# start the client
@@ -86,6 +93,6 @@ if __name__ == "__main__":
CLI(
components=main,
as_positional=False,
env_prefix="NIO_LLM",
env_prefix="NIOLLM",
default_env=True,
)

src/nio_llm/client.py

@@ -1,18 +1,15 @@
"""A Matrix client that uses Llama to respond to messages."""
import logging
import time
from collections import deque
from pathlib import Path
from llama_cpp import Llama
import openai
from nio import AsyncClient, MatrixRoom, RoomMessageText
logger = logging.getLogger("nio-llm.client")
class LLMClient(AsyncClient):
"""A Matrix client that uses Llama to respond to messages."""
"""A Matrix client that uses llama.cpp to respond to messages."""
def __init__(
self,
@@ -20,18 +17,33 @@ class LLMClient(AsyncClient):
homeserver: str,
device_id: str,
preprompt: str,
ggml_path: Path,
room: str,
):
openai_api_key: str,
openai_api_endpoint: str,
openai_temperature: float,
openai_max_tokens: int,
) -> None:
"""Create a new LLMClient instance.
Args:
username (`str`): The username to log in as.
homeserver (`str`): The homeserver to connect to.
device_id (`str`): The device ID to use.
preprompt (`str`): The preprompt to use.
ggml_path (`Path`): The path to the GGML model.
room (`str`): The room to join.
username (`str`):
The username to log in as.
homeserver (`str`):
The homeserver to connect to.
device_id (`str`):
The device ID to use.
preprompt (`str`):
The preprompt to use.
room (`str`):
The room to join.
openai_api_key (`str`):
The OpenAI API key to use.
openai_api_endpoint (`str`):
The OpenAI API endpoint to use.
openai_temperature (`float`):
The OpenAI temperature to use.
openai_max_tokens (`int`):
The OpenAI max tokens to use.
"""
self.uid = f"@{username}:{homeserver.removeprefix('https://')}"
self.spawn_time = time.time() * 1000
@@ -39,20 +51,19 @@ class LLMClient(AsyncClient):
self.preprompt = preprompt
self.room = room
# create the AsyncClient instance
# setup openai settings
openai.api_base = openai_api_endpoint
openai.api_key = openai_api_key
self.openai_temperature = openai_temperature
self.openai_max_tokens = openai_max_tokens
# create nio AsyncClient instance
super().__init__(
user=self.uid,
homeserver=homeserver,
device_id=device_id,
)
# create the Llama instance
self.llm = Llama(
model_path=str(ggml_path),
n_threads=12,
n_ctx=512 + 128,
)
# create message history queue
self.history: deque[RoomMessageText] = deque(maxlen=10)
@@ -63,8 +74,10 @@ class LLMClient(AsyncClient):
"""Process new messages as they come in.
Args:
room (`MatrixRoom`): The room the message was sent in.
event (`RoomMessageText`): The message event.
room (`MatrixRoom`):
The room the message was sent in.
event (`RoomMessageText`):
The message event.
"""
logger.debug(f"New RoomMessageText: {event.source}")
@@ -93,6 +106,7 @@ class LLMClient(AsyncClient):
# update history
self.history.append(event)
logger.debug(f"Updated history: {self.history}")
# ignore our own messages
if event.sender == self.user:
@@ -107,51 +121,46 @@ class LLMClient(AsyncClient):
and f'<a href="https://matrix.to/#/{self.uid}">{self.username}</a>'
in event.source["content"]["formatted_body"]
):
logger.debug("Ignoring message not directed at us.")
return
# generate prompt from message and history
history = "\n".join(f"<{message.sender}>: {message.body}" for message in self.history)
prompt = "\n".join([self.preprompt, history, f"<{self.uid}>:"])
tokens = self.llm.tokenize(str.encode(prompt))
logger.debug(f"Prompt:\n{prompt}")
logger.debug(f"Tokens: {len(tokens)}")
# ignore prompts that are too long
if len(tokens) > 512:
logger.debug("Prompt too long, skipping.")
await self.room_send(
room_id=self.room,
message_type="m.room.message",
content={
"msgtype": "m.emote",
"body": "reached prompt token limit",
},
)
logger.debug("Ignoring message not mentioning us.")
return
# enable typing indicator
await self.room_typing(
self.room,
typing_state=True,
timeout=100000000,
timeout=30000,
)
logger.debug("Enabled typing indicator.")
# generate response using llama.cpp
senders = [f"<{message.sender}>" for message in self.history]
output = self.llm(
prompt,
max_tokens=128,
stop=[f"<{self.uid}>", "### Human", "### Assistant", *senders],
echo=True,
response = openai.ChatCompletion.create(
model="local-model",
messages=[
{
"content": self.preprompt,
"role": "system",
},
*[
{
"content": f"{message.sender}: {message.body}",
"role": "assistant" if message.sender == self.uid else "user",
}
for message in self.history
],
],
stop=["<|im_end|>"],
temperature=self.openai_temperature,
max_tokens=self.openai_max_tokens,
)
logger.debug(f"Generated response: {response}")
# retrieve the response
output = output["choices"][0]["text"] # type: ignore
output = output.removeprefix(prompt).strip()
output = response["choices"][0]["message"]["content"] # type: ignore
output = output.strip().removeprefix(f"{self.uid}:").strip()
# disable typing indicator
await self.room_typing(self.room, typing_state=False)
logger.debug("Disabled typing indicator.")
# send the response
await self.room_send(
@@ -162,8 +171,9 @@ class LLMClient(AsyncClient):
"body": output,
},
)
logger.debug(f"Sent response: {output}")
async def start(self, password, sync_timeout=30000):
async def start(self, password, sync_timeout=30000) -> None:
"""Start the client.
Args:

src/nio_llm_server/Dockerfile Normal file

@@ -0,0 +1,33 @@
FROM python:3
# Update and upgrade the existing packages
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install -y --no-install-recommends \
ninja-build \
libopenblas-dev \
build-essential
RUN mkdir /app
WORKDIR /app
# install python dependencies
RUN pip install --upgrade pip
RUN pip install huggingface_hub
RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" \
pip install llama-cpp-python[server]@git+https://github.com/abetlen/llama-cpp-python.git --verbose
# Set environment variable for the host
ENV HOST=0.0.0.0
ENV PORT=8000
ENV HF_REPO=TheBloke/Mistral-7B-OpenOrca-GGUF
ENV HF_FILE=mistral-7b-openorca.Q4_K_M.gguf
ENV MODEL_ALIAS=local-model
ENV CHAT_FORMAT=chatml
# Expose a port for the server
EXPOSE 8000
COPY run.sh /app
# Run the server start script
CMD ["/bin/sh", "/app/run.sh"]

src/nio_llm_server/run.sh Normal file

@@ -0,0 +1,6 @@
#!/bin/bash
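# download the model into the local HuggingFace cache, then re-run quietly to capture the cached file path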
huggingface-cli download $HF_REPO $HF_FILE
MODEL_PATH=`huggingface-cli download --quiet $HF_REPO $HF_FILE`
python3 -m llama_cpp.server --host $HOST --port $PORT --model $MODEL_PATH --model_alias $MODEL_ALIAS --chat_format $CHAT_FORMAT