♻️ big refactor, use llama server and openai python library

Laureηt 2023-10-11 10:31:47 +00:00
parent 541007380a
commit 904dde744f
Signed by: Laurent
SSH key fingerprint: SHA256:kZEpW8cMJ54PDeCvOhzreNr4FSh6R13CMGH/POoO8DI
16 changed files with 1215 additions and 914 deletions

1 .envrc

@@ -1 +0,0 @@
use flake

1 .gitignore vendored

@@ -1,4 +1,5 @@
 .direnv
+.venv
 result
 # https://github.com/github/gitignore/blob/main/Python.gitignore

4 .vscode/extensions.json vendored

@@ -1,7 +1,9 @@
 {
     "recommendations": [
-        "editorconfig.editorconfig",
         "charliermarsh.ruff",
+        "editorconfig.editorconfig",
+        "ms-python.black-formatter",
         "ms-python.python",
+        "tamasfe.even-better-toml",
     ]
 }

40 .vscode/settings.json vendored

@@ -1,30 +1,38 @@
 {
-    "python.analysis.typeCheckingMode": "basic",
-    "python.formatting.provider": "black",
+    // nice editor settings
     "editor.formatOnSave": true,
-    "python.linting.enabled": true,
-    "python.linting.lintOnSave": true,
-    "python.linting.mypyEnabled": true,
-    "python.linting.banditEnabled": true,
-    "python.languageServer": "Pylance",
-    "[python]": {
-        "editor.codeActionsOnSave": {
-            "source.organizeImports": true
-        }
-    },
-    "terminal.integrated.env.linux": {
-        "PYTHONPATH": "${workspaceFolder}"
-    },
+    "editor.formatOnPaste": true,
+    "editor.rulers": [
+        120
+    ],
+    // editorconfig redundancy
+    "files.insertFinalNewline": true,
+    "files.trimTrailingWhitespace": true,
+    // hide unimportant files/folders
     "files.exclude": {
+        // defaults
         "**/.git": true,
         "**/.svn": true,
         "**/.hg": true,
         "**/CVS": true,
         "**/.DS_Store": true,
         "**/Thumbs.db": true,
+        // annoying
         "**/__pycache__": true,
         "**/.mypy_cache": true,
-        "**/.direnv": true,
         "**/.ruff_cache": true,
+        "**/*.tmp": true,
+    },
+    // python settings
+    "python.analysis.typeCheckingMode": "basic", // get ready to be annoyed
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter",
+        "editor.codeActionsOnSave": {
+            "source.organizeImports.ruff": true,
+            "source.fixAll": true,
+        }
+    },
+    "terminal.integrated.env.linux": {
+        "PYTHONPATH": "${workspaceFolder}/src/",
     },
 }

29 docker-compose.yml Normal file

@@ -0,0 +1,29 @@
version: "3.8"

services:
  server:
    build:
      context: src/nio_llm_server/
      dockerfile: Dockerfile
    ports:
      - 8000:8000
    volumes:
      - /home/laurent/.cache/huggingface/hub/:/root/.cache/huggingface/hub/
    healthcheck:
      test: ["CMD", "nc", "-z", "-v", "localhost", "8000"]
    restart: unless-stopped

  client:
    build:
      context: src/nio_llm/
      dockerfile: Dockerfile
    environment:
      - NIOLLM_HOMESERVER=$NIOLLM_HOMESERVER
      - NIOLLM_USERNAME=$NIOLLM_USERNAME
      - NIOLLM_DEVICE_ID=$NIOLLM_DEVICE_ID
      - NIOLLM_ROOM=$NIOLLM_ROOM
      - NIOLLM_PASSWORD=$NIOLLM_PASSWORD
      - NIOLLM_OPENAI_API_ENDPOINT=$NIOLLM_OPENAI_API_ENDPOINT
    depends_on:
      server:
        condition: service_healthy
    restart: unless-stopped
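
Editor's note: a quick way to smoke-test the server container on its own might look like the sketch below (not part of the commit). It reuses the same pre-1.0 openai calls as the new client; the endpoint, placeholder API key, and "local-model" alias are the ones introduced by this refactor, the prompt text is illustrative.

import openai

# assumptions: `docker compose up server` is running on localhost:8000 and
# the pre-1.0 openai package (e.g. 0.28.x) is installed, as in client.py
openai.api_base = "http://localhost:8000/v1"  # llama-cpp-python server endpoint
openai.api_key = "osftw"  # any non-empty string; a local server ignores it

response = openai.ChatCompletion.create(
    model="local-model",  # MODEL_ALIAS from the server Dockerfile
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello."},
    ],
    max_tokens=32,
)
print(response["choices"][0]["message"]["content"])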

85 flake.lock

@@ -1,85 +0,0 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1685518550,
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1686501370,
"narHash": "sha256-G0WuM9fqTPRc2URKP9Lgi5nhZMqsfHGrdEbrLvAPJcg=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "75a5ebf473cd60148ba9aec0d219f72e5cf52519",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"poetry2nix": {
"inputs": {
"flake-utils": [
"flake-utils"
],
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1686140708,
"narHash": "sha256-CKTahDFlhx07OQb4Afj+4/cNaxIWfxb8VGUlllUgoPY=",
"owner": "nix-community",
"repo": "poetry2nix",
"rev": "d91e2dd14caf4d09240bedf69a778c88f356ebda",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "poetry2nix",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs",
"poetry2nix": "poetry2nix"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

40 flake.nix

@@ -1,40 +0,0 @@
{
  description = "nio-llm";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
    flake-utils.url = "github:numtide/flake-utils";
    poetry2nix = {
      url = "github:nix-community/poetry2nix";
      inputs = {
        nixpkgs.follows = "nixpkgs";
        flake-utils.follows = "flake-utils";
      };
    };
  };

  outputs = { nixpkgs, flake-utils, poetry2nix, ... }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs {
          inherit system;
          overlays = [ poetry2nix.overlay ];
        };
        pythonEnv = pkgs.poetry2nix.mkPoetryEnv {
          projectDir = ./.;
          preferWheels = true;
          python = pkgs.python311;
        };
      in {
        packages.default = pkgs.poetry2nix.mkPoetryApplication {
          projectDir = ./.;
          preferWheels = true;
          python = pkgs.python311;
        };

        devShells.default =
          pkgs.mkShell { buildInputs = [ pythonEnv pkgs.poetry ]; };
      });
}

1633 poetry.lock generated

File diff suppressed because it is too large.

3 poetry.toml Normal file

@@ -0,0 +1,3 @@
[virtualenvs]
create = true
in-project = true

pyproject.toml

@@ -6,7 +6,7 @@ license = "MIT"
 name = "nio-llm"
 readme = "README.md"
 repository = "https://github.com/Laurent2916/nio-llm.git"
-version = "0.1.0"
+version = "1.0.0"
 
 [tool.poetry.scripts]
 nio-llm = "nio_llm.main:main"
@@ -29,9 +29,21 @@ mypy = "^1.3.0"
 ruff = "^0.0.267"
 
 [tool.ruff]
+target-version = "py311"
 line-length = 120
+ignore-init-module-imports = true
+include = [
+    "*.py", # regular python files
+    "*.pyi", # python stub files
+    "*.ipynb", # jupyter notebooks
+    "**/pyproject.toml", # python config files
+]
 ignore = [
     "G004", # Logging statement uses f-string
+    "EM102", # Exception must not use an f-string literal, assign to variable first
+    "D100", # Missing docstring in public module
+    "D104", # Missing docstring in public package
+    "N812", # Lowercase imported as non lowercase
 ]
 select = [
     "A", # flake8-builtins
@@ -47,6 +59,8 @@ select = [
     "N", # pep8-naming
     "PIE", # flake8-pie
     "PTH", # flake8-use-pathlib
+    "TD", # flake8-todo
+    "FIX", # flake8-fixme
     "RET", # flake8-return
     "RUF", # ruff
     "S", # flake8-bandit
@@ -59,26 +73,28 @@
 [tool.ruff.pydocstyle]
 convention = "google"
 
+[tool.ruff.isort]
+known-first-party = ["nio_llm"]
+
+[tool.ruff.per-file-ignores]
+"__init__.py" = [
+    "F401", # Imported but unused
+]
+"src/aube/main.py" = [
+    "F401", # Imported but unused
+    "E402", # Module level import not at top of file
+]
+
+[tool.ruff.mccabe]
+max-complexity = 5 # C901
+
 [tool.black]
-include = '\.pyi?$'
-target-version = ["py311"]
-line-length = 120
 exclude = '''
 /(
     \.git
     \.venv
 )/
 '''
-
-[tool.isort]
-multi_line_output = 3
-profile = "black"
-
-[tool.mypy]
-python_version = "3.11"
-warn_return_any = true
-warn_unused_configs = true
-
-[build-system]
-requires = ["poetry-core>=1.0.0"]
-build-backend = "poetry.core.masonry.api"
+include = '\.pyi?$'
+line-length = 120
+target-version = ["py311"]

21 src/nio_llm/Dockerfile Normal file

@@ -0,0 +1,21 @@
FROM python:3.9

# Update and upgrade the existing packages
RUN apt-get update && \
    apt-get upgrade -y

RUN mkdir /app
WORKDIR /app

# install python dependencies
RUN pip install --upgrade pip
RUN pip install \
    jsonargparse[signatures] \
    git+https://github.com/abetlen/llama-cpp-python.git \
    matrix-nio \
    openai \
    rich

COPY *.py /app/nio_llm/

ENV PYTHONPATH=/app

CMD ["/usr/local/bin/python", "/app/nio_llm"]

src/nio_llm/main.py

@@ -2,9 +2,7 @@
 import asyncio
 import logging
-from pathlib import Path
 
-from huggingface_hub import hf_hub_download
 from jsonargparse import CLI
 from rich.logging import RichHandler
@@ -15,53 +13,62 @@ logger = logging.getLogger("nio-llm.main")
 def main(
     room: str,
-    password: str,
     username: str,
-    preprompt: str,
+    password: str,
+    preprompt: str = "You are a helpful assistant in a multi-agent [matrix] conversation.",
     device_id: str = "nio-llm",
     homeserver: str = "https://matrix.org",
-    ggml_repoid: str = "TheBloke/stable-vicuna-13B-GGML",
-    ggml_filename: str = "stable-vicuna-13B.ggmlv3.q5_1.bin",
     sync_timeout: int = 30000,
+    openai_api_key: str = "osftw",
+    openai_api_endpoint: str = "http://localhost:8000/v1",
+    openai_temperature: float = 0,
+    openai_max_tokens: int = 256,
 ) -> None:
-    """Download llama model from HuggingFace and start the client.
+    """Instantiate and start the client.
 
     Args:
         room (`str`):
             The room to join.
-        password (`str`):
-            The password to log in with.
         username (`str`):
             The username to log in as.
-        device_id (`str`):
-            The device ID to use.
+        password (`str`):
+            The password to log in with.
         preprompt (`str`):
             The preprompt to use.
-        ggml_repoid (`str`, default `"TheBloke/stable-vicuna-13B-GGML"`):
-            The HuggingFace Hub repo ID to download the model from.
-        ggml_filename (`str`, default `"stable-vicuna-13B.ggmlv3.q5_1.bin"`):
-            The HuggingFace Hub filename to download the model from.
-        homeserver (`str`, default `"matrix.org"`):
-            The homeserver to connect to.
-        sync_timeout (`int`, default `30000`):
+            Defaults to `"You are a helpful assistant."`.
+        device_id (`str`):
+            The device ID to use.
+            Defaults to `"nio-llm"`.
+        homeserver (`str`):
+            The matrix homeserver to connect to.
+            Defaults to `"https://matrix.org"`.
+        sync_timeout (`int`):
             The timeout to use when syncing with the homeserver.
+            Defaults to `30000`.
+        openai_api_key (`str`):
+            The OpenAI API key to use.
+            Defaults to `"osftw"`.
+        openai_api_endpoint (`str`):
+            The OpenAI API endpoint to use.
+            Defaults to `"http://localhost:8000/v1"`.
+        openai_temperature (`float`):
+            The OpenAI temperature to use.
+            Defaults to `0`.
+        openai_max_tokens (`int`):
+            The OpenAI max tokens to use.
+            Defaults to `256`.
     """
-    # download the model
-    ggml_path = Path(
-        hf_hub_download(
-            repo_id=ggml_repoid,
-            filename=ggml_filename,
-        ),
-    )
-
     # create the client
     client = LLMClient(
         room=room,
         username=username,
         device_id=device_id,
-        ggml_path=ggml_path,
         preprompt=preprompt,
         homeserver=homeserver,
+        openai_api_key=openai_api_key,
+        openai_api_endpoint=openai_api_endpoint,
+        openai_temperature=openai_temperature,
+        openai_max_tokens=openai_max_tokens,
     )
 
     # start the client
@@ -86,6 +93,6 @@ if __name__ == "__main__":
     CLI(
         components=main,
         as_positional=False,
-        env_prefix="NIO_LLM",
+        env_prefix="NIOLLM",
         default_env=True,
     )
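
Editor's note on the CLI change above: with default_env=True and env_prefix="NIOLLM", every keyword of main() can also come from an environment variable, which is how docker-compose.yml feeds the client its NIOLLM_* settings. A minimal, hypothetical stand-alone sketch of the same jsonargparse pattern (the serve function and its parameters are illustrative only):

from jsonargparse import CLI


def serve(room: str, homeserver: str = "https://matrix.org") -> None:
    """Placeholder for starting the client; just prints the resolved settings."""
    print(f"joining {room} on {homeserver}")


if __name__ == "__main__":
    # e.g. `NIOLLM_ROOM="#test:matrix.org" python serve.py` is equivalent
    # to `python serve.py --room "#test:matrix.org"`
    CLI(
        components=serve,
        as_positional=False,
        env_prefix="NIOLLM",
        default_env=True,
    )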

src/nio_llm/client.py

@@ -1,18 +1,15 @@
-"""A Matrix client that uses Llama to respond to messages."""
 import logging
 import time
 from collections import deque
-from pathlib import Path
 
-from llama_cpp import Llama
+import openai
 from nio import AsyncClient, MatrixRoom, RoomMessageText
 
 logger = logging.getLogger("nio-llm.client")
 
 
 class LLMClient(AsyncClient):
-    """A Matrix client that uses Llama to respond to messages."""
+    """A Matrix client that uses llama.cpp to respond to messages."""
 
     def __init__(
         self,
@@ -20,18 +17,33 @@
         homeserver: str,
         device_id: str,
         preprompt: str,
-        ggml_path: Path,
         room: str,
-    ):
+        openai_api_key: str,
+        openai_api_endpoint: str,
+        openai_temperature: float,
+        openai_max_tokens: int,
+    ) -> None:
         """Create a new LLMClient instance.
 
         Args:
-            username (`str`): The username to log in as.
-            homeserver (`str`): The homeserver to connect to.
-            device_id (`str`): The device ID to use.
-            preprompt (`str`): The preprompt to use.
-            ggml_path (`Path`): The path to the GGML model.
-            room (`str`): The room to join.
+            username (`str`):
+                The username to log in as.
+            homeserver (`str`):
+                The homeserver to connect to.
+            device_id (`str`):
+                The device ID to use.
+            preprompt (`str`):
+                The preprompt to use.
+            room (`str`):
+                The room to join.
+            openai_api_key (`str`):
+                The OpenAI API key to use.
+            openai_api_endpoint (`str`):
+                The OpenAI API endpoint to use.
+            openai_temperature (`float`):
+                The OpenAI temperature to use.
+            openai_max_tokens (`int`):
+                The OpenAI max tokens to use.
         """
         self.uid = f"@{username}:{homeserver.removeprefix('https://')}"
         self.spawn_time = time.time() * 1000
@@ -39,20 +51,19 @@
         self.preprompt = preprompt
         self.room = room
 
-        # create the AsyncClient instance
+        # setup openai settings
+        openai.api_base = openai_api_endpoint
+        openai.api_key = openai_api_key
+        self.openai_temperature = openai_temperature
+        self.openai_max_tokens = openai_max_tokens
+
+        # create nio AsyncClient instance
         super().__init__(
             user=self.uid,
             homeserver=homeserver,
             device_id=device_id,
         )
 
-        # create the Llama instance
-        self.llm = Llama(
-            model_path=str(ggml_path),
-            n_threads=12,
-            n_ctx=512 + 128,
-        )
-
         # create message history queue
         self.history: deque[RoomMessageText] = deque(maxlen=10)
@@ -63,8 +74,10 @@
         """Process new messages as they come in.
 
         Args:
-            room (`MatrixRoom`): The room the message was sent in.
-            event (`RoomMessageText`): The message event.
+            room (`MatrixRoom`):
+                The room the message was sent in.
+            event (`RoomMessageText`):
+                The message event.
         """
         logger.debug(f"New RoomMessageText: {event.source}")
@@ -93,6 +106,7 @@
         # update history
         self.history.append(event)
+        logger.debug(f"Updated history: {self.history}")
 
         # ignore our own messages
         if event.sender == self.user:
@@ -107,51 +121,46 @@
             and f'<a href="https://matrix.to/#/{self.uid}">{self.username}</a>'
             in event.source["content"]["formatted_body"]
         ):
-            logger.debug("Ignoring message not directed at us.")
-            return
-
-        # generate prompt from message and history
-        history = "\n".join(f"<{message.sender}>: {message.body}" for message in self.history)
-        prompt = "\n".join([self.preprompt, history, f"<{self.uid}>:"])
-        tokens = self.llm.tokenize(str.encode(prompt))
-        logger.debug(f"Prompt:\n{prompt}")
-        logger.debug(f"Tokens: {len(tokens)}")
-
-        # ignore prompts that are too long
-        if len(tokens) > 512:
-            logger.debug("Prompt too long, skipping.")
-            await self.room_send(
-                room_id=self.room,
-                message_type="m.room.message",
-                content={
-                    "msgtype": "m.emote",
-                    "body": "reached prompt token limit",
-                },
-            )
+            logger.debug("Ignoring message not mentioning us.")
             return
 
         # enable typing indicator
         await self.room_typing(
             self.room,
             typing_state=True,
-            timeout=100000000,
+            timeout=30000,
         )
+        logger.debug("Enabled typing indicator.")
 
         # generate response using llama.cpp
-        senders = [f"<{message.sender}>" for message in self.history]
-        output = self.llm(
-            prompt,
-            max_tokens=128,
-            stop=[f"<{self.uid}>", "### Human", "### Assistant", *senders],
-            echo=True,
+        response = openai.ChatCompletion.create(
+            model="local-model",
+            messages=[
+                {
+                    "content": self.preprompt,
+                    "role": "system",
+                },
+                *[
+                    {
+                        "content": f"{message.sender}: {message.body}",
+                        "role": "assistant" if message.sender == self.uid else "user",
+                    }
+                    for message in self.history
+                ],
+            ],
+            stop=["<|im_end|>"],
+            temperature=self.openai_temperature,
+            max_tokens=self.openai_max_tokens,
        )
+        logger.debug(f"Generated response: {response}")
 
         # retreive the response
-        output = output["choices"][0]["text"]  # type: ignore
-        output = output.removeprefix(prompt).strip()
+        output = response["choices"][0]["message"]["content"]  # type: ignore
+        output = output.strip().removeprefix(f"{self.uid}:").strip()
 
         # disable typing indicator
         await self.room_typing(self.room, typing_state=False)
+        logger.debug("Disabled typing indicator.")
 
         # send the response
         await self.room_send(
@@ -162,8 +171,9 @@
                 "body": output,
             },
         )
+        logger.debug(f"Sent response: {output}")
 
-    async def start(self, password, sync_timeout=30000):
+    async def start(self, password, sync_timeout=30000) -> None:
         """Start the client.
 
         Args:
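
Editor's note: the message-building logic added above can be read as a small pure function. This sketch is not part of the commit; sender and body mirror the RoomMessageText attributes the client actually uses, and uid is the bot's own Matrix id.

def build_messages(history, uid: str, preprompt: str) -> list[dict[str, str]]:
    """Map recent room messages onto OpenAI-style chat messages."""
    messages = [{"content": preprompt, "role": "system"}]
    for message in history:  # any iterable of objects with .sender and .body
        messages.append(
            {
                "content": f"{message.sender}: {message.body}",
                # the bot's own past messages become "assistant" turns,
                # everything else is treated as a "user" turn
                "role": "assistant" if message.sender == uid else "user",
            },
        )
    return messages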

33 src/nio_llm_server/Dockerfile Normal file

@@ -0,0 +1,33 @@
FROM python:3

# Update and upgrade the existing packages
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
    ninja-build \
    libopenblas-dev \
    build-essential

RUN mkdir /app
WORKDIR /app

# install python dependencies
RUN pip install --upgrade pip
RUN pip install huggingface_hub
RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" \
    pip install llama-cpp-python[server]@git+https://github.com/abetlen/llama-cpp-python.git --verbose

# Set environment variable for the host
ENV HOST=0.0.0.0
ENV PORT=8000
ENV HF_REPO=TheBloke/Mistral-7B-OpenOrca-GGUF
ENV HF_FILE=mistral-7b-openorca.Q4_K_M.gguf
ENV MODEL_ALIAS=local-model
ENV CHAT_FORMAT=chatml

# Expose a port for the server
EXPOSE 8000

COPY run.sh /app

# Run the server start script
CMD ["/bin/sh", "/app/run.sh"]

6 src/nio_llm_server/run.sh Normal file

@@ -0,0 +1,6 @@
#!/bin/bash
huggingface-cli download $HF_REPO $HF_FILE
MODEL_PATH=`huggingface-cli download --quiet $HF_REPO $HF_FILE`
python3 -m llama_cpp.server --host $HOST --port $PORT --model $MODEL_PATH --model_alias $MODEL_ALIAS --chat_format $CHAT_FORMAT
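
Editor's note: the two huggingface-cli calls above download the GGUF file (if needed) and capture its local cache path. A rough Python equivalent, sketched here for illustration only, uses hf_hub_download the same way the pre-refactor main.py did:

from huggingface_hub import hf_hub_download

# returns the local cache path that run.sh captures from
# `huggingface-cli download --quiet $HF_REPO $HF_FILE`
model_path = hf_hub_download(
    repo_id="TheBloke/Mistral-7B-OpenOrca-GGUF",  # HF_REPO in the Dockerfile
    filename="mistral-7b-openorca.Q4_K_M.gguf",  # HF_FILE in the Dockerfile
)
print(model_path)  # hand this to `python3 -m llama_cpp.server --model ...`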