# llama_cpp provides the Python bindings for llama.cpp (local GGUF models)
import llama_cpp
import json
import random

from . import tool_funcs

# Load the OpenAI-style tool schemas the model is allowed to call.
# A `with` block ensures the file handle is closed after reading.
with open('./textgen/tools.json', 'r') as f:
    tools: list[dict] = json.load(f)['tools']

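# tools.json is assumed to follow the OpenAI function-calling schema that
# llama-cpp-python accepts, roughly like the sketch below (illustrative
# only; the real names, descriptions, and parameters come from your file):
#
#   {
#     "tools": [
#       {
#         "type": "function",
#         "function": {
#           "name": "get_high_low",
#           "description": "Return today's high and low temperatures.",
#           "parameters": {"type": "object", "properties": {}}
#         }
#       }
#     ]
#   }
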
class TextGen:
    llm: llama_cpp.Llama
    # Conversation history, seeded with a system prompt in __init__.
    # Declared here as an annotation only: a mutable class-level default
    # would share one history across every TextGen instance.
    messages: list[dict]

    def __init__(self, model_path: str, n_ctx: int, n_gpu_layers: int):
        self.messages = [
            {"role": "system", "content": "You are a helpful assistant that can use tools. When a function is called, return the results to the user."}
        ]
        # Instantiate the Llama model from a local .gguf file.
        # n_ctx is the maximum context size (in tokens) the model can handle;
        # n_gpu_layers is how many layers to offload to the GPU
        # (-1 offloads all possible layers, 0 runs entirely on CPU).
        self.llm = llama_cpp.Llama(
            model_path=model_path,      # Path to your GGUF model
            n_ctx=n_ctx,                # Context window size
            n_gpu_layers=n_gpu_layers,  # GPU layer offload count
            verbose=False,              # Suppress llama.cpp log output
            # 'chatml-function-calling' is a built-in chat format that
            # honors the tools/tool_choice parameters; plain 'chatml'
            # does not produce structured tool calls.
            chat_format='chatml-function-calling'
        )

    def generate(self, prompt: str) -> str:
        # Plain text completion: the Llama object is callable.
        # max_tokens caps how many tokens are generated;
        # echo=True includes the prompt in the returned text.
        output = self.llm(
            prompt,
            max_tokens=200,
            echo=True
        )
        # The result is an OpenAI-style completion dict; the generated
        # text is in output['choices'][0]['text'].
        text = output['choices'][0]['text']
        print(text)
        return text

    def chat_completion(self, user_message: str) -> str:
        self.messages.append({
            "role": "user",
            "content": user_message
        })
        response = self.llm.create_chat_completion(
            messages=self.messages,
            tools=tools,
            tool_choice='auto',
            # Randomize the seed so repeated calls can sample differently.
            # (Kept within 32 bits; the original upper bound of 2e10
            # overflows a 32-bit seed.)
            seed=random.randint(0, 2**31 - 1)
        )
        message = response['choices'][0]['message']
        tool_calls = message.get('tool_calls')
        if not tool_calls:
            # The model answered directly without requesting a tool.
            return message['content']
        call_info = tool_calls[0]['function']
        function_name = call_info['name']
        print(f'Assistant decided to call {function_name}')
        # Only one tool is defined here, so the call is hardcoded. For
        # parameterized tools you would parse call_info['arguments']
        # (a JSON string) and dispatch on function_name.
        tool_output = tool_funcs.get_high_low()
        # Record the assistant's tool request, then append the tool result
        # (tool_call_id ties the result to the request, per the OpenAI
        # message schema) so the model can compose a final answer.
        self.messages.append(message)
        self.messages.append({
            "role": "tool",
            "tool_call_id": tool_calls[0]['id'],
            "content": tool_output
        })
        final_response = self.llm.create_chat_completion(messages=self.messages)
        return final_response['choices'][0]['message']['content']

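# Minimal usage sketch (illustrative; the model path below is hypothetical).
# Because of the relative import above, run this from the package root,
# e.g. via `python -m textgen.demo`, rather than executing this file directly:
#
#   gen = TextGen(
#       model_path='./models/your-model.gguf',  # hypothetical path
#       n_ctx=4096,
#       n_gpu_layers=-1,  # offload all layers; use 0 for CPU-only
#   )
#   print(gen.chat_completion("What were today's high and low temperatures?"))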