# Import the llama_cpp library, which exposes the Llama class.
import json
import random

import llama_cpp

from . import tool_funcs

# Load the tool definitions once at import time; `with` ensures the file
# handle is closed.
with open('./textgen/tools.json', 'r') as f:
    tools: list[dict] = json.load(f)['tools']


class TextGen:
    llm: llama_cpp.Llama
    messages: list[dict]

    def __init__(self, model_path: str, n_ctx: int, n_gpu_layers: int):
        # Instantiate the Llama model.
        # model_path is the path to your downloaded .gguf file.
        # n_ctx is the maximum context size (number of tokens) the model can handle.
        # n_gpu_layers is how many layers to offload to the GPU; -1 offloads all
        # possible layers, 0 disables GPU offloading.
        self.llm = llama_cpp.Llama(
            model_path=model_path,      # Path to your GGUF model
            n_ctx=n_ctx,                # Context window size
            n_gpu_layers=n_gpu_layers,  # GPU layer offload count
            verbose=False,              # Suppress verbose output
            chat_format='chatml',
        )
        # The conversation history is an instance attribute so that separate
        # TextGen instances do not share (and mutate) the same list.
        self.messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant that can use tools. "
                           "When a function is called, return the results to the user.",
            }
        ]

    def generate(self, prompt: str) -> str:
        # Generate text: the llm object is callable, so pass the prompt to it.
        # max_tokens caps the number of tokens generated; echo=True includes
        # the prompt in the output. (A `stop` list of strings could also be
        # passed to end generation early.)
        output = self.llm(
            prompt,
            max_tokens=200,
            echo=True,
        )
        # The output is a dictionary; the generated text is in
        # output['choices'][0]['text'].
        text = output['choices'][0]['text']
        print(text)
        return text

    def chat_completion(self, user_message: str) -> str:
        self.messages.append({
            "role": "user",
            "content": user_message,
        })
        response = self.llm.create_chat_completion(
            messages=self.messages,
            tools=tools,
            tool_choice='auto',
            seed=random.randint(0, 2**32 - 1),  # llama.cpp seeds are 32-bit
        )
        tool_calls = response['choices'][0]['message'].get('tool_calls')
        if not tool_calls:
            # No tool was requested, so return the model's answer directly.
            return response['choices'][0]['message']['content']
        call_info = tool_calls[0]['function']
        function_name = call_info['name']
        print(f'Assistant decided to call {function_name}')
        # NOTE: only get_high_low is wired up for now; once more tools exist,
        # dispatch by name instead (e.g. getattr(tool_funcs, function_name)).
        tool_output = tool_funcs.get_high_low()
        # Append the assistant's tool-call message followed by the tool result,
        # then ask the model for a final answer that incorporates it.
        self.messages.append(response['choices'][0]['message'])
        self.messages.append({
            "role": "tool",
            "content": tool_output,
        })
        final_response = self.llm.create_chat_completion(messages=self.messages)
        return final_response['choices'][0]['message']['content']
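
# ----------------------------------------------------------------------
# Usage sketch. This is a hypothetical example, not part of the module:
# the import path and model file name are assumptions (the module is
# assumed to live in a `textgen` package, and any local .gguf file works).
# Because of the relative import above, run it from outside the package:
#
#     from textgen.textgen import TextGen
#
#     gen = TextGen(
#         model_path="./models/model.gguf",  # placeholder path
#         n_ctx=2048,
#         n_gpu_layers=-1,  # offload all layers; use 0 on CPU-only machines
#     )
#     print(gen.generate("Q: What is the capital of France? A:"))
#     print(gen.chat_completion("What were the high and low values today?"))
# ----------------------------------------------------------------------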