# Import the Llama class from the llama_cpp library
import llama_cpp
import json

from . import tool_funcs

# Load the tool (function) definitions that are passed to the model.
with open('tools.json', 'r') as f:
    tools: list[dict] = json.load(f)['tools']


class TextGen:
    llm: llama_cpp.Llama
    messages: list[dict] = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant that can use tools. "
                "When a function is called, return the results to the user."
            ),
        }
    ]

    def __init__(self, model_path: str, n_ctx: int, n_gpu_layers: int):
        # 1. Instantiate the Llama model.
        # model_path is the path to your downloaded .gguf file,
        # e.g. "./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf".
        # n_ctx is the maximum context size (number of tokens) the model can handle.
        # n_gpu_layers specifies how many layers to offload to the GPU;
        # -1 offloads all possible layers, 0 disables GPU offloading.
        self.llm = llama_cpp.Llama(
            model_path=model_path,      # Path to your GGUF model
            n_ctx=n_ctx,                # Context window size
            n_gpu_layers=n_gpu_layers,  # -1 = all layers on GPU, 0 = CPU only
            verbose=False,              # Suppress verbose output
            chat_format='chatml',
        )

    def generate(self, prompt: str) -> str:
        # 2. Generate text.
        # The llm object is callable; pass the prompt to it.
        # max_tokens is the maximum number of tokens to generate.
        # A stop list of strings could also be passed to halt generation early.
        # echo=True includes your prompt in the output.
        output = self.llm(
            prompt,
            max_tokens=200,
            echo=True,
        )
        # 3. Print and return the result.
        # The output is a dictionary; the generated text is in
        # output['choices'][0]['text'].
        text = output['choices'][0]['text']
        print(text)
        return text

    def chat_completion(self, user_message: str) -> str:
        self.messages.append({
            "role": "user",
            "content": user_message,
        })
        response = self.llm.create_chat_completion(
            messages=self.messages,
            tools=tools,
            tool_choice='auto',
        )
        tool_call = response['choices'][0]['message'].get('tool_calls')
        if not tool_call:
            # No tool requested: return the assistant's reply directly.
            return response['choices'][0]['message']['content']
        call_info = tool_call[0]['function']
        function_name = call_info['name']
        print(f'Assistant decided to call {function_name}')
        tool_output = tool_funcs.get_high_low()
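        # --- Sketch continuation (not in the original source; a minimal
        # illustration only): feed the tool result back to the model and
        # return its final answer. It assumes the tool returns a
        # JSON-serializable value and that the chat handler accepts a
        # "tool" role message (some handlers expect "function" instead);
        # a stricter version would also append the assistant's tool_call
        # message to the history before the tool result.
        self.messages.append({
            "role": "tool",
            "name": function_name,
            "content": json.dumps(tool_output),
        })
        followup = self.llm.create_chat_completion(messages=self.messages)
        final_text = followup['choices'][0]['message']['content']
        return final_text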
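
# --- Example usage (illustrative; the module name, model path, and
# parameter values below are placeholders rather than part of the original
# project). Because this module uses a relative import, it would typically
# be driven from a sibling module:
#
#   from .text_gen import TextGen   # hypothetical module name
#
#   gen = TextGen(
#       model_path="./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf",
#       n_ctx=2048,
#       n_gpu_layers=-1,   # set to 0 if no GPU is available
#   )
#   print(gen.chat_completion("What are the latest high and low values?"))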