# llama_cpp provides the Python bindings for llama.cpp (local GGUF models)
import llama_cpp
import json
import random

from . import tool_funcs

# Load the OpenAI-style tool schemas the model is allowed to call.
# A `with` block ensures the file handle is closed after reading.
with open('./textgen/tools.json', 'r') as f:
    tools: list[dict] = json.load(f)['tools']

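# tools.json is assumed to follow the OpenAI function-calling schema that
# llama-cpp-python accepts, roughly like the sketch below (illustrative
# only; the real names, descriptions, and parameters come from your file):
#
#   {
#     "tools": [
#       {
#         "type": "function",
#         "function": {
#           "name": "get_high_low",
#           "description": "Return today's high and low temperatures.",
#           "parameters": {"type": "object", "properties": {}}
#         }
#       }
#     ]
#   }
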
class TextGen:
    llm: llama_cpp.Llama
    # Conversation history, seeded with a system prompt in __init__.
    # Declared here as an annotation only: a mutable class-level default
    # would share one history across every TextGen instance.
    messages: list[dict]

    def __init__(self, model_path: str, n_ctx: int, n_gpu_layers: int):
        self.messages = [
            {"role": "system", "content": "You are a helpful assistant that can use tools. When a function is called, return the results to the user."}
        ]
        # Instantiate the Llama model from a local .gguf file.
        # n_ctx is the maximum context size (in tokens) the model can handle;
        # n_gpu_layers is how many layers to offload to the GPU
        # (-1 offloads all possible layers, 0 runs entirely on CPU).
        self.llm = llama_cpp.Llama(
            model_path=model_path,      # Path to your GGUF model
            n_ctx=n_ctx,                # Context window size
            n_gpu_layers=n_gpu_layers,  # GPU layer offload count
            verbose=False,              # Suppress llama.cpp log output
            # 'chatml-function-calling' is a built-in chat format that
            # honors the tools/tool_choice parameters; plain 'chatml'
            # does not produce structured tool calls.
            chat_format='chatml-function-calling'
        )

    def generate(self, prompt: str) -> str:
        # Plain text completion: the Llama object is callable.
        # max_tokens caps how many tokens are generated;
        # echo=True includes the prompt in the returned text.
        output = self.llm(
            prompt,
            max_tokens=200,
            echo=True
        )
        # The result is an OpenAI-style completion dict; the generated
        # text is in output['choices'][0]['text'].
        text = output['choices'][0]['text']
        print(text)
        return text

    def chat_completion(self, user_message: str) -> str:
        self.messages.append({
            "role": "user",
            "content": user_message
        })
        response = self.llm.create_chat_completion(
            messages=self.messages,
            tools=tools,
            tool_choice='auto',
            # Randomize the seed so repeated calls can sample differently.
            # (Kept within 32 bits; the original upper bound of 2e10
            # overflows a 32-bit seed.)
            seed=random.randint(0, 2**31 - 1)
        )
        message = response['choices'][0]['message']
        tool_calls = message.get('tool_calls')
        if not tool_calls:
            # The model answered directly without requesting a tool.
            return message['content']
        call_info = tool_calls[0]['function']
        function_name = call_info['name']
        print(f'Assistant decided to call {function_name}')
        # Only one tool is defined here, so the call is hardcoded. For
        # parameterized tools you would parse call_info['arguments']
        # (a JSON string) and dispatch on function_name.
        tool_output = tool_funcs.get_high_low()
        # Record the assistant's tool request, then append the tool result
        # (tool_call_id ties the result to the request, per the OpenAI
        # message schema) so the model can compose a final answer.
        self.messages.append(message)
        self.messages.append({
            "role": "tool",
            "tool_call_id": tool_calls[0]['id'],
            "content": tool_output
        })
        final_response = self.llm.create_chat_completion(messages=self.messages)
        return final_response['choices'][0]['message']['content']

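# Minimal usage sketch (illustrative; the model path below is hypothetical).
# Because of the relative import above, run this from the package root,
# e.g. via `python -m textgen.demo`, rather than executing this file directly:
#
#   gen = TextGen(
#       model_path='./models/your-model.gguf',  # hypothetical path
#       n_ctx=4096,
#       n_gpu_layers=-1,  # offload all layers; use 0 for CPU-only
#   )
#   print(gen.chat_completion("What were today's high and low temperatures?"))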