# Import the Llama class from the llama_cpp library
import llama_cpp
import json

# Load the tool definitions from tools.json
with open('tools.json', 'r') as f:
    tools = json.load(f)['tools']

class TextGen:

    llm: llama_cpp.Llama

    def __init__(self, model_path: str, n_ctx: int, n_gpu_layers: int):
        # 1. Instantiate the Llama model
        # model_path is the path to your downloaded .gguf file,
        # e.g. "./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf".
        # n_ctx is the maximum context size (number of tokens) the model can handle.
        # n_gpu_layers specifies how many layers to offload to the GPU. -1 means offload all possible layers.
        self.llm = llama_cpp.Llama(
            model_path=model_path,      # Path to your GGUF model
            n_ctx=n_ctx,                # Context window size
            n_gpu_layers=n_gpu_layers,  # Layers to offload to the GPU. Set to 0 if no GPU.
            verbose=False               # Suppress verbose output
        )

    def generate(self, prompt: str) -> str:
        # 2. Generate text
        # The llm object is callable. Pass the prompt to it.
        # max_tokens is the maximum number of tokens to generate.
        # stop (optional, not used here) is a list of strings that stops generation when encountered.
        # echo=True will include the prompt in the output.
        output = self.llm(
            prompt,
            max_tokens=200,
            echo=True
        )

        # 3. Extract and return the result
        # The output is a dictionary. The generated text is in output['choices'][0]['text'].
        text = output['choices'][0]['text']
        print(text)

        return text
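
# Minimal usage sketch (assumption: the model path and settings below are
# illustrative placeholders based on the comments above, not part of the
# original file).
if __name__ == "__main__":
    gen = TextGen(
        model_path="./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf",  # example GGUF path
        n_ctx=2048,          # assumed context window size
        n_gpu_layers=-1,     # offload all layers; set to 0 for CPU-only
    )
    gen.generate("[INST] What is the capital of France? [/INST]")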