Initial working copy

parent 2a335176e6
commit 2c876cef42

19 changed files with 783 additions and 126 deletions

completion/llamacpp/llamacpp.c (new file, +117)

@@ -0,0 +1,117 @@
#include "llama.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
void null_log_callback(enum ggml_log_level level, const char *message, void *user_data) {}
|
||||
|
||||
int64_t time_us(void) {
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
|
||||
}
|
||||
|
||||
void chandlerscustomllama(char* prompt) {
|
||||
int n_predict = 100;
|
||||
|
||||
printf("Prompt: %s\n", prompt);
|
||||
struct llama_model_params model_params = llama_model_default_params();
|
||||
// printf("model_params.n_gpu_layers: %d\n", model_params.n_gpu_layers);
|
||||
llama_log_set(null_log_callback, NULL); // Disable logging
|
||||
|
||||
struct llama_model *model = llama_model_load_from_file("/home/chandler/llms/gpt-oss-20b-Q4_K_M.gguf", model_params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "Failed to load model\n");
|
||||
return;
|
||||
}
|
||||
|
||||
const struct llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
const int n_prompt = -llama_tokenize(vocab, prompt, strlen(prompt), NULL, 0, true, true);
|
||||
|
||||
llama_token * prompt_tokens = malloc(sizeof(llama_token) * n_prompt);
|
||||
if (llama_tokenize(vocab, prompt, strlen(prompt), prompt_tokens, n_prompt, true, true) < 0) {
|
||||
fprintf(stderr, "%s: error: failed to tokenize prompt\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
struct llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.n_ctx = n_prompt + n_predict - 1;
|
||||
ctx_params.n_batch = n_prompt;
|
||||
ctx_params.no_perf = false; // TODO: true
|
||||
|
||||
struct llama_context * ctx = llama_init_from_model(model, ctx_params);
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to create llama_context\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
// initialize the sampler
|
||||
struct llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
|
||||
struct llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
|
||||
|
||||
// prepare a batch for the prompt
|
||||
struct llama_batch batch = llama_batch_get_one(prompt_tokens, n_prompt);
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
if (llama_encode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
}
|
||||
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
|
||||
decoder_start_token_id = llama_vocab_bos(vocab);
|
||||
}
|
||||
|
||||
batch = llama_batch_get_one(&decoder_start_token_id, 1);
|
||||
}
|
||||
|
||||
int64_t start = time_us();
|
||||
int n_decode = 0;
|
||||
llama_token new_token_id;
|
||||
|
||||
for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
}
|
||||
|
||||
n_pos += batch.n_tokens;
|
||||
|
||||
|
||||
// sample the next token
|
||||
{
|
||||
new_token_id = llama_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
// is it an end of generation?
|
||||
if (llama_vocab_is_eog(vocab, new_token_id)) {
|
||||
break;
|
||||
}
|
||||
|
||||
char buf[128]; // TODO: how do we know that this is enough?
|
||||
int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true); // TODO: do I want special tokens?
|
||||
if (n < 0) {
|
||||
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
|
||||
return;
|
||||
}
|
||||
buf[n] = 0;
|
||||
printf("%s", buf); // TODO: null terminator?
|
||||
|
||||
// prepare the next batch with the sampled token
|
||||
batch = llama_batch_get_one(&new_token_id, 1);
|
||||
|
||||
n_decode += 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int64_t end = time_us();
|
||||
|
||||
fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
__func__, n_decode, (end - start) / 1000000.0f, n_decode / ((end - start) / 1000000.0f));
|
||||
llama_sampler_free(smpl);
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
}
|
||||
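The file only defines chandlerscustomllama and never calls it, so here is a minimal, hypothetical driver showing how it could be exercised. The main function below, the file name main.c, and the gcc invocation in the comment are assumptions and not part of this commit; the model path is hard-coded inside chandlerscustomllama itself, so the only input is the prompt.

    // main.c - hypothetical driver, not part of this commit
    // build sketch (assumes a local llama.cpp build providing libllama):
    //   gcc main.c completion/llamacpp/llamacpp.c -I<llama.cpp>/include -L<llama.cpp>/build/bin -lllama -o demo
    #include <stdio.h>

    void chandlerscustomllama(char *prompt); // defined in completion/llamacpp/llamacpp.c

    int main(int argc, char **argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s \"<prompt>\"\n", argv[0]);
            return 1;
        }
        chandlerscustomllama(argv[1]);
        return 0;
    }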