2025/02/11 10:54:59 routes.go:1187: INFO server config env="map[HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:true OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE:q8_0 OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/Users/[user]/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://*] OLLAMA_SCHED_SPREAD:false http_proxy: https_proxy: no_proxy:]"
time=2025-02-11T10:54:59.951-05:00 level=INFO source=images.go:432 msg="total blobs: 21"
time=2025-02-11T10:54:59.952-05:00 level=INFO source=images.go:439 msg="total unused blobs removed: 0"
time=2025-02-11T10:54:59.952-05:00 level=INFO source=routes.go:1238 msg="Listening on 127.0.0.1:11434 (version 0.5.7)"
time=2025-02-11T10:54:59.952-05:00 level=INFO source=routes.go:1267 msg="Dynamic LLM libraries" runners=[metal]
time=2025-02-11T10:55:00.001-05:00 level=INFO source=types.go:131 msg="inference compute" id=0 library=metal variant="" compute="" driver=0.0 name="" total="48.0 GiB" available="48.0 GiB"
[GIN] 2025/02/11 - 10:55:18 | 200 | 43.75µs | 127.0.0.1 | GET "/api/version"
[GIN] 2025/02/11 - 10:55:22 | 200 | 55.542µs | 127.0.0.1 | HEAD "/"
[GIN] 2025/02/11 - 10:55:22 | 200 | 46.954167ms | 127.0.0.1 | POST "/api/show"
time=2025-02-11T10:55:22.449-05:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/Users/[user]/.ollama/models/blobs/sha256-d7e4b00a7d7a8d03d4eed9b0f3f61a427e9f0fc5dea6aeb414e41dee23dc8ecc gpu=0 parallel=4 available=51539607552 required="17.4 GiB"
time=2025-02-11T10:55:22.450-05:00 level=INFO source=server.go:104 msg="system memory" total="64.0 GiB" free="33.1 GiB" free_swap="0 B"
time=2025-02-11T10:55:22.451-05:00 level=INFO source=memory.go:356 msg="offload to metal" layers.requested=-1 layers.model=47 layers.offload=47 layers.split="" memory.available="[48.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="17.4 GiB" memory.required.partial="17.4 GiB" memory.required.kv="1.4 GiB" memory.required.allocations="[17.4 GiB]" memory.weights.total="15.1 GiB" memory.weights.repeating="14.2 GiB" memory.weights.nonrepeating="922.9 MiB" memory.graph.full="562.0 MiB" memory.graph.partial="562.0 MiB"
time=2025-02-11T10:55:22.451-05:00 level=INFO source=server.go:223 msg="enabling flash attention"
time=2025-02-11T10:55:22.453-05:00 level=INFO source=server.go:376 msg="starting llama server" cmd="/opt/homebrew/bin/ollama runner --model /Users/[user]/.ollama/models/blobs/sha256-d7e4b00a7d7a8d03d4eed9b0f3f61a427e9f0fc5dea6aeb414e41dee23dc8ecc --ctx-size 8192 --batch-size 512 --n-gpu-layers 47 --threads 8 --flash-attn --kv-cache-type q8_0 --parallel 4 --port 58737"
time=2025-02-11T10:55:22.454-05:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
time=2025-02-11T10:55:22.454-05:00 level=INFO source=server.go:555 msg="waiting for llama runner to start responding"
time=2025-02-11T10:55:22.455-05:00 level=INFO source=server.go:589 msg="waiting for server to become available" status="llm server error"
time=2025-02-11T10:55:22.475-05:00 level=INFO source=runner.go:936 msg="starting go runner"
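The non-default values in the env map above (OLLAMA_FLASH_ATTENTION, OLLAMA_KV_CACHE_TYPE, OLLAMA_KEEP_ALIVE) are typically exported before the server is started; a minimal sketch, assuming the server is launched from a shell rather than the macOS menu-bar app:

export OLLAMA_FLASH_ATTENTION=1
export OLLAMA_KV_CACHE_TYPE=q8_0
export OLLAMA_KEEP_ALIVE=5m
ollama serve   # binds to 127.0.0.1:11434 by default, matching OLLAMA_HOST above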
time=2025-02-11T10:55:22.475-05:00 level=INFO source=runner.go:937 msg=system info="Metal : EMBED_LIBRARY = 1 | CPU : NEON = 1 | ARM_FMA = 1 | FP16_VA = 1 | DOTPROD = 1 | LLAMAFILE = 1 | ACCELERATE = 1 | AARCH64_REPACK = 1 | Metal : EMBED_LIBRARY = 1 | CPU : NEON = 1 | ARM_FMA = 1 | FP16_VA = 1 | DOTPROD = 1 | LLAMAFILE = 1 | ACCELERATE = 1 | AARCH64_REPACK = 1 | cgo(clang)" threads=8
time=2025-02-11T10:55:22.476-05:00 level=INFO source=runner.go:995 msg="Server listening on 127.0.0.1:58737"
llama_load_model_from_file: using device Metal (Apple M1 Max) - 49151 MiB free
llama_model_loader: loaded meta data with 29 key-value pairs and 508 tensors from /Users/[user]/.ollama/models/blobs/sha256-d7e4b00a7d7a8d03d4eed9b0f3f61a427e9f0fc5dea6aeb414e41dee23dc8ecc (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = gemma2
llama_model_loader: - kv 1: general.name str = gemma-2-27b-it
llama_model_loader: - kv 2: gemma2.context_length u32 = 8192
llama_model_loader: - kv 3: gemma2.embedding_length u32 = 4608
llama_model_loader: - kv 4: gemma2.block_count u32 = 46
llama_model_loader: - kv 5: gemma2.feed_forward_length u32 = 36864
llama_model_loader: - kv 6: gemma2.attention.head_count u32 = 32
llama_model_loader: - kv 7: gemma2.attention.head_count_kv u32 = 16
llama_model_loader: - kv 8: gemma2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 9: gemma2.attention.key_length u32 = 128
llama_model_loader: - kv 10: gemma2.attention.value_length u32 = 128
llama_model_loader: - kv 11: general.file_type u32 = 2
llama_model_loader: - kv 12: gemma2.attn_logit_softcapping f32 = 50.000000
llama_model_loader: - kv 13: gemma2.final_logit_softcapping f32 = 30.000000
llama_model_loader: - kv 14: gemma2.attention.sliding_window u32 = 4096
llama_model_loader: - kv 15: tokenizer.ggml.model str = llama
llama_model_loader: - kv 16: tokenizer.ggml.pre str = default
llama_model_loader: - kv 17: tokenizer.ggml.tokens arr[str,256000] = ["<pad>", "<eos>", "<bos>", "<unk>", ...
llama_model_loader: - kv 18: tokenizer.ggml.scores arr[f32,256000] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv 19: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 20: tokenizer.ggml.bos_token_id u32 = 2
llama_model_loader: - kv 21: tokenizer.ggml.eos_token_id u32 = 1
llama_model_loader: - kv 22: tokenizer.ggml.unknown_token_id u32 = 3
llama_model_loader: - kv 23: tokenizer.ggml.padding_token_id u32 = 0
llama_model_loader: - kv 24: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 25: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 26: tokenizer.chat_template str = {{ bos_token }}{% if messages[0]['rol...
llama_model_loader: - kv 27: tokenizer.ggml.add_space_prefix bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 185 tensors
llama_model_loader: - type q4_0: 322 tensors
llama_model_loader: - type q6_K: 1 tensors
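The metadata dump above is the same information surfaced by the earlier POST "/api/show" request; a hedged sketch of the equivalent call, with the model tag gemma2:27b assumed from general.name (the local tag may differ):

curl http://127.0.0.1:11434/api/show -d '{ "model": "gemma2:27b" }'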
llm_load_vocab: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
llm_load_vocab: special tokens cache size = 108
llm_load_vocab: token to piece cache size = 1.6014 MB
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = gemma2
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 256000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: vocab_only = 0
llm_load_print_meta: n_ctx_train = 8192
llm_load_print_meta: n_embd = 4608
llm_load_print_meta: n_layer = 46
llm_load_print_meta: n_head = 32
llm_load_print_meta: n_head_kv = 16
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_swa = 4096
llm_load_print_meta: n_embd_head_k = 128
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 2
llm_load_print_meta: n_embd_k_gqa = 2048
llm_load_print_meta: n_embd_v_gqa = 2048
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-06
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale = 0.0e+00
llm_load_print_meta: n_ff = 36864
llm_load_print_meta: n_expert = 0
llm_load_print_meta: n_expert_used = 0
llm_load_print_meta: causal attn = 1
llm_load_print_meta: pooling type = 0
llm_load_print_meta: rope type = 2
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_ctx_orig_yarn = 8192
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: ssm_d_conv = 0
llm_load_print_meta: ssm_d_inner = 0
llm_load_print_meta: ssm_d_state = 0
llm_load_print_meta: ssm_dt_rank = 0
llm_load_print_meta: ssm_dt_b_c_rms = 0
llm_load_print_meta: model type = 27B
llm_load_print_meta: model ftype = Q4_0
llm_load_print_meta: model params = 27.23 B
llm_load_print_meta: model size = 14.55 GiB (4.59 BPW)
llm_load_print_meta: general.name = gemma-2-27b-it
llm_load_print_meta: BOS token = 2 '<bos>'
llm_load_print_meta: EOS token = 1 '<eos>'
llm_load_print_meta: EOT token = 107 '<end_of_turn>'
llm_load_print_meta: UNK token = 3 '<unk>'
llm_load_print_meta: PAD token = 0 '<pad>'
llm_load_print_meta: LF token = 227 '<0x0A>'
llm_load_print_meta: EOG token = 1 '<eos>'
llm_load_print_meta: EOG token = 107 '<end_of_turn>'
llm_load_print_meta: max token length = 93
time=2025-02-11T10:55:22.706-05:00 level=INFO source=server.go:589 msg="waiting for server to become available" status="llm server loading model"
llm_load_tensors: offloading 46 repeating layers to GPU
llm_load_tensors: offloading output layer to GPU
llm_load_tensors: offloaded 47/47 layers to GPU
llm_load_tensors: CPU_Mapped model buffer size = 922.85 MiB
llm_load_tensors: Metal_Mapped model buffer size = 14898.62 MiB
llama_new_context_with_model: n_seq_max = 4
llama_new_context_with_model: n_ctx = 8192
llama_new_context_with_model: n_ctx_per_seq = 2048
llama_new_context_with_model: n_batch = 2048
llama_new_context_with_model: n_ubatch = 512
llama_new_context_with_model: flash_attn = 1
llama_new_context_with_model: freq_base = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_new_context_with_model: n_ctx_per_seq (2048) < n_ctx_train (8192) -- the full capacity of the model will not be utilized
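The n_ctx_per_seq warning just above is expected with these settings: the 8192-token context (--ctx-size 8192) is split across --parallel 4 slots, so each request effectively gets 2048 tokens. A hedged sketch of asking for the full trained window on a single request via the num_ctx option (model tag assumed as before):

curl http://127.0.0.1:11434/api/generate -d '{
  "model": "gemma2:27b",
  "prompt": "Why is the sky blue?",
  "options": { "num_ctx": 8192 }
}'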
ggml_metal_init: allocating
ggml_metal_init: found device: Apple M1 Max
ggml_metal_init: picking default device: Apple M1 Max
ggml_metal_init: using embedded metal library
ggml_metal_init: GPU name: Apple M1 Max
ggml_metal_init: GPU family: MTLGPUFamilyApple7 (1007)
ggml_metal_init: GPU family: MTLGPUFamilyCommon3 (3003)
ggml_metal_init: GPU family: MTLGPUFamilyMetal3 (5001)
ggml_metal_init: simdgroup reduction = true
ggml_metal_init: simdgroup matrix mul. = true
ggml_metal_init: has bfloat = true
ggml_metal_init: use bfloat = false
ggml_metal_init: hasUnifiedMemory = true
ggml_metal_init: recommendedMaxWorkingSetSize = 51539.61 MB
ggml_metal_init: skipping kernel_get_rows_bf16 (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32 (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4 (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16 (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32 (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32 (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32 (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64 (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80 (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h96 (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h112 (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h128 (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h256 (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_vec_bf16_h128 (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_vec_bf16_h256 (not supported)
ggml_metal_init: skipping kernel_cpy_f32_bf16 (not supported)
ggml_metal_init: skipping kernel_cpy_bf16_f32 (not supported)
ggml_metal_init: skipping kernel_cpy_bf16_bf16 (not supported)
llama_kv_cache_init: kv_size = 8192, offload = 1, type_k = 'q8_0', type_v = 'q8_0', n_layer = 46, can_shift = 1
llama_kv_cache_init: Metal KV buffer size = 1564.00 MiB
llama_new_context_with_model: KV self size = 1564.00 MiB, K (q8_0): 782.00 MiB, V (q8_0): 782.00 MiB
llama_new_context_with_model: CPU output buffer size = 3.98 MiB
llama_new_context_with_model: Metal compute buffer size = 509.00 MiB
llama_new_context_with_model: CPU compute buffer size = 41.01 MiB
llama_new_context_with_model: graph nodes = 1530
llama_new_context_with_model: graph splits = 2
time=2025-02-11T10:55:34.523-05:00 level=INFO source=server.go:594 msg="llama runner started in 12.07 seconds"
[GIN] 2025/02/11 - 10:55:34 | 200 | 12.134892084s | 127.0.0.1 | POST "/api/generate"
[GIN] 2025/02/11 - 10:56:39 | 200 | 25.383590833s | 127.0.0.1 | POST "/api/chat"
[GIN] 2025/02/11 - 10:56:55 | 200 | 28.5µs | 127.0.0.1 | GET "/api/version"
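The closing GIN lines show the requests served once the model was resident: the first /api/generate response includes the 12-second load, while the later /api/chat call ran against the already-loaded model. A hedged sketch of an equivalent chat request (model tag assumed):

curl http://127.0.0.1:11434/api/chat -d '{
  "model": "gemma2:27b",
  "messages": [{ "role": "user", "content": "Hello" }]
}'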