From 9604773e459a77e45f8d640e22cab0403ac9890b Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Fri, 12 Dec 2025 11:22:11 -0500 Subject: [PATCH] add model choosing support to llm (#13656) --- tinygrad/apps/llm.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tinygrad/apps/llm.py b/tinygrad/apps/llm.py index 89b64629e0..e11cec5222 100644 --- a/tinygrad/apps/llm.py +++ b/tinygrad/apps/llm.py @@ -174,10 +174,10 @@ class Transformer: yield next_id models = { - "1B": "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q6_K.gguf", - "3B": "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q6_K.gguf", - "3B_f16": "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-f16.gguf", - "8B": "https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf", + "llama3.2:1b": "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q6_K.gguf", + "llama3.2:3b": "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q6_K.gguf", + "llama3.2:3b-f16": "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-f16.gguf", + "llama3.1:8b": "https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf", } # *** simple OpenAI compatible server on 11434 to match ollama *** @@ -226,13 +226,14 @@ class Handler(HTTPRequestHandler): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--size", choices=list(models.keys()), default=list(models.keys())[0], help="Model size") + parser.add_argument("--model", choices=list(models.keys()), default=list(models.keys())[0], help="Model choice") parser.add_argument("--max_context", type=int, default=4096, help="Max Context Length") parser.add_argument("--serve", action="store_true", help="Run OpenAI compatible API") args = parser.parse_args() # load the model - model, kv = Transformer.from_gguf(Tensor.from_url(models[args.size]), args.max_context) + model, kv = Transformer.from_gguf(Tensor.from_url(models[args.model]), args.max_context) + if DEBUG >= 1: print(f"using model {args.model}") # extract some metadata tok = SimpleTokenizer.from_gguf_kv(kv)