diff --git a/examples/self_tokenize.py b/examples/self_tokenize.py
index 372f1ac5a8..26c0e7de94 100644
--- a/examples/self_tokenize.py
+++ b/examples/self_tokenize.py
@@ -1,4 +1,4 @@
-import os, pathlib
+import os, pathlib, argparse
 from examples.llama3 import Tokenizer
 from tabulate import tabulate
 from tinygrad import fetch
@@ -18,7 +18,21 @@ def read_code(base_path):
       ret += [(fullpath.split("tinygrad/", 1)[1], code)]
   return ret
 
+def write_code_to_file(filename, code_list):
+  """Write the combined code to a file.
+
+  code_list is passed through flatten(); the resulting strings are joined with
+  NUL separators and written to filename, replacing any existing content.
+  """
+  # explicit encoding so the bytes written do not depend on the platform locale
+  with open(filename, 'w', encoding='utf-8') as f:
+    f.write('\x00'.join(flatten(code_list)))
+
 if __name__ == "__main__":
+  parser = argparse.ArgumentParser(description="Analyze and optionally save tinygrad code.")
+  parser.add_argument("--output", help="Output file to write the combined code to.")
+  args = parser.parse_args()
+
   ret = read_code(".")
 
   table = []
@@ -33,3 +47,7 @@ if __name__ == "__main__":
 
   encoded = tokenizer.encode(code_str)
   print(f"code has {len(encoded)} tokens")
+
+  if args.output:
+    write_code_to_file(args.output, ret)
+    print(f"Combined code written to {args.output}")