77a8919349
* UV+DTR model * DTR model.. again. * fix naviGPS * fix radar... * fix.. * test * fix.. * carrot serv * fix.. * fix.. fleet * fix.. radar * fix atc * Steam Powered model.. * fix.. radarLatFactor range.. 200->500 * fix.. dbc.. * side * SP v2 * brake light * fix brakelight * fix.. * add datetime... * fix.. * fix.. * fix.. * fix.. * blind spot * fix tz * fix.. * ff * radarLatFactor * fix.. bsd * Revert "fix.. bsd" This reverts commit 1d0d1434470e1b92c65eaffaeb8dd7cd779f85ee. * fix.. bsd side.. * test * fix.. e2e conditions * Revert "test" This reverts commit 0ce791dbd66c17260366ed1a4df2626c602dbb7d. * TR16 * fix cut-in detect threshold 3.4 -> 2.6 * fix.. jerk_l limit 5->10 * fix.. * fix.. gm * fix.. OPTIMA_H mass * fix.. radar.. * fix radar.. * fix.. * Radar... * fix.. * fix.. * fix.. * fix.. radartrack 3 * fix.. * fix.. * fix.. * merge.. * fix.. canfd * fix.. * fix.. * fix.. * fix.. radard * new cut_in * Revert "new cut_in" This reverts commit b9b6e9b33318fe1ce7d626468139b17848efcdcd. * fix.. * new cut_in detect... * fix.. disp.. * fix.. * fix.. * fix.. center radar.. * fix.. radar y_sane.. * fix.. * fix.. * hkg jerk 10 -> 5 * fix.. * fix.. * fix.. radar dbc.. * fix.. * fix.. jLead filter.. * test new radar interface.. * fix.. * fix.. * test time... * Revert "test time..." This reverts commit 63e9187736985c4dc4b4f3736674ba7cda6adc3f. * fix radar.. * fix.. * FireHose model.. * tinygrad * Update interface.py * fix.. * fix.. nff toyota corolla_tss2 * fix.. * fix.. * fix.. radar * fix.. * fix.. radar, y_gate * fix.. radar.. * fix.. for clone.. * scc radar enable at low speed.. * fix.. settings.. * fix. * fix.. * fix.. radarTimeStep. * TR16 model again.. * RELEASE.md * fix cut-in detection... * fix.. registeration timeout 15sec.. * fix.. * fix.. radar processing. * fix.. * fix.. * fix.. * fix.. * fix.. * fix..
42 lines
1.7 KiB
Python
42 lines
1.7 KiB
Python
from transformers import AutoTokenizer
|
|
from datasets import load_dataset
|
|
from tinygrad.apps.llm import SimpleTokenizer, gpt2_decode_vocab, get_llama_re
|
|
from tinygrad.helpers import tqdm, getenv, partition
|
|
|
|
# Set ALLOW_FAILED=-1 to go over the entire dataset without printing mismatches (by default the run stops after 10 mismatches).
|
|
if __name__ == "__main__":
  # Compare tinygrad's SimpleTokenizer with the Hugging Face tokenizer of
  # NousResearch/Meta-Llama-3-8B-Instruct on every sample of the
  # OpenAssistant/oasst1 train split, and report how many samples tokenize differently.
  base_tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")

  # Look up the special ids once, instead of once per vocabulary entry inside the
  # predicate, and keep them in a set so each membership test is a hash lookup.
  special_ids = frozenset(base_tokenizer.all_special_ids)
  special_tokens, normal_tokens = partition(base_tokenizer.vocab.items(), lambda e: e[1] in special_ids)
  simple_tokenizer = SimpleTokenizer(get_llama_re(), gpt2_decode_vocab(dict(normal_tokens)), dict(special_tokens))

  # SGR foreground color codes cycled through so adjacent tokens are visually distinct.
  color_codes = [ 91, 92, 94, 93, 95 ]
  def color_tokens(tids):
    """Return the tokens in `tids` decoded one by one, each prefixed with a rotating color escape.

    Every id is decoded on its own with the reference tokenizer so token boundaries
    stay visible; the string ends with the reset sequence.
    """
    return "".join(f"\033[{color_codes[i%len(color_codes)]}m{base_tokenizer.decode([t])}" for i, t in enumerate(tids)) + "\033[0m"

  ds = load_dataset("OpenAssistant/oasst1")
  # Number of mismatches to report before stopping. It is decremented on every mismatch
  # and the loop breaks when it reaches exactly 0, so a starting value <= 0 never
  # prints and never stops early.
  allow_failed = getenv("ALLOW_FAILED", 10)

  fail_count, total = 0, 0

  for idx, el in enumerate(tqdm(ds["train"])):
    total += 1

    # A RuntimeError raised by the simple tokenizer is treated as "no tokens produced".
    try: simple_tokens = tuple(simple_tokenizer.encode(el["text"]))
    except RuntimeError: simple_tokens = ()
    base_tokens = tuple(base_tokenizer.encode(el["text"], add_special_tokens=False))

    if simple_tokens != base_tokens:
      fail_count += 1
      allow_failed -= 1

      if allow_failed >= 0:
        print(f"tokens mismatch at index: {idx}.\n")

        print("simple: ", color_tokens(simple_tokens))
        print("official:", color_tokens(base_tokens) + "\n")

      # Budget exhausted: stop scanning once the last allowed mismatch has been shown.
      if allow_failed == 0: break

  print(f"{fail_count}/{total} samples are inconsistent with the official tokenizer.")
|