From e8a23d43311ad751bb23919657e76a0e7382358d Mon Sep 17 00:00:00 2001 From: cloud11665 Date: Wed, 7 Jun 2023 00:23:30 +0200 Subject: [PATCH] there is a better way to do that! (#950) --- examples/whisper.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/whisper.py b/examples/whisper.py index 40e4009c06..54e341fed2 100644 --- a/examples/whisper.py +++ b/examples/whisper.py @@ -106,6 +106,7 @@ class Whisper: # TODO: this is tragic. remove this import functools +import itertools import torch import torchaudio import librosa @@ -158,10 +159,8 @@ def get_encoding(n_vocab_in): "<|notimestamps|>", *[f"<|{i * 0.02:.2f}|>" for i in range(1501)], ] - special_tokens = {} - for token in specials: - special_tokens[token] = n_vocab - n_vocab += 1 + special_tokens = dict(zip(specials, itertools.count(n_vocab))) + n_vocab += len(specials) assert n_vocab == n_vocab_in import tiktoken return tiktoken.Encoding(