there is a better way to do that! (#950)

This commit is contained in:
cloud11665
2023-06-07 00:23:30 +02:00
committed by GitHub
parent 990fc40219
commit e8a23d4331

View File

@@ -106,6 +106,7 @@ class Whisper:
# TODO: this is tragic. remove this
import functools
+import itertools
import torch
import torchaudio
import librosa
@@ -158,10 +159,8 @@ def get_encoding(n_vocab_in):
"<|notimestamps|>",
*[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
]
-special_tokens = {}
-for token in specials:
-special_tokens[token] = n_vocab
-n_vocab += 1
+special_tokens = dict(zip(specials, itertools.count(n_vocab)))
+n_vocab += len(specials)
assert n_vocab == n_vocab_in
import tiktoken
return tiktoken.Encoding(