The tokenization unit in CS336 Lecture 01 was inspired by Andrej Karpathy’s video on tokenization: “Let’s build the GPT Tokenizer”.
为了感受分词器的工作原理，可以试试这个互动网站。
def get_gpt2_tokenizer():
    """Return the "gpt2" encoding obtained from tiktoken.

    Code: https://github.com/openai/tiktoken
    You can use "cl100k_base" instead for the gpt3.5-turbo or gpt4 tokenizer.
    """
    return tiktoken.get_encoding("gpt2")
def train_bpe(string: str, num_merges: int) -> "BPETokenizerParams":  # @inspect string, @inspect num_merges
    """Learn byte-pair-encoding merges from `string`.

    The text is encoded as UTF-8 and treated as a sequence of byte values
    (indices 0-255). On each round the most frequent adjacent pair of indices
    is replaced everywhere by a fresh index (256, 257, ...). When several
    pairs share the highest count, the one that first appears in the sequence
    is chosen.

    Args:
        string: Training text.
        num_merges: Maximum number of merges to learn. Fewer are produced if
            the sequence shrinks to less than two tokens first.

    Returns:
        A BPETokenizerParams built with `vocab` (index -> bytes) and `merges`
        ((index1, index2) -> merged index).
    """
    # Start with the list of bytes of string.
    indices = list(map(int, string.encode("utf-8")))  # @inspect indices
    merges: dict[tuple[int, int], int] = {}  # index1, index2 => merged index
    vocab: dict[int, bytes] = {x: bytes([x]) for x in range(256)}  # index -> bytes

    for i in range(num_merges):
        # Count the number of occurrences of each pair of tokens
        counts = defaultdict(int)
        for index1, index2 in zip(indices, indices[1:]):  # For each adjacent pair
            counts[(index1, index2)] += 1  # @inspect counts

        # No adjacent pairs left (empty or single-token sequence): nothing more
        # can be merged, and max() on an empty mapping would raise ValueError.
        if not counts:
            break

        # Find the most common pair.
        pair = max(counts, key=counts.get)  # @inspect pair
        index1, index2 = pair

        # Merge that pair.
        new_index = 256 + i  # @inspect new_index
        merges[pair] = new_index  # @inspect merges
        vocab[new_index] = vocab[index1] + vocab[index2]  # @inspect vocab
        indices = merge(indices, pair, new_index)  # @inspect indices

    return BPETokenizerParams(vocab=vocab, merges=merges)


def merge(indices: list[int], pair: tuple[int, int], new_index: int) -> list[int]:  # @inspect indices, @inspect pair, @inspect new_index
    """Return `indices`, but with all instances of `pair` replaced with `new_index`.

    The scan runs left to right and matches do not overlap: after a match both
    of its elements are consumed, so [a, a, a] with pair (a, a) becomes
    [new_index, a]. The input list is not modified.
    """
    new_indices = []  # @inspect new_indices
    i = 0  # @inspect i
    while i < len(indices):
        if i + 1 < len(indices) and indices[i] == pair[0] and indices[i + 1] == pair[1]:
            new_indices.append(new_index)
            i += 2  # Skip both members of the matched pair
        else:
            new_indices.append(indices[i])
            i += 1
    return new_indices