如何使用 tiktoken 计算 token 数量的具体步骤

47 min read

以下是使用 tiktoken 计算 token 数量的具体步骤：

0. 安装 tiktoken（下面的命令适用于 Jupyter Notebook；在终端中运行时请去掉开头的 `!`）

!pip install --upgrade tiktoken

1. 导入 tiktoken

import tiktoken

2. 加载编码方式

# Load an encoding directly by its name.
encoding = tiktoken.get_encoding("cl100k_base")
# Or look up the encoding by model name; this rebinds `encoding`,
# replacing the value assigned on the line above.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

3. 将文本转换为 token

# Encode the text into a list of integer token ids and show it.
tokens = encoding.encode("tiktoken is great!")
print(tokens)  # Output: [83, 1609, 5963, 374, 2294, 0]

4. 计算 token 数量

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that the named encoding produces for ``string``.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to ``tiktoken.get_encoding``
            (for example ``"cl100k_base"``).

    Returns:
        The length of the token list produced by encoding ``string``.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

# Example: count the tokens of a short string under the cl100k_base encoding.
print(num_tokens_from_string("tiktoken is great!", "cl100k_base"))  # Output: 6

5. 将 token 转换为文本

# Decode a list of token ids back into text with the `encoding` object
# loaded in the earlier step.
decoded_text = encoding.decode([83, 1609, 5963, 374, 2294, 0])
print(decoded_text)  # Output: tiktoken is great!

6. 比较不同编码方式的分词结果

def compare_encodings(example_string: str) -> None:
    """Print how several encodings tokenize the same string.

    For each of the "gpt2", "p50k_base" and "cl100k_base" encodings this
    prints the token count, the integer token ids, and the raw bytes of
    every individual token.

    Args:
        example_string: Text to tokenize with each encoding.
    """
    print(f'\nExample string: "{example_string}"')
    for encoding_name in ("gpt2", "p50k_base", "cl100k_base"):
        codec = tiktoken.get_encoding(encoding_name)
        ids = codec.encode(example_string)
        # Look up the byte sequence behind each token id, one id at a time.
        pieces = list(map(codec.decode_single_token_bytes, ids))
        print(f"{encoding_name}: {len(ids)} tokens")
        print(f"token integers: {ids}")
        print(f"token bytes: {pieces}")

# Run the comparison on three inputs: a long English word, an arithmetic
# expression, and a Japanese phrase.
compare_encodings("antidisestablishmentarianism")
compare_encodings("2 + 2 = 4")
compare_encodings("お誕生日おめでとう")

7. 计算 ChatGPT API 调用的 token 数量

def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    """Estimate how many tokens a list of chat messages consumes.

    Args:
        messages: Iterable of message dicts such as
            ``{"role": "user", "content": "..."}``. Every value is passed to
            ``encoding.encode`` and is therefore expected to be a string.
        model: Model name handed to ``tiktoken.encoding_for_model`` and used
            to select the per-message overhead constants.

    Returns:
        The sum of the encoded length of every value in every message, a
        fixed per-message overhead, a per-``name`` adjustment, and 3 extra
        tokens for the assistant-primed reply.
    """
    encoding = tiktoken.encoding_for_model(model)

    # Per the OpenAI Cookbook token-counting recipe, only the legacy
    # gpt-3.5-turbo-0301 format costs 4 tokens per message and adjusts by -1
    # when a "name" key is present; other chat models use 3 and +1. Both
    # constants come from the same check so they cannot disagree for
    # unversioned names such as "gpt-3.5-turbo" or "gpt-4".
    is_legacy_format = model == "gpt-3.5-turbo-0301"
    tokens_per_message = 4 if is_legacy_format else 3
    tokens_per_name = -1 if is_legacy_format else 1

    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # extra tokens because every reply is primed with the assistant
    return num_tokens

# Sample conversation: one system prompt followed by alternating user and
# assistant turns, ending on a user message.
example_messages = [
    {"role": "system", "content": "You are a helpful, pattern-following assistant that translates corporate jargon into plain English."},
    {"role": "user", "content": "New synergies will help drive top-line growth."},
    {"role": "assistant", "content": "Things working well together will increase revenue."},
    {"role": "user", "content": "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage."},
    {"role": "assistant", "content": "Let's talk later when we're less busy about how to do better."},
    {"role": "user", "content": "This late pivot means we don't have time to boil the ocean for the client deliverable."},
]

# Print the token count of the sample conversation for gpt-3.5-turbo-0613.
print(num_tokens_from_messages(example_messages, model="gpt-3.5-turbo-0613"))

通过以上步骤，可以高效地使用 tiktoken 进行 token 的编码、解码及统计工作。