以下是如何使用 tiktoken 计算 token 数量的具体步骤:
0. 安装 tiktoken
!pip install --upgrade tiktoken
1. 导入 tiktoken
import tiktoken
2. 加载编码方式
# Load a tokenizer by encoding name.
encoding = tiktoken.get_encoding("cl100k_base")  # Alternatively: encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
3. 将文本转换为 token
# Encode the sample text into a list of integer token ids.
tokens = encoding.encode("tiktoken is great!")
print(tokens)  # Output: [83, 1609, 5963, 374, 2294, 0]
4. 计算 token 数量
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens in a text string.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up,
            e.g. ``"cl100k_base"``.

    Returns:
        The length of the token list produced by encoding ``string``.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))


print(num_tokens_from_string("tiktoken is great!", "cl100k_base"))  # Output: 6
5. 将 token 转换为文本
# Decode the token ids back into text.
decoded_text = encoding.decode([83, 1609, 5963, 374, 2294, 0])
print(decoded_text)  # Output: tiktoken is great!
6. 比较不同编码方式的分词结果
def compare_encodings(example_string: str) -> None:
    """Print how several encodings tokenize the same string.

    For each of the ``gpt2``, ``p50k_base`` and ``cl100k_base`` encodings,
    prints the token count, the integer token ids, and the bytes behind
    each individual token.

    Args:
        example_string: Text to tokenize with every encoding.
    """
    print(f'\nExample string: "{example_string}"')
    for encoding_name in ("gpt2", "p50k_base", "cl100k_base"):
        enc = tiktoken.get_encoding(encoding_name)
        token_integers = enc.encode(example_string)
        # Look up the raw bytes behind each token id, one token at a time.
        token_bytes = list(map(enc.decode_single_token_bytes, token_integers))
        num_tokens = len(token_integers)
        print(f"{encoding_name}: {num_tokens} tokens")
        print(f"token integers: {token_integers}")
        print(f"token bytes: {token_bytes}")


compare_encodings("antidisestablishmentarianism")
compare_encodings("2 + 2 = 4")
compare_encodings("お誕生日おめでとう")
7. 计算 ChatGPT API 调用的 token 数量
def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    """Return the number of tokens used by a list of chat messages.

    Args:
        messages: Iterable of message dicts. Every value in each dict is
            passed to ``encoding.encode`` and counted, so values are
            presumably strings (e.g. ``role``, ``content``, ``name``).
        model: Model name used to pick the encoding and the fixed
            per-message overhead.

    Returns:
        The sum of encoded value lengths, plus the per-message overhead,
        the per-``name`` adjustment, and 3 tokens for the reply primer.

    Raises:
        Whatever ``tiktoken.encoding_for_model`` raises for a model name
        it does not recognise.
    """
    encoding = tiktoken.encoding_for_model(model)

    # Only the legacy gpt-3.5-turbo-0301 format costs 4 tokens per message
    # and drops the role token when a name is present (-1). All other chat
    # models, including the unversioned "gpt-3.5-turbo" and "gpt-4"
    # aliases, use 3 tokens per message and +1 per name. Choosing both
    # values together avoids the mismatched 4 / +1 pairing.
    if model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4
        tokens_per_name = -1
    else:
        tokens_per_message = 3
        tokens_per_name = 1

    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # extra tokens: every reply is primed with the assistant role
    return num_tokens


example_messages = [
    {"role": "system", "content": "You are a helpful, pattern-following assistant that translates corporate jargon into plain English."},
    {"role": "user", "content": "New synergies will help drive top-line growth."},
    {"role": "assistant", "content": "Things working well together will increase revenue."},
    {"role": "user", "content": "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage."},
    {"role": "assistant", "content": "Let's talk later when we're less busy about how to do better."},
    {"role": "user", "content": "This late pivot means we don't have time to boil the ocean for the client deliverable."},
]
print(num_tokens_from_messages(example_messages, model="gpt-3.5-turbo-0613"))
通过以上步骤,可以高效地使用 tiktoken 进行 token 的编码、解码及统计工作。