",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"100257"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"# 文档结束标记"}]]}],"\n",["$","span",null,{"className":"line","children":[["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":" \"<|fim_prefix|>\""}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":": "}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"100258"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"# Fill-in-the-middle 前缀"}]]}],"\n","$Lf8","\n","$Lf9","\n","$Lfa","\n","$Lfb","\n","$Lfc","\n","$Lfd","\n","$Lfe","\n","$Lff","\n","$L100"]}]}]}] 7b:["$","p",null,{"children":"输出："}] 7c:["$","$L43",null,{"className":"shiki shiki-themes github-light github-dark","style":{"--shiki-light":"#24292e","--shiki-dark":"#e1e4e8","--shiki-light-bg":"#fff","--shiki-dark-bg":"#24292e"},"tabIndex":"0","icon":"","children":["$","$L90",null,{"children":["$","code",null,{"children":["$","span",null,{"className":"line","children":["$","span",null,{"children":"[9906, 100257, 10343]"}]}]}]}]}] 7d:["$","p",null,{"children":["可以看到 ",["$","code",null,{"children":"<|endoftext|>"}]," 被编码为单个 token ",["$","code",null,{"children":"100257"}],"，而不是被拆分成多个普通 tokens。"]}] 7e:["$","p",null,{"children":"特殊 tokens 的作用："}] 7f:["$","ul",null,{"children":["\n",["$","li",null,{"children":[["$","strong",null,{"children":"文档分隔"}],"：",["$","code",null,{"children":"<|endoftext|>"}]," 用于分隔不同的文档"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"任务标记"}],"：",["$","code",null,{"children":"<|fim_*|>"}]," 用于代码补全任务"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"控制信号"}],"：",["$","code",null,{"children":"<|endofprompt|>"}]," 用于区分 prompt 和生成内容"]}],"\n"]}] 80:["$","h3",null,{"className":"flex scroll-m-28 flex-row items-center gap-2","id":"处理特殊-token","children":[["$","a",null,{"data-card":"","href":"#处理特殊-token","className":"peer","children":"处理特殊 Token"}],["$","svg",null,{"ref":"$undefined","xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide size-3.5 shrink-0 text-fd-muted-foreground opacity-0 transition-opacity peer-hover:opacity-100","aria-label":"Link to section","children":[[["$","path","1cjeqo",{"d":"M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"}],["$","path","19qd67",{"d":"M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"}]],"$undefined"]}]]}] 101:T5c0,

GPT 系列 Tokenizer

登录以继续阅读

目录

GPT 系列 Tokenizer

登录以继续阅读

目录