5E89D","--shiki-dark-font-weight":"bold"},"children":"\\r\\n\\p"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"{L}"}],["$","span",null,{"style":{"--shiki-light":"#22863A","--shiki-light-font-weight":"bold","--shiki-dark":"#85E89D","--shiki-dark-font-weight":"bold"},"children":"\\p"}],"$L79","$L7a","$L7b","$L7c","$L7d","$L7e","$L7f","$L80","$L81","$L82","$L83","$L84","$L85","$L86","$L87","$L88","$L89","$L8a","$L8b","$L8c","$L8d","$L8e","$L8f","$L90","$L91","$L92","$L93","$L94","$L95","$L96","$L97","$L98","$L99","$L9a"]}]}]}]}] 5b:["$","p",null,{"children":["它接近 GPT-4 风格的正则预分词，但对数字做了一个小调整：使用 ",["$","code",null,{"children":"\\p{N}{1,2}"}],"，让数字最多按两位一组切分。这样可以减少小词表里数字 token 占用过多空间的问题。"]}] 5c:["$","h2",null,{"className":"flex scroll-m-28 flex-row items-center gap-2","id":"5-特殊-token","children":[["$","a",null,{"data-card":"","href":"#5-特殊-token","className":"peer","children":"5. 特殊 token"}],["$","svg",null,{"ref":"$undefined","xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide size-3.5 shrink-0 text-fd-muted-foreground opacity-0 transition-opacity peer-hover:opacity-100","aria-label":"Link to section","children":[[["$","path","1cjeqo",{"d":"M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"}],["$","path","19qd67",{"d":"M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"}]],"$undefined"]}]]}] 5d:["$","p",null,{"children":[["$","code",null,{"children":"cookllm-bento"}]," 定义了 17 个特殊 token："]}] 5e:["$","ul",null,{"children":["\n",["$","li",null,{"children":["文本边界：",["$","code",null,{"children":"<|endoftext|>"}]]}],"\n",["$","li",null,{"children":["ChatML：",["$","code",null,{"children":"<|im_start|>"}],"、",["$","code",null,{"children":"<|im_end|>"}]]}],"\n",["$","li",null,{"children":["思考与答案：",["$","code",null,{"children":""}],"、",["$","code",null,{"children":""}],"、",["$","code",null,{"children":""}]]}],"\n",["$","li",null,{"children":["多模态占位：",["$","code",null,{"children":"<|vision_start|>"}],"、",["$","code",null,{"children":"<|vision_end|>"}],"、",["$","code",null,{"children":"<|image_pad|>"}]]}],"\n",["$","li",null,{"children":["Grounding：",["$","code",null,{"children":"<|object_ref_start|>"}],"、",["$","code",null,{"children":"<|object_ref_end|>"}],"、",["$","code",null,{"children":"<|box_start|>"}],"、",["$","code",null,{"children":"<|box_end|>"}]]}],"\n",["$","li",null,{"children":["工具调用：",["$","code",null,{"children":""}],"、",["$","code",null,{"children":""}],"、",["$","code",null,{"children":""}],"、",["$","code",null,{"children":""}]]}],"\n"]}] 5f:["$","p",null,{"children":["训练时的 ",["$","code",null,{"children":"--vocab-size 8192"}]," 是总词表大小。脚本会先用 ",["$","code",null,{"children":"8192 - 17"}]," 个位置训练普通 BPE token，再把 17 个特殊 token 追加到词表末尾。"]}] 60:["$","h2",null,{"className":"flex scroll-m-28 flex-row items-center gap-2","id":"6-训练后的检查","children":[["$","a",null,{"data-card":"","href":"#6-训练后的检查","className":"peer","children":"6. 训练后的检查"}],["$","svg",null,{"ref":"$undefined","xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide size-3.5 shrink-0 text-fd-muted-foreground opacity-0 transition-opacity peer-hover:opacity-100","aria-label":"Link to section","children":[[["$","path","1cjeqo",{"d":"M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"}],["$","path","19qd67",{"d":"M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"}]],"$undefined"]}]]}] 61:["$",

Tokenizer 训练

内容正在烹饪中...

目录