{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro_md",
   "metadata": {},
   "source": [
    "# Chapter 2: encoding a sentence pair with `BertTokenizer`\n",
    "\n",
    "Loads the `bert-base-chinese` tokenizer, encodes the first two sample sentences as one pair with `tokenizer.encode`, and decodes the result back to text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Chapter 2 / load the tokenizer\n",
    "from transformers import BertTokenizer\n",
    "\n",
    "tokenizer = BertTokenizer.from_pretrained(\n",
    "    pretrained_model_name_or_path='bert-base-chinese',\n",
    "    cache_dir=None,\n",
    "    force_download=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c21ec1248b8a72a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chapter 2 / prepare the experiment data\n",
    "sents = [\n",
    "    '你站在桥上看风景',\n",
    "    '看风景的人在楼上看你',\n",
    "    '明月装饰了你的窗子',\n",
    "    '你装饰了别人的梦',\n",
    "]\n",
    "\n",
    "# Value passed to the encoder as max_length (used for both truncation and padding).\n",
    "MAX_LENGTH = 25"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "encode_pair",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chapter 2 / basic encoding function\n",
    "token_ids = tokenizer.encode(\n",
    "    text=sents[0],\n",
    "    text_pair=sents[1],\n",
    "    # Truncate when the sequence is longer than max_length.\n",
    "    truncation=True,\n",
    "    # Always pad with PAD up to max_length.\n",
    "    padding='max_length',\n",
    "    add_special_tokens=True,\n",
    "    max_length=MAX_LENGTH,\n",
    "    return_tensors=None,\n",
    ")\n",
    "print(token_ids)\n",
    "print(tokenizer.decode(token_ids))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}