diff --git a/.gitattributes b/.gitattributes
index c7d9f3332a950355d5a77d85000f05e6f45435ea..cfcb06bd85b01c1939ab7f25e1b484e1c079825e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+final_test.json filter=lfs diff=lfs merge=lfs -text
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..ac4aee55c0698300d21541d5395d452016585f7a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright Zhengxiao Du
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
\ No newline at end of file
diff --git a/MODEL_LICENSE b/MODEL_LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a43fae25a4146b784f761be6db87fbb18d783fa7
--- /dev/null
+++ b/MODEL_LICENSE
@@ -0,0 +1,33 @@
+The GLM-130B License
+
+1. Definitions
+
+“Licensor” means the GLM-130B Model Team that distributes its Software.
+
+“Software” means the GLM-130B model parameters made available under this license.
+
+2. License Grant
+
+Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+3. Restriction
+
+You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.
+
+You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
+
+4. Disclaimer
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+5. Limitation of Liability
+
+EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+6. Dispute Resolution
+
+This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
+
+Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com.
\ No newline at end of file
diff --git a/PROJECT.md b/PROJECT.md
new file mode 100644
index 0000000000000000000000000000000000000000..47982597d1af538e31af87d1286b17e4b072cd87
--- /dev/null
+++ b/PROJECT.md
@@ -0,0 +1,18 @@
+# Related Projects
+
+The following are some open-source projects developed based on this repository:
+* [SwissArmyTransformer](https://github.com/THUDM/SwissArmyTransformer): a unified programming framework for Transformers; ChatGLM-6B has been implemented in SAT and can be fine-tuned with P-tuning.
+* [ChatGLM-MNN](https://github.com/wangzhaode/ChatGLM-MNN): an MNN-based C++ inference implementation of ChatGLM-6B that automatically allocates computing tasks to the GPU and CPU according to the available GPU memory
+* [ChatGLM-Tuning](https://github.com/mymusise/ChatGLM-Tuning): fine-tuning ChatGLM-6B with LoRA. Similar projects include [Humanable ChatGLM/GPT Fine-tuning | ChatGLM 微调](https://github.com/hscspring/hcgf)
+* [langchain-ChatGLM](https://github.com/imClumsyPanda/langchain-ChatGLM): a ChatGLM application over local knowledge bases, built on LangChain
+* [bibliothecarius](https://github.com/coderabbit214/bibliothecarius): quickly build services that integrate your local data with AI models, with support for locally deployed models such as ChatGLM.
+* [闻达 (Wenda)](https://github.com/l15y/wenda): a large language model invocation platform that implements ChatPDF-like functionality on top of ChatGLM-6B
+* [JittorLLMs](https://github.com/Jittor/JittorLLMs): runs ChatGLM-6B FP16 with as little as 3GB of GPU memory, or even without a GPU; supports deployment on Linux, Windows, and Mac
+* [ChatGLM-Finetuning](https://github.com/liucongg/ChatGLM-Finetuning): fine-tuning ChatGLM-6B on specific downstream tasks, covering Freeze, LoRA, P-tuning, etc., with experimental comparisons.
+* [InstructGLM](https://github.com/yanqiangmiffy/InstructGLM): instruction tuning based on ChatGLM-6B; aggregates open-source Chinese and English instruction data, fine-tunes on it with LoRA, releases LoRA weights fine-tuned on Alpaca and BELLE, and fixes the repetition issue in web_demo
+* [ChatGLM-web](https://github.com/NCZkevin/chatglm-web): a ChatGLM demo website built with FastAPI and Vue3 (supports streaming output, adjusting model parameters from the front end, context selection, saving images, knowledge-base QA, and more)
+* [glm-bot](https://github.com/initialencounter/glm-bot): connects ChatGLM to Koishi so that ChatGLM can be called on major chat platforms
+
+The following are some tutorials/documents for this project:
+* [Windows deployment guide](https://github.com/ZhangErling/ChatGLM-6B/blob/main/deployment_windows.md)
+* [ChatGLM-6B deployment and fine-tuning tutorial @ModelWhale platform](https://www.heywhale.com/mw/project/6436d82948f7da1fee2be59e)
diff --git a/README.md b/README.md
index e7d90ddb789e264c5ea4a99dccd0b911883df4c5..def8e04a1fb779c5990562dd1a8d1a0646230aee 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,85 @@
---
-license: unknown
+language:
+- zh
+- en
+tags:
+- glm
+- chatglm
+- thudm
---
+# ChatGLM-6B
+
+ 🌐 Blog • 💻 Github Repo • 🐦 Twitter • 📃 [GLM@ACL 22] [GitHub] • 📃 [GLM-130B@ICLR 23] [GitHub]
+
+
+
+ 👋 Join our Slack and WeChat
+
+
+## 介绍
+ChatGLM-6B 是一个开源的、支持中英双语问答的对话语言模型,基于 [General Language Model (GLM)](https://github.com/THUDM/GLM) 架构,具有 62 亿参数。结合模型量化技术,用户可以在消费级的显卡上进行本地部署(INT4 量化级别下最低只需 6GB 显存)。ChatGLM-6B 使用了和 [ChatGLM](https://chatglm.cn) 相同的技术,针对中文问答和对话进行了优化。经过约 1T 标识符的中英双语训练,辅以监督微调、反馈自助、人类反馈强化学习等技术的加持,62 亿参数的 ChatGLM-6B 已经能生成相当符合人类偏好的回答。
+
+ChatGLM-6B is an open bilingual language model based on the [General Language Model (GLM)](https://github.com/THUDM/GLM) framework, with 6.2 billion parameters. With the quantization technique, users can deploy locally on consumer-grade graphics cards (only 6GB of GPU memory is required at the INT4 quantization level). ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue. The model is trained on about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning, feedback bootstrap, and reinforcement learning with human feedback. With only about 6.2 billion parameters, the model is able to generate answers that are in line with human preference.
+
+## 软件依赖
+
+```shell
+pip install protobuf==3.20.0 transformers==4.27.1 icetk cpm_kernels
+```
+
+## 代码调用
+
+可以通过如下代码调用 ChatGLM-6B 模型来生成对话:
+
+```ipython
+>>> from transformers import AutoTokenizer, AutoModel
+>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+>>> model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+>>> response, history = model.chat(tokenizer, "你好", history=[])
+>>> print(response)
+你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。
+>>> response, history = model.chat(tokenizer, "晚上睡不着应该怎么办", history=history)
+>>> print(response)
+晚上睡不着可能会让你感到焦虑或不舒服,但以下是一些可以帮助你入睡的方法:
+
+1. 制定规律的睡眠时间表:保持规律的睡眠时间表可以帮助你建立健康的睡眠习惯,使你更容易入睡。尽量在每天的相同时间上床,并在同一时间起床。
+2. 创造一个舒适的睡眠环境:确保睡眠环境舒适,安静,黑暗且温度适宜。可以使用舒适的床上用品,并保持房间通风。
+3. 放松身心:在睡前做些放松的活动,例如泡个热水澡,听些轻柔的音乐,阅读一些有趣的书籍等,有助于缓解紧张和焦虑,使你更容易入睡。
+4. 避免饮用含有咖啡因的饮料:咖啡因是一种刺激性物质,会影响你的睡眠质量。尽量避免在睡前饮用含有咖啡因的饮料,例如咖啡,茶和可乐。
+5. 避免在床上做与睡眠无关的事情:在床上做些与睡眠无关的事情,例如看电影,玩游戏或工作等,可能会干扰你的睡眠。
+6. 尝试呼吸技巧:深呼吸是一种放松技巧,可以帮助你缓解紧张和焦虑,使你更容易入睡。试着慢慢吸气,保持几秒钟,然后缓慢呼气。
+
+如果这些方法无法帮助你入睡,你可以考虑咨询医生或睡眠专家,寻求进一步的建议。
+```
+
+关于更多的使用说明,包括如何运行命令行和网页版本的 DEMO,以及使用模型量化以节省显存,请参考我们的 [Github Repo](https://github.com/THUDM/ChatGLM-6B)。
+
+For more instructions, including how to run CLI and web demos, and model quantization, please refer to our [Github Repo](https://github.com/THUDM/ChatGLM-6B).
+
+## 协议
+
+本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源,ChatGLM-6B 模型的权重的使用则需要遵循 [Model License](MODEL_LICENSE)。
+
+## 引用
+
+如果你觉得我们的工作有帮助的话,请考虑引用下列论文:
+
+```
+@inproceedings{
+ zeng2023glm-130b,
+ title={{GLM}-130B: An Open Bilingual Pre-trained Model},
+ author={Aohan Zeng and Xiao Liu and Zhengxiao Du and Zihan Wang and Hanyu Lai and Ming Ding and Zhuoyi Yang and Yifan Xu and Wendi Zheng and Xiao Xia and Weng Lam Tam and Zixuan Ma and Yufei Xue and Jidong Zhai and Wenguang Chen and Zhiyuan Liu and Peng Zhang and Yuxiao Dong and Jie Tang},
+ booktitle={The Eleventh International Conference on Learning Representations (ICLR)},
+ year={2023},
+ url={https://openreview.net/forum?id=-Aw0rrrPUF}
+}
+```
+```
+@inproceedings{du2022glm,
+ title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
+ author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
+ booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+ pages={320--335},
+ year={2022}
+}
+```
\ No newline at end of file
diff --git a/README_en.md b/README_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..93b2ee29c24d71dd83fd91a59a0bc46c552856f1
--- /dev/null
+++ b/README_en.md
@@ -0,0 +1,275 @@
+# ChatGLM-6B
+
+
+
+ 🌐 Blog • 🤗 HF Repo • 🐦 Twitter • 📃 [GLM@ACL 22] [GitHub] • 📃 [GLM-130B@ICLR 23] [GitHub]
+
+
+ 👋 Join our Slack and WeChat
+
+
+## Introduction
+
+ChatGLM-6B is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework, with 6.2 billion parameters. With the quantization technique, users can deploy locally on consumer-grade graphics cards (only 6GB of GPU memory is required at the INT4 quantization level).
+
+ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue. The model is trained on about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning, feedback bootstrap, and reinforcement learning with human feedback. With only about 6.2 billion parameters, the model is able to generate answers that are in line with human preference.
+
+Try the [online demo](https://huggingface.co/spaces/ysharma/ChatGLM-6b_Gradio_Streaming) on Huggingface Spaces.
+
+## Update
+**[2023/03/31]** Added a parameter-efficient tuning implementation based on [P-Tuning-v2](https://github.com/THUDM/P-tuning-v2). At the INT4 quantization level, as little as 7GB of GPU memory is enough for model tuning. See [Parameter-efficient tuning method](ptuning/README.md) for details.
+
+**[2023/03/23]** Add API deployment, thanks to [@LemonQu-GIT](https://github.com/LemonQu-GIT). Add embedding-quantized model [ChatGLM-6B-INT4-QE](https://huggingface.co/THUDM/chatglm-6b-int4-qe). Add support for GPU inference on Mac with Apple Silicon.
+
+**[2023/03/19]** Add the streaming output function `stream_chat`, already used in the web and CLI demos. Fix Chinese punctuation in the output. Add the quantized model [ChatGLM-6B-INT4](https://huggingface.co/THUDM/chatglm-6b-int4).
+
+## Projects
+The following are some open source projects developed based on this repository:
+* [ChatGLM-MNN](https://github.com/wangzhaode/ChatGLM-MNN): An [MNN](https://github.com/alibaba/MNN)-based C++ inference implementation of ChatGLM-6B, which automatically allocates computing tasks to the GPU and CPU according to the available GPU memory
+* [ChatGLM-Tuning](https://github.com/mymusise/ChatGLM-Tuning): Fine-tuning ChatGLM-6B based on LoRA
+
+If you have other good projects, please add them to the README in the format above and open a [PR](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork).
+
+## Getting Started
+
+### Hardware Requirements
+
+| **Quantization Level** | **GPU Memory** |
+|------------------------|----------------|
+| FP16 (no quantization) | 13 GB |
+| INT8 | 10 GB |
+| INT4 | 6 GB |
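+
+As a rough sketch, the table corresponds to the following load calls (the `quantize` method is the one documented in the Quantization section below; memory figures are approximate):
+
+```python
+from transformers import AutoTokenizer, AutoModel
+
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+
+# FP16 (no quantization), ~13 GB of GPU memory
+model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+# INT8, ~10 GB
+# model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().quantize(8).cuda()
+# INT4, ~6 GB
+# model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().quantize(4).cuda()
+```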
+
+### Environment Setup
+
+Install the requirements with pip: `pip install -r requirements.txt`. The recommended `transformers` version is `4.27.1`, but theoretically any version no lower than `4.23.1` is acceptable.
+
+In addition, if you need to run the quantized model on the CPU, you also need to install `gcc` and `openmp`. Most Linux distributions have them installed by default. For Windows, you can check `openmp` when installing [TDM-GCC](https://jmeubank.github.io/tdm-gcc/). In our Windows test environment the `gcc` version is `TDM-GCC 10.3.0`, and on Linux it is `gcc 11.3.0`.
+
+### Usage
+
+Generate dialogue with the following code
+
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+>>> model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+>>> model = model.eval()
+>>> response, history = model.chat(tokenizer, "你好", history=[])
+>>> print(response)
+你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。
+>>> response, history = model.chat(tokenizer, "晚上睡不着应该怎么办", history=history)
+>>> print(response)
+晚上睡不着可能会让你感到焦虑或不舒服,但以下是一些可以帮助你入睡的方法:
+
+1. 制定规律的睡眠时间表:保持规律的睡眠时间表可以帮助你建立健康的睡眠习惯,使你更容易入睡。尽量在每天的相同时间上床,并在同一时间起床。
+2. 创造一个舒适的睡眠环境:确保睡眠环境舒适,安静,黑暗且温度适宜。可以使用舒适的床上用品,并保持房间通风。
+3. 放松身心:在睡前做些放松的活动,例如泡个热水澡,听些轻柔的音乐,阅读一些有趣的书籍等,有助于缓解紧张和焦虑,使你更容易入睡。
+4. 避免饮用含有咖啡因的饮料:咖啡因是一种刺激性物质,会影响你的睡眠质量。尽量避免在睡前饮用含有咖啡因的饮料,例如咖啡,茶和可乐。
+5. 避免在床上做与睡眠无关的事情:在床上做些与睡眠无关的事情,例如看电影,玩游戏或工作等,可能会干扰你的睡眠。
+6. 尝试呼吸技巧:深呼吸是一种放松技巧,可以帮助你缓解紧张和焦虑,使你更容易入睡。试着慢慢吸气,保持几秒钟,然后缓慢呼气。
+
+如果这些方法无法帮助你入睡,你可以考虑咨询医生或睡眠专家,寻求进一步的建议。
+```
+
+The full model implementation is on [HuggingFace Hub](https://huggingface.co/THUDM/chatglm-6b).
+
+### Demo
+
+We provide a Web demo based on [Gradio](https://gradio.app) and a command line demo in the repo. First clone our repo with:
+
+```shell
+git clone https://github.com/THUDM/ChatGLM-6B
+cd ChatGLM-6B
+```
+
+#### Web Demo
+
+![web-demo](resources/web-demo.png)
+
+Install Gradio with `pip install gradio`, and run [web_demo.py](web_demo.py):
+
+```shell
+python web_demo.py
+```
+
+The program runs a web server and outputs the URL. Open the URL in the browser to use the web demo.
+
+#### CLI Demo
+
+![cli-demo](resources/cli-demo.png)
+
+Run [cli_demo.py](cli_demo.py) in the repo:
+
+```shell
+python cli_demo.py
+```
+
+The command runs an interactive program in the shell. Type your instruction in the shell and hit enter to generate the response. Type `clear` to clear the dialogue history and `stop` to terminate the program.
+
+## API Deployment
+First install the additional dependencies with `pip install fastapi uvicorn`. Then run [api.py](api.py) in the repo.
+```shell
+python api.py
+```
+By default the API runs on port `8000` of the local machine. You can call the API via
+```shell
+curl -X POST "http://127.0.0.1:8000" \
+ -H 'Content-Type: application/json' \
+ -d '{"prompt": "你好", "history": []}'
+```
+The returned value is
+```json
+{
+ "response":"你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。",
+ "history":[["你好","你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。"]],
+ "status":200,
+ "time":"2023-03-23 21:38:40"
+}
+```
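+
+Equivalently, here is a minimal Python client sketch (it assumes the `requests` package is installed; the accepted fields match the handler in [api.py](api.py), with `max_length`, `top_p` and `temperature` being optional):
+
+```python
+import requests
+
+# Minimal sketch of a client for the server started by api.py.
+resp = requests.post("http://127.0.0.1:8000", json={"prompt": "你好", "history": []})
+data = resp.json()
+print(data["response"])      # the model's reply
+history = data["history"]    # send this back in the next request to keep the dialogue context
+```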
+
+## Deployment
+
+### Quantization
+
+By default, the model parameters are loaded with FP16 precision, which requires about 13GB of GPU memory. If your GPU memory is limited, you can try loading the model parameters with quantization:
+
+```python
+# Change according to your hardware. Only support 4/8 bit quantization now.
+model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().quantize(4).cuda()
+```
+
+After 2 to 3 rounds of dialogue, the GPU memory usage is about 10GB under 8-bit quantization, and only 6GB under 4-bit quantization. As the number of dialogue rounds increases, the corresponding GPU memory consumption also increases. Due to the use of relative position encoding, ChatGLM-6B theoretically supports an infinitely long context-length, but the performance will gradually decline after the total length exceeds 2048 (training length).
+
+Model quantization brings some performance degradation. After testing, ChatGLM-6B can still generate naturally and smoothly under 4-bit quantization. Quantization schemes such as [GPT-Q](https://arxiv.org/abs/2210.17323) may further reduce the quantization precision or improve model performance at the same precision; you are welcome to submit corresponding Pull Requests.
+
+**[2023/03/19]** The quantization above first loads the FP16 model into CPU memory, which costs about 13GB. If your CPU memory is limited, you can directly load the pre-quantized model instead, which costs only 5.2GB of CPU memory:
+```python
+model = AutoModel.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True).half().cuda()
+```
+
+### CPU Deployment
+
+If your computer is not equipped with a GPU, you can also run inference on the CPU, but the inference speed is slow (and it takes about 32GB of memory):
+
+```python
+model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).float()
+```
+
+**[2023/03/19]** If your CPU memory is limited, you can directly load the quantized model:
+```python
+model = AutoModel.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True).float()
+```
+
+If you encounter the error `Could not find module 'nvcuda.dll'` or `RuntimeError: Unknown platform: darwin` (macOS), please refer to this [Issue](https://github.com/THUDM/ChatGLM-6B/issues/6#issuecomment-1470060041).
+
+### GPU Inference on Mac
+For Macs (and MacBooks) with Apple Silicon, it is possible to use the MPS backend to run ChatGLM-6B on the GPU. First, you need to refer to Apple's [official instructions](https://developer.apple.com/metal/pytorch) to install PyTorch-Nightly. Then clone the model repository locally (you need to [install Git LFS](https://docs.github.com/zh/repositories/working-with-files/managing-large-files/installing-git-large-file-storage))
+```shell
+git lfs install
+git clone https://huggingface.co/THUDM/chatglm-6b
+```
+Change the code to load the model from your local path, and use the mps backend:
+```python
+model = AutoModel.from_pretrained("your local path", trust_remote_code=True).half().to('mps')
+```
+Then you can use GPU-accelerated model inference on Mac.
+
+### Multi-GPU Deployment
+If you have multiple GPUs, but the memory size of each GPU is not sufficient to accommodate the entire model, you can split the model across multiple GPUs.
+
+First, install accelerate: `pip install accelerate`, and then load the model using the following method:
+```python
+from utils import load_model_on_gpus
+model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
+```
+
+This will deploy the model onto two GPUs for inference. You can change `num_gpus` to the number of GPUs you want to use. By default, the model is split evenly, but you can also specify the `device_map` parameter to customize the splitting.
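+
+For example, here is a sketch of a customized split. The `device_map` keys below are illustrative assumptions based on the module names in [modeling_chatglm.py](modeling_chatglm.py); adjust the placement to your own hardware:
+
+```python
+from utils import load_model_on_gpus
+
+# Keep the embeddings, final layer norm and output head on GPU 0, and split the
+# 28 transformer layers evenly between GPU 0 and GPU 1 (module names are assumptions).
+device_map = {"transformer.word_embeddings": 0, "transformer.final_layernorm": 0, "lm_head": 0}
+for i in range(28):
+    device_map[f"transformer.layers.{i}"] = 0 if i < 14 else 1
+
+model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2, device_map=device_map)
+```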
+
+## Parameter-efficient Tuning
+Parameter-efficient tuning based on [P-tuning v2](https://github.com/THUDM/P-tuning-v2). See [ptuning/README.md](ptuning/README.md) for details on how to use it.
+
+## ChatGLM-6B Examples
+
+The following are some Chinese examples generated with `web_demo.py`. You are welcome to explore more possibilities with ChatGLM-6B.
+
+Self Cognition
+
+![](examples/self-introduction.png)
+
+
+
+Outline
+
+![](examples/blog-outline.png)
+
+
+
+Ad
+
+![](examples/ad-writing-2.png)
+
+![](examples/comments-writing.png)
+
+
+
+Email
+
+![](examples/email-writing-1.png)
+
+![](examples/email-writing-2.png)
+
+
+
+Information Extraction
+
+![](examples/information-extraction.png)
+
+
+
+Role Play
+
+![](examples/role-play.png)
+
+
+
+Comparison
+
+![](examples/sport.png)
+
+
+
+Travel Guide
+
+![](examples/tour-guide.png)
+
+
+
+## License
+
+This repository is licensed under the [Apache-2.0 License](LICENSE). The use of the ChatGLM-6B model weights is subject to the [Model License](MODEL_LICENSE).
+
+## Citation
+
+If you find our work useful, please consider citing the following papers:
+
+```
+@inproceedings{
+ zeng2023glm-130b,
+ title={{GLM}-130B: An Open Bilingual Pre-trained Model},
+ author={Aohan Zeng and Xiao Liu and Zhengxiao Du and Zihan Wang and Hanyu Lai and Ming Ding and Zhuoyi Yang and Yifan Xu and Wendi Zheng and Xiao Xia and Weng Lam Tam and Zixuan Ma and Yufei Xue and Jidong Zhai and Wenguang Chen and Zhiyuan Liu and Peng Zhang and Yuxiao Dong and Jie Tang},
+ booktitle={The Eleventh International Conference on Learning Representations (ICLR)},
+ year={2023},
+ url={https://openreview.net/forum?id=-Aw0rrrPUF}
+}
+```
+
+```
+@inproceedings{du2022glm,
+ title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
+ author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
+ booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+ pages={320--335},
+ year={2022}
+}
+```
diff --git a/api.py b/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..693c70acc4adf397375ea8b24660f9592072809f
--- /dev/null
+++ b/api.py
@@ -0,0 +1,56 @@
+from fastapi import FastAPI, Request
+from transformers import AutoTokenizer, AutoModel
+import uvicorn, json, datetime
+import torch
+
+DEVICE = "cuda"
+DEVICE_ID = "0"
+CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE
+
+
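+# Free cached GPU memory on the configured device; called after each request so a
+# long-running API process does not keep holding on to the CUDA allocator cache.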
+def torch_gc():
+ if torch.cuda.is_available():
+ with torch.cuda.device(CUDA_DEVICE):
+ torch.cuda.empty_cache()
+ torch.cuda.ipc_collect()
+
+
+app = FastAPI()
+
+
+@app.post("/")
+async def create_item(request: Request):
+ global model, tokenizer
+    json_post = await request.json()  # the request body is already parsed into a dict
+    prompt = json_post.get('prompt')
+    history = json_post.get('history')
+    max_length = json_post.get('max_length')
+    top_p = json_post.get('top_p')
+    temperature = json_post.get('temperature')
+ response, history = model.chat(tokenizer,
+ prompt,
+ history=history,
+ max_length=max_length if max_length else 2048,
+ top_p=top_p if top_p else 0.7,
+ temperature=temperature if temperature else 0.95)
+ now = datetime.datetime.now()
+ time = now.strftime("%Y-%m-%d %H:%M:%S")
+ answer = {
+ "response": response,
+ "history": history,
+ "status": 200,
+ "time": time
+ }
+ log = "[" + time + "] " + '", prompt:"' + prompt + '", response:"' + repr(response) + '"'
+ print(log)
+ torch_gc()
+ return answer
+
+
+if __name__ == '__main__':
+ tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+ model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+ model.eval()
+ uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
diff --git a/cli_demo.py b/cli_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..7247456c7a814cc272506b4947ee3d29fa8975b8
--- /dev/null
+++ b/cli_demo.py
@@ -0,0 +1,57 @@
+import os
+import platform
+import signal
+from transformers import AutoTokenizer, AutoModel
+
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().quantize(4).cuda()
+model = model.eval()
+
+os_name = platform.system()
+clear_command = 'cls' if os_name == 'Windows' else 'clear'
+stop_stream = False
+
+
+def build_prompt(history):
+ prompt = "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序"
+ for query, response in history:
+ prompt += f"\n\n用户:{query}"
+ prompt += f"\n\nChatGLM-6B:{response}"
+ return prompt
+
+
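+# Ctrl-C sets stop_stream; the streaming loop in main() checks the flag and stops the current
+# generation without exiting the program (the handler is registered inside that loop).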
+def signal_handler(signal, frame):
+ global stop_stream
+ stop_stream = True
+
+
+def main():
+ history = []
+ global stop_stream
+ print("欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
+ while True:
+ query = input("\n用户:")
+ if query.strip() == "stop":
+ break
+ if query.strip() == "clear":
+ history = []
+ os.system(clear_command)
+ print("欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
+ continue
+ count = 0
+ for response, history in model.stream_chat(tokenizer, query, history=history):
+ if stop_stream:
+ stop_stream = False
+ break
+ else:
+ count += 1
+ if count % 8 == 0:
+ os.system(clear_command)
+ print(build_prompt(history), flush=True)
+ signal.signal(signal.SIGINT, signal_handler)
+ os.system(clear_command)
+ print(build_prompt(history), flush=True)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ab3caf549118f7baa491467f459eea52c555220
--- /dev/null
+++ b/config.json
@@ -0,0 +1,28 @@
+{
+ "_name_or_path": "THUDM/chatglm-6b",
+ "architectures": [
+ "ChatGLMModel"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_chatglm.ChatGLMConfig",
+ "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
+ "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
+ },
+ "bos_token_id": 130004,
+ "eos_token_id": 130005,
+ "mask_token_id": 130000,
+ "gmask_token_id": 130001,
+ "pad_token_id": 3,
+ "hidden_size": 4096,
+ "inner_hidden_size": 16384,
+ "layernorm_epsilon": 1e-05,
+ "max_sequence_length": 2048,
+ "model_type": "chatglm",
+ "num_attention_heads": 32,
+ "num_layers": 28,
+ "position_encoding_2d": true,
+ "torch_dtype": "float16",
+ "transformers_version": "4.23.1",
+ "use_cache": true,
+ "vocab_size": 130528
+}
diff --git a/configuration_chatglm.py b/configuration_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..78f3425d5f63ad43f31b092b8d62b44d28d52f15
--- /dev/null
+++ b/configuration_chatglm.py
@@ -0,0 +1,103 @@
+""" ChatGLM model configuration """
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class ChatGLMConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`~ChatGLMModel`].
+    It is used to instantiate a ChatGLM model according to the specified arguments, defining the model
+ architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+ the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used
+ to control the model outputs. Read the documentation from [`PretrainedConfig`]
+ for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 150528):
+ Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`~ChatGLMModel`] or
+ [`~TFChatGLMModel`].
+ hidden_size (`int`, *optional*, defaults to 4096):
+ Dimension of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 28):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ inner_hidden_size (`int`, *optional*, defaults to 16384):
+ Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        max_sequence_length (`int`, *optional*, defaults to 2048):
+ The maximum sequence length that this model might ever be used with.
+ Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+ layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `False`):
+ Whether the model should return the last key/values attentions (not used by all models).
+ Example:
+
+ ```python
+ >>> from configuration_chatglm import ChatGLMConfig
+ >>> from modeling_chatglm import ChatGLMModel
+
+ >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration
+ >>> configuration = ChatGLMConfig()
+
+ >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration
+ >>> model = ChatGLMModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+"""
+ model_type = "chatglm"
+
+ def __init__(
+ self,
+ vocab_size=150528,
+ hidden_size=4096,
+ num_layers=28,
+ num_attention_heads=32,
+ layernorm_epsilon=1e-5,
+ use_cache=False,
+ bos_token_id=150004,
+ eos_token_id=150005,
+ mask_token_id=150000,
+ gmask_token_id=150001,
+ pad_token_id=0,
+ max_sequence_length=2048,
+ inner_hidden_size=16384,
+ position_encoding_2d=True,
+ quantization_bit=0,
+ pre_seq_len=None,
+ prefix_projection=False,
+ **kwargs
+ ):
+ self.num_layers = num_layers
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_attention_heads = num_attention_heads
+ self.max_sequence_length = max_sequence_length
+ self.layernorm_epsilon = layernorm_epsilon
+ self.inner_hidden_size = inner_hidden_size
+ self.use_cache = use_cache
+ self.bos_token_id = bos_token_id
+ self.eos_token_id = eos_token_id
+ self.pad_token_id = pad_token_id
+ self.mask_token_id = mask_token_id
+ self.gmask_token_id = gmask_token_id
+ self.position_encoding_2d = position_encoding_2d
+ self.quantization_bit = quantization_bit
+ self.pre_seq_len = pre_seq_len
+ self.prefix_projection = prefix_projection
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ **kwargs
+ )
diff --git a/final_test.json b/final_test.json
new file mode 100644
index 0000000000000000000000000000000000000000..36716899e22b3ec6a36d55da3dfaade1646fe7e5
--- /dev/null
+++ b/final_test.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:966a240037bb80399d4728d89922f6390a08c2f27641d427283faa33fb22bab2
+size 50682846
diff --git a/ice_text.model b/ice_text.model
new file mode 100644
index 0000000000000000000000000000000000000000..0dcfe31e02ad0767e0c80a469340bf97f58e777a
--- /dev/null
+++ b/ice_text.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e974d9a69c242ce014c88c2b26089270f6198f3c0b700a887666cd3e816f17e
+size 2706249
diff --git a/model_1/config.json b/model_1/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e1e88038f22e000d97c0ccc84802e76dfef343f6
--- /dev/null
+++ b/model_1/config.json
@@ -0,0 +1,31 @@
+{
+ "_name_or_path": "chatglm-6b",
+ "architectures": [
+ "ChatGLMForConditionalGeneration"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_chatglm.ChatGLMConfig",
+ "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
+ "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
+ },
+ "bos_token_id": 130004,
+ "eos_token_id": 130005,
+ "gmask_token_id": 130001,
+ "hidden_size": 4096,
+ "inner_hidden_size": 16384,
+ "layernorm_epsilon": 1e-05,
+ "mask_token_id": 130000,
+ "max_sequence_length": 2048,
+ "model_type": "chatglm",
+ "num_attention_heads": 32,
+ "num_layers": 28,
+ "pad_token_id": 3,
+ "position_encoding_2d": true,
+ "pre_seq_len": 128,
+ "prefix_projection": false,
+ "quantization_bit": 4,
+ "torch_dtype": "float16",
+ "transformers_version": "4.27.1",
+ "use_cache": true,
+ "vocab_size": 130528
+}
diff --git a/model_1/configuration_chatglm.py b/model_1/configuration_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..78f3425d5f63ad43f31b092b8d62b44d28d52f15
--- /dev/null
+++ b/model_1/configuration_chatglm.py
@@ -0,0 +1,103 @@
+""" ChatGLM model configuration """
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class ChatGLMConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`~ChatGLMModel`].
+    It is used to instantiate a ChatGLM model according to the specified arguments, defining the model
+ architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+ the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used
+ to control the model outputs. Read the documentation from [`PretrainedConfig`]
+ for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 150528):
+ Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`~ChatGLMModel`] or
+ [`~TFChatGLMModel`].
+ hidden_size (`int`, *optional*, defaults to 4096):
+ Dimension of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 28):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ inner_hidden_size (`int`, *optional*, defaults to 16384):
+ Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        max_sequence_length (`int`, *optional*, defaults to 2048):
+ The maximum sequence length that this model might ever be used with.
+ Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+ layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `False`):
+ Whether the model should return the last key/values attentions (not used by all models).
+ Example:
+
+ ```python
+ >>> from configuration_chatglm import ChatGLMConfig
+ >>> from modeling_chatglm import ChatGLMModel
+
+ >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration
+ >>> configuration = ChatGLMConfig()
+
+ >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration
+ >>> model = ChatGLMModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+"""
+ model_type = "chatglm"
+
+ def __init__(
+ self,
+ vocab_size=150528,
+ hidden_size=4096,
+ num_layers=28,
+ num_attention_heads=32,
+ layernorm_epsilon=1e-5,
+ use_cache=False,
+ bos_token_id=150004,
+ eos_token_id=150005,
+ mask_token_id=150000,
+ gmask_token_id=150001,
+ pad_token_id=0,
+ max_sequence_length=2048,
+ inner_hidden_size=16384,
+ position_encoding_2d=True,
+ quantization_bit=0,
+ pre_seq_len=None,
+ prefix_projection=False,
+ **kwargs
+ ):
+ self.num_layers = num_layers
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_attention_heads = num_attention_heads
+ self.max_sequence_length = max_sequence_length
+ self.layernorm_epsilon = layernorm_epsilon
+ self.inner_hidden_size = inner_hidden_size
+ self.use_cache = use_cache
+ self.bos_token_id = bos_token_id
+ self.eos_token_id = eos_token_id
+ self.pad_token_id = pad_token_id
+ self.mask_token_id = mask_token_id
+ self.gmask_token_id = gmask_token_id
+ self.position_encoding_2d = position_encoding_2d
+ self.quantization_bit = quantization_bit
+ self.pre_seq_len = pre_seq_len
+ self.prefix_projection = prefix_projection
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ **kwargs
+ )
diff --git a/model_1/generation_config.json b/model_1/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6191613b8cca2cd0d91cc92e90f2a353388ec3e
--- /dev/null
+++ b/model_1/generation_config.json
@@ -0,0 +1,7 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 130004,
+ "eos_token_id": 130005,
+ "pad_token_id": 3,
+ "transformers_version": "4.27.1"
+}
diff --git a/model_1/ice_text.model b/model_1/ice_text.model
new file mode 100644
index 0000000000000000000000000000000000000000..0dcfe31e02ad0767e0c80a469340bf97f58e777a
--- /dev/null
+++ b/model_1/ice_text.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e974d9a69c242ce014c88c2b26089270f6198f3c0b700a887666cd3e816f17e
+size 2706249
diff --git a/model_1/modeling_chatglm.py b/model_1/modeling_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bef958fb33db5f65827ad44b1370656bd8d2f1b
--- /dev/null
+++ b/model_1/modeling_chatglm.py
@@ -0,0 +1,1435 @@
+""" PyTorch ChatGLM model. """
+
+import math
+import copy
+import os
+import warnings
+import re
+import sys
+
+import torch
+import torch.utils.checkpoint
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import CrossEntropyLoss, LayerNorm
+from torch.nn.utils import skip_init
+from typing import Optional, Tuple, Union, List, Callable, Dict, Any
+
+from transformers.utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+)
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+ BaseModelOutputWithPastAndCrossAttentions,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
+
+from .configuration_chatglm import ChatGLMConfig
+
+# flags required to enable jit fusion kernels
+
+if sys.platform != 'darwin':
+ torch._C._jit_set_profiling_mode(False)
+ torch._C._jit_set_profiling_executor(False)
+ torch._C._jit_override_can_fuse_on_cpu(True)
+ torch._C._jit_override_can_fuse_on_gpu(True)
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM-6B"
+_CONFIG_FOR_DOC = "ChatGLM6BConfig"
+
+CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ "THUDM/chatglm-6b",
+ # See all ChatGLM-6B models at https://huggingface.co/models?filter=chatglm
+]
+
+
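+# Replaces rows of NaN/Inf logits with a distribution peaked on token id 5 (a large fixed
+# score) so that sampling can proceed instead of failing on invalid scores.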
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+ if torch.isnan(scores).any() or torch.isinf(scores).any():
+ scores.zero_()
+ scores[..., 5] = 5e4
+ return scores
+
+
+def load_tf_weights_in_chatglm_6b(model, config, tf_checkpoint_path):
+ """Load tf checkpoints in a pytorch model."""
+ try:
+ import re
+
+ import numpy as np
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array)
+
+ for name, array in zip(names, arrays):
+ name = name.split("/")
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+ # which are not required for using pretrained model
+ if any(
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+ for n in name
+ ):
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+ scope_names = re.split(r"_(\d+)", m_name)
+ else:
+ scope_names = [m_name]
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+ pointer = getattr(pointer, "bias")
+ elif scope_names[0] == "output_weights":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "squad":
+ pointer = getattr(pointer, "classifier")
+ else:
+ try:
+ pointer = getattr(pointer, scope_names[0])
+ except AttributeError:
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ if len(scope_names) >= 2:
+ num = int(scope_names[1])
+ pointer = pointer[num]
+ if m_name[-11:] == "_embeddings":
+ pointer = getattr(pointer, "weight")
+ elif m_name == "kernel":
+ array = np.transpose(array)
+ try:
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+ except AssertionError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ return model
+
+
+class PrefixEncoder(torch.nn.Module):
+ """
+ The torch.nn model to encode the prefix
+ Input shape: (batch-size, prefix-length)
+ Output shape: (batch-size, prefix-length, 2*layers*hidden)
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.prefix_projection = config.prefix_projection
+ if self.prefix_projection:
+ # Use a two-layer MLP to encode the prefix
+ self.embedding = torch.nn.Embedding(config.pre_seq_len, config.hidden_size)
+ self.trans = torch.nn.Sequential(
+ torch.nn.Linear(config.hidden_size, config.hidden_size),
+ torch.nn.Tanh(),
+ torch.nn.Linear(config.hidden_size, config.num_layers * config.hidden_size * 2)
+ )
+ else:
+ self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_layers * config.hidden_size * 2)
+
+ def forward(self, prefix: torch.Tensor):
+ if self.prefix_projection:
+ prefix_tokens = self.embedding(prefix)
+ past_key_values = self.trans(prefix_tokens)
+ else:
+ past_key_values = self.embedding(prefix)
+ return past_key_values
+
+
+@torch.jit.script
+def gelu_impl(x):
+ """OpenAI's gelu implementation."""
+ return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
+ (1.0 + 0.044715 * x * x)))
+
+
+def gelu(x):
+ return gelu_impl(x)
+
+
+class RotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, base=10000, precision=torch.half, learnable=False):
+ super().__init__()
+ inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
+ inv_freq = inv_freq.half()
+ self.learnable = learnable
+ if learnable:
+ self.inv_freq = torch.nn.Parameter(inv_freq)
+ self.max_seq_len_cached = None
+ else:
+ self.register_buffer('inv_freq', inv_freq)
+ self.max_seq_len_cached = None
+ self.cos_cached = None
+ self.sin_cached = None
+ self.precision = precision
+
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
+ error_msgs):
+ pass
+
+ def forward(self, x, seq_dim=1, seq_len=None):
+ if seq_len is None:
+ seq_len = x.shape[seq_dim]
+ if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
+ self.max_seq_len_cached = None if self.learnable else seq_len
+ t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+ if self.precision == torch.bfloat16:
+ emb = emb.float()
+
+ # [sx, 1 (b * np), hn]
+ cos_cached = emb.cos()[:, None, :]
+ sin_cached = emb.sin()[:, None, :]
+ if self.precision == torch.bfloat16:
+ cos_cached = cos_cached.bfloat16()
+ sin_cached = sin_cached.bfloat16()
+ if self.learnable:
+ return cos_cached, sin_cached
+ self.cos_cached, self.sin_cached = cos_cached, sin_cached
+ return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
+
+ def _apply(self, fn):
+ if self.cos_cached is not None:
+ self.cos_cached = fn(self.cos_cached)
+ if self.sin_cached is not None:
+ self.sin_cached = fn(self.sin_cached)
+ return super()._apply(fn)
+
+
+def rotate_half(x):
+ x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
+ return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in earlier torch versions
+
+
+@torch.jit.script
+def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):
+ # position_id: [sq, b], q, k: [sq, b, np, hn], cos: [sq, 1, hn] -> [sq, b, 1, hn]
+ cos, sin = F.embedding(position_id, cos.squeeze(1)).unsqueeze(2), \
+ F.embedding(position_id, sin.squeeze(1)).unsqueeze(2)
+ q, k = (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
+ return q, k
+
+
+def attention_fn(
+ self,
+ query_layer,
+ key_layer,
+ value_layer,
+ attention_mask,
+ hidden_size_per_partition,
+ layer_id,
+ layer_past=None,
+ scaling_attention_score=True,
+ use_cache=False,
+):
+ if layer_past is not None:
+ past_key, past_value = layer_past[0], layer_past[1]
+ key_layer = torch.cat((past_key, key_layer), dim=0)
+ value_layer = torch.cat((past_value, value_layer), dim=0)
+
+ # seqlen, batch, num_attention_heads, hidden_size_per_attention_head
+ seq_len, b, nh, hidden_size = key_layer.shape
+
+ if use_cache:
+ present = (key_layer, value_layer)
+ else:
+ present = None
+
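+    # The query is scaled by an extra 1/(layer_id + 1) factor here; the attention scores are
+    # multiplied back by the same factor (in fp32) before the softmax, keeping fp16 values small.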
+ query_key_layer_scaling_coeff = float(layer_id + 1)
+ if scaling_attention_score:
+ query_layer = query_layer / (math.sqrt(hidden_size) * query_key_layer_scaling_coeff)
+
+ # ===================================
+ # Raw attention scores. [b, np, s, s]
+ # ===================================
+
+ # [b, np, sq, sk]
+ output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
+
+ # [sq, b, np, hn] -> [sq, b * np, hn]
+ query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
+ # [sk, b, np, hn] -> [sk, b * np, hn]
+ key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+
+ matmul_result = torch.zeros(
+ 1, 1, 1,
+ dtype=query_layer.dtype,
+ device=query_layer.device,
+ )
+
+ matmul_result = torch.baddbmm(
+ matmul_result,
+ query_layer.transpose(0, 1), # [b * np, sq, hn]
+ key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
+ beta=0.0,
+ alpha=1.0,
+ )
+
+ # change view to [b, np, sq, sk]
+ attention_scores = matmul_result.view(*output_size)
+
+ if self.scale_mask_softmax:
+ self.scale_mask_softmax.scale = query_key_layer_scaling_coeff
+ attention_probs = self.scale_mask_softmax(attention_scores, attention_mask.contiguous())
+ else:
+ if not (attention_mask == 0).all():
+ # if auto-regressive, skip
+ attention_scores.masked_fill_(attention_mask, -10000.0)
+ dtype = attention_scores.dtype
+ attention_scores = attention_scores.float()
+ attention_scores = attention_scores * query_key_layer_scaling_coeff
+
+ attention_probs = F.softmax(attention_scores, dim=-1)
+
+ attention_probs = attention_probs.type(dtype)
+
+ # =========================
+ # Context layer. [sq, b, hp]
+ # =========================
+
+ # value_layer -> context layer.
+ # [sk, b, np, hn] --> [b, np, sq, hn]
+
+ # context layer shape: [b, np, sq, hn]
+ output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
+
+ # change view [sk, b * np, hn]
+ value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
+
+ # change view [b * np, sq, sk]
+ attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
+
+ # matmul: [b * np, sq, hn]
+ context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+
+ # change view [b, np, sq, hn]
+ context_layer = context_layer.view(*output_size)
+
+ # [b, np, sq, hn] --> [sq, b, np, hn]
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+
+ # [sq, b, np, hn] --> [sq, b, hp]
+ new_context_layer_shape = context_layer.size()[:-2] + (hidden_size_per_partition,)
+ context_layer = context_layer.view(*new_context_layer_shape)
+
+ outputs = (context_layer, present, attention_probs)
+
+ return outputs
+
+
+def default_init(cls, *args, **kwargs):
+ return cls(*args, **kwargs)
+
+
+class SelfAttention(torch.nn.Module):
+ def __init__(self, hidden_size, num_attention_heads,
+ layer_id, hidden_size_per_attention_head=None, bias=True,
+ params_dtype=torch.float, position_encoding_2d=True, empty_init=True):
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+ super(SelfAttention, self).__init__()
+
+ self.layer_id = layer_id
+ self.hidden_size = hidden_size
+ self.hidden_size_per_partition = hidden_size
+ self.num_attention_heads = num_attention_heads
+ self.num_attention_heads_per_partition = num_attention_heads
+ self.position_encoding_2d = position_encoding_2d
+ self.rotary_emb = RotaryEmbedding(
+ self.hidden_size // (self.num_attention_heads * 2)
+ if position_encoding_2d
+ else self.hidden_size // self.num_attention_heads,
+ base=10000,
+ precision=torch.half,
+ learnable=False,
+ )
+
+ self.scale_mask_softmax = None
+
+ if hidden_size_per_attention_head is None:
+ self.hidden_size_per_attention_head = hidden_size // num_attention_heads
+ else:
+ self.hidden_size_per_attention_head = hidden_size_per_attention_head
+
+ self.inner_hidden_size = num_attention_heads * self.hidden_size_per_attention_head
+
+ # Strided linear layer.
+ self.query_key_value = init_method(
+ torch.nn.Linear,
+ hidden_size,
+ 3 * self.inner_hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+
+ self.dense = init_method(
+ torch.nn.Linear,
+ self.inner_hidden_size,
+ hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+
+ @staticmethod
+ def attention_mask_func(attention_scores, attention_mask):
+ attention_scores.masked_fill_(attention_mask, -10000.0)
+ return attention_scores
+
+ def split_tensor_along_last_dim(self, tensor, num_partitions,
+ contiguous_split_chunks=False):
+ """Split a tensor along its last dimension.
+ Arguments:
+ tensor: input tensor.
+ num_partitions: number of partitions to split the tensor
+ contiguous_split_chunks: If True, make each chunk contiguous
+ in memory.
+ """
+ # Get the size and dimension.
+ last_dim = tensor.dim() - 1
+ last_dim_size = tensor.size()[last_dim] // num_partitions
+ # Split.
+ tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+ # Note: torch.split does not create contiguous tensors by default.
+ if contiguous_split_chunks:
+ return tuple(chunk.contiguous() for chunk in tensor_list)
+
+ return tensor_list
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids,
+ attention_mask: torch.Tensor,
+ layer_id,
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: bool = False,
+ output_attentions: bool = False,
+ ):
+ """
+ hidden_states: [seq_len, batch, hidden_size]
+ attention_mask: [(1, 1), seq_len, seq_len]
+ """
+
+ # [seq_len, batch, 3 * hidden_size]
+ mixed_raw_layer = self.query_key_value(hidden_states)
+
+ # [seq_len, batch, 3 * hidden_size] --> [seq_len, batch, num_attention_heads, 3 * hidden_size_per_attention_head]
+ new_tensor_shape = mixed_raw_layer.size()[:-1] + (
+ self.num_attention_heads_per_partition,
+ 3 * self.hidden_size_per_attention_head,
+ )
+ mixed_raw_layer = mixed_raw_layer.view(*new_tensor_shape)
+
+ # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head]
+ (query_layer, key_layer, value_layer) = self.split_tensor_along_last_dim(mixed_raw_layer, 3)
+
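+ # 2D rotary position encoding: each head dimension is split in half; the first
+ # half is rotated with the global position ids and the second half with the
+ # block position ids (positions inside the generated span).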
+ if self.position_encoding_2d:
+ q1, q2 = query_layer.chunk(2, dim=(query_layer.ndim - 1))
+ k1, k2 = key_layer.chunk(2, dim=(key_layer.ndim - 1))
+ cos, sin = self.rotary_emb(q1, seq_len=position_ids.max() + 1)
+ position_ids, block_position_ids = position_ids[:, 0, :].transpose(0, 1).contiguous(), \
+ position_ids[:, 1, :].transpose(0, 1).contiguous()
+ q1, k1 = apply_rotary_pos_emb_index(q1, k1, cos, sin, position_ids)
+ q2, k2 = apply_rotary_pos_emb_index(q2, k2, cos, sin, block_position_ids)
+ query_layer = torch.concat([q1, q2], dim=(q1.ndim - 1))
+ key_layer = torch.concat([k1, k2], dim=(k1.ndim - 1))
+ else:
+ position_ids = position_ids.transpose(0, 1)
+ cos, sin = self.rotary_emb(value_layer, seq_len=position_ids.max() + 1)
+ # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head]
+ query_layer, key_layer = apply_rotary_pos_emb_index(query_layer, key_layer, cos, sin, position_ids)
+
+ # [seq_len, batch, hidden_size]
+ context_layer, present, attention_probs = attention_fn(
+ self=self,
+ query_layer=query_layer,
+ key_layer=key_layer,
+ value_layer=value_layer,
+ attention_mask=attention_mask,
+ hidden_size_per_partition=self.hidden_size_per_partition,
+ layer_id=layer_id,
+ layer_past=layer_past,
+ use_cache=use_cache
+ )
+
+ output = self.dense(context_layer)
+
+ outputs = (output, present)
+
+ if output_attentions:
+ outputs += (attention_probs,)
+
+ return outputs # output, present, attention_probs
+
+
+class GEGLU(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.activation_fn = F.gelu
+
+ def forward(self, x):
+ # dim=-1 breaks in jit for pt<1.10
+ x1, x2 = x.chunk(2, dim=(x.ndim - 1))
+ return x1 * self.activation_fn(x2)
+
+
+class GLU(torch.nn.Module):
+ def __init__(self, hidden_size, inner_hidden_size=None,
+ layer_id=None, bias=True, activation_func=gelu, params_dtype=torch.float, empty_init=True):
+ super(GLU, self).__init__()
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+ self.layer_id = layer_id
+ self.activation_func = activation_func
+
+ # Project to 4h.
+ self.hidden_size = hidden_size
+ if inner_hidden_size is None:
+ inner_hidden_size = 4 * hidden_size
+ self.inner_hidden_size = inner_hidden_size
+ self.dense_h_to_4h = init_method(
+ torch.nn.Linear,
+ self.hidden_size,
+ self.inner_hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+ # Project back to h.
+ self.dense_4h_to_h = init_method(
+ torch.nn.Linear,
+ self.inner_hidden_size,
+ self.hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+
+ def forward(self, hidden_states):
+ """
+ hidden_states: [seq_len, batch, hidden_size]
+ """
+
+ # [seq_len, batch, inner_hidden_size]
+ intermediate_parallel = self.dense_h_to_4h(hidden_states)
+
+ intermediate_parallel = self.activation_func(intermediate_parallel)
+
+ output = self.dense_4h_to_h(intermediate_parallel)
+
+ return output
+
+
+class GLMBlock(torch.nn.Module):
+ def __init__(
+ self,
+ hidden_size,
+ num_attention_heads,
+ layernorm_epsilon,
+ layer_id,
+ inner_hidden_size=None,
+ hidden_size_per_attention_head=None,
+ layernorm=LayerNorm,
+ use_bias=True,
+ params_dtype=torch.float,
+ num_layers=28,
+ position_encoding_2d=True,
+ empty_init=True
+ ):
+ super(GLMBlock, self).__init__()
+ # Set output layer initialization if not provided.
+
+ self.layer_id = layer_id
+
+ # Layernorm on the input data.
+ self.input_layernorm = layernorm(hidden_size, eps=layernorm_epsilon)
+
+ self.position_encoding_2d = position_encoding_2d
+
+ # Self attention.
+ self.attention = SelfAttention(
+ hidden_size,
+ num_attention_heads,
+ layer_id,
+ hidden_size_per_attention_head=hidden_size_per_attention_head,
+ bias=use_bias,
+ params_dtype=params_dtype,
+ position_encoding_2d=self.position_encoding_2d,
+ empty_init=empty_init
+ )
+
+ # Layernorm on the attention output.
+ self.post_attention_layernorm = layernorm(hidden_size, eps=layernorm_epsilon)
+
+ self.num_layers = num_layers
+
+ # GLU
+ self.mlp = GLU(
+ hidden_size,
+ inner_hidden_size=inner_hidden_size,
+ bias=use_bias,
+ layer_id=layer_id,
+ params_dtype=params_dtype,
+ empty_init=empty_init
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids,
+ attention_mask: torch.Tensor,
+ layer_id,
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: bool = False,
+ output_attentions: bool = False,
+ ):
+ """
+ hidden_states: [seq_len, batch, hidden_size]
+ attention_mask: [(1, 1), seq_len, seq_len]
+ """
+
+ # Layer norm at the beginning of the transformer layer.
+ # [seq_len, batch, hidden_size]
+ attention_input = self.input_layernorm(hidden_states)
+
+ # Self attention.
+ attention_outputs = self.attention(
+ attention_input,
+ position_ids,
+ attention_mask=attention_mask,
+ layer_id=layer_id,
+ layer_past=layer_past,
+ use_cache=use_cache,
+ output_attentions=output_attentions
+ )
+
+ attention_output = attention_outputs[0]
+
+ outputs = attention_outputs[1:]
+
+ # Residual connection.
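+ # The layernorm output (not the raw input) is scaled by alpha = sqrt(2 * num_layers)
+ # before being added to the branch output; the same scaling is used for the MLP below.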
+ alpha = (2 * self.num_layers) ** 0.5
+ hidden_states = attention_input * alpha + attention_output
+
+ mlp_input = self.post_attention_layernorm(hidden_states)
+
+ # MLP.
+ mlp_output = self.mlp(mlp_input)
+
+ # Second residual connection.
+ output = mlp_input * alpha + mlp_output
+
+ if use_cache:
+ outputs = (output,) + outputs
+ else:
+ outputs = (output,) + outputs[1:]
+
+ return outputs # hidden_states, present, attentions
+
+
+class ChatGLMPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and
+ a simple interface for downloading and loading pretrained models.
+ """
+
+ is_parallelizable = False
+ supports_gradient_checkpointing = True
+ config_class = ChatGLMConfig
+ base_model_prefix = "transformer"
+ _no_split_modules = ["GLMBlock"]
+
+ def __init__(self, *inputs, **kwargs):
+ super().__init__(*inputs, **kwargs)
+
+ def _init_weights(self, module: nn.Module):
+ """Initialize the weights."""
+ return
+
+ def get_masks(self, input_ids, device):
+ batch_size, seq_length = input_ids.shape
+ context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
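+ # GLM attention pattern: tokens before the bos token (the prompt) are visible to
+ # every position, tokens after it are causal. After inversion below, True marks
+ # positions a query must not attend to.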
+ attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)
+ attention_mask.tril_()
+ for i, context_length in enumerate(context_lengths):
+ attention_mask[i, :, :context_length] = 1
+ attention_mask.unsqueeze_(1)
+ attention_mask = (attention_mask < 0.5).bool()
+
+ return attention_mask
+
+ def get_position_ids(self, input_ids, mask_positions, device, use_gmasks=None):
+ batch_size, seq_length = input_ids.shape
+ if use_gmasks is None:
+ use_gmasks = [False] * batch_size
+ context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
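+ # 2D position ids: channel 0 holds absolute positions, frozen at the [MASK]/[gMASK]
+ # position for generated tokens; channel 1 is 0 over the context and counts 1..N
+ # inside the generated block.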
+ if self.position_encoding_2d:
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
+ for i, context_length in enumerate(context_lengths):
+ position_ids[i, context_length:] = mask_positions[i]
+ block_position_ids = [torch.cat((
+ torch.zeros(context_length, dtype=torch.long, device=device),
+ torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1
+ )) for context_length in context_lengths]
+ block_position_ids = torch.stack(block_position_ids, dim=0)
+ position_ids = torch.stack((position_ids, block_position_ids), dim=1)
+ else:
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
+ for i, context_length in enumerate(context_lengths):
+ if not use_gmasks[i]:
+ position_ids[i, context_length:] = mask_positions[i]
+
+ return position_ids
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, ChatGLMModel):
+ module.gradient_checkpointing = value
+
+
+CHATGLM_6B_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
+ usage and behavior.
+
+ Parameters:
+ config ([`~ChatGLMConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the configuration.
+ Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CHATGLM_6B_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`ChatGLM6BTokenizer`].
+ See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings.
+ Selected in the range `[0, config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert *input_ids* indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare ChatGLM-6B Model transformer outputting raw hidden-states without any specific head on top.",
+ CHATGLM_6B_START_DOCSTRING,
+)
+class ChatGLMModel(ChatGLMPreTrainedModel):
+ """
+
+ The model can behave as an encoder (with only self-attention) as well
+ as a decoder, in which case a layer of cross-attention is added between
+ the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani,
+ Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+ To behave as a decoder the model needs to be initialized with the
+ `is_decoder` argument of the configuration set to `True`.
+ To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder`
+ argument and `add_cross_attention` set to `True`; an
+ `encoder_hidden_states` is then expected as an input to the forward pass.
+ """
+
+ def __init__(self, config: ChatGLMConfig, empty_init=True):
+ super().__init__(config)
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+ # recording parameters
+ self.max_sequence_length = config.max_sequence_length
+ self.hidden_size = config.hidden_size
+ self.params_dtype = torch.half
+ self.num_attention_heads = config.num_attention_heads
+ self.vocab_size = config.vocab_size
+ self.num_layers = config.num_layers
+ self.layernorm_epsilon = config.layernorm_epsilon
+ self.inner_hidden_size = config.inner_hidden_size
+ self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
+ self.position_encoding_2d = config.position_encoding_2d
+ self.pre_seq_len = config.pre_seq_len
+ self.prefix_projection = config.prefix_projection
+
+ self.word_embeddings = init_method(
+ torch.nn.Embedding,
+ num_embeddings=self.vocab_size, embedding_dim=self.hidden_size,
+ dtype=self.params_dtype
+ )
+ self.gradient_checkpointing = False
+
+ def get_layer(layer_id):
+ return GLMBlock(
+ self.hidden_size,
+ self.num_attention_heads,
+ self.layernorm_epsilon,
+ layer_id,
+ inner_hidden_size=self.inner_hidden_size,
+ hidden_size_per_attention_head=self.hidden_size_per_attention_head,
+ layernorm=LayerNorm,
+ use_bias=True,
+ params_dtype=self.params_dtype,
+ position_encoding_2d=self.position_encoding_2d,
+ empty_init=empty_init
+ )
+
+ self.layers = torch.nn.ModuleList(
+ [get_layer(layer_id) for layer_id in range(self.num_layers)]
+ )
+
+ # Final layer norm before output.
+ self.final_layernorm = LayerNorm(self.hidden_size, eps=self.layernorm_epsilon)
+
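+ # P-tuning v2: when pre_seq_len is set, the backbone is frozen and only the
+ # prefix encoder (virtual prompt tokens injected as past_key_values) is trained.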
+ if self.pre_seq_len is not None:
+ for param in self.parameters():
+ param.requires_grad = False
+ self.prefix_tokens = torch.arange(self.pre_seq_len).long()
+ self.prefix_encoder = PrefixEncoder(config)
+ self.dropout = torch.nn.Dropout(0.1)
+
+ # total_params = sum(p.numel() for p in self.parameters())
+ # trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
+ # print("Using p-tuning v2: # trainable_params = {} / {}".format(trainable_params, total_params))
+
+ def get_input_embeddings(self):
+ return self.word_embeddings
+
+ def set_input_embeddings(self, new_embeddings: torch.Tensor):
+ self.word_embeddings = new_embeddings
+
+ def get_prompt(self, batch_size, device, dtype=torch.half):
+ prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
+ past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
+ past_key_values = past_key_values.view(
+ batch_size,
+ self.pre_seq_len,
+ self.num_layers * 2,
+ self.num_attention_heads,
+ self.hidden_size // self.num_attention_heads
+ )
+ # [batch, pre_seq_len, 2 * num_layers, num_heads, head_dim]; permuted and split below into per-layer (key, value) pairs
+ past_key_values = self.dropout(past_key_values)
+ past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
+ # past_key_values = [(v[0], v[1]) for v in past_key_values]
+ return past_key_values
+
+ @add_start_docstrings_to_model_forward(CHATGLM_6B_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+ inputs_embeds: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape[:2]
+ elif inputs_embeds is not None:
+ batch_size, seq_length = inputs_embeds.shape[:2]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+
+ if past_key_values is None:
+ if self.pre_seq_len is not None:
+ past_key_values = self.get_prompt(batch_size=input_ids.shape[0], device=input_ids.device,
+ dtype=inputs_embeds.dtype)
+ else:
+ past_key_values = tuple([None] * len(self.layers))
+
+ if attention_mask is None:
+ attention_mask = self.get_masks(
+ input_ids,
+ device=input_ids.device
+ )
+
+ if position_ids is None:
+ MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
+ seqs = input_ids.tolist()
+
+ mask_positions, use_gmasks = [], []
+ for seq in seqs:
+ mask_token = gMASK if gMASK in seq else MASK
+ use_gmask = mask_token == gMASK
+ mask_positions.append(seq.index(mask_token))
+ use_gmasks.append(use_gmask)
+
+ position_ids = self.get_position_ids(
+ input_ids,
+ mask_positions=mask_positions,
+ device=input_ids.device,
+ use_gmasks=use_gmasks
+ )
+
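+ # Prepend an all-visible block (False = not masked) for the learned prefix tokens,
+ # so every query can attend to the p-tuning prefix.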
+ if self.pre_seq_len is not None and attention_mask is not None:
+ prefix_attention_mask = torch.ones(batch_size, 1, input_ids.size(-1), self.pre_seq_len).to(
+ attention_mask.device)
+ prefix_attention_mask = (prefix_attention_mask < 0.5).bool()
+ attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=3)
+
+ # [seq_len, batch, hidden_size]
+ hidden_states = inputs_embeds.transpose(0, 1)
+
+ presents = () if use_cache else None
+ all_self_attentions = () if output_attentions else None
+ all_hidden_states = () if output_hidden_states else None
+
+ if attention_mask is None:
+ attention_mask = torch.zeros(1, 1, device=input_ids.device).bool()
+ else:
+ attention_mask = attention_mask.to(hidden_states.device)
+
+ for i, layer in enumerate(self.layers):
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+ layer_past = past_key_values[i]
+
+ if self.gradient_checkpointing and self.training:
+ layer_ret = torch.utils.checkpoint.checkpoint(
+ layer,
+ hidden_states,
+ position_ids,
+ attention_mask,
+ torch.tensor(i),
+ layer_past,
+ use_cache,
+ output_attentions
+ )
+ else:
+ layer_ret = layer(
+ hidden_states,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ layer_id=torch.tensor(i),
+ layer_past=layer_past,
+ use_cache=use_cache,
+ output_attentions=output_attentions
+ )
+
+ hidden_states = layer_ret[0]
+
+ if use_cache:
+ presents = presents + (layer_ret[1],)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_ret[2 if use_cache else 1],)
+
+ # Final layer norm.
+ hidden_states = self.final_layernorm(hidden_states)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=presents,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
+ def __init__(self, config: ChatGLMConfig, empty_init=True):
+ super().__init__(config)
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+
+ # self.hidden_size = config.hidden_size
+ # self.params_dtype = torch.half
+ # self.vocab_size = config.vocab_size
+ self.max_sequence_length = config.max_sequence_length
+
+ self.position_encoding_2d = config.position_encoding_2d
+
+ self.transformer = ChatGLMModel(config, empty_init=empty_init)
+
+ self.lm_head = init_method(
+ nn.Linear,
+ config.hidden_size,
+ config.vocab_size,
+ bias=False,
+ dtype=torch.half
+ )
+
+ self.config = config
+
+ self.quantized = False
+
+ if self.config.quantization_bit:
+ self.quantize(self.config.quantization_bit, empty_init=True)
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def _update_model_kwargs_for_generation(
+ self,
+ outputs: ModelOutput,
+ model_kwargs: Dict[str, Any],
+ is_encoder_decoder: bool = False,
+ standardize_cache_format: bool = False,
+ ) -> Dict[str, Any]:
+ # update past_key_values
+ model_kwargs["past_key_values"] = self._extract_past_from_model_output(
+ outputs, standardize_cache_format=standardize_cache_format
+ )
+
+ # update attention mask
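+ # Extend the cached boolean mask by one key column and one query row; the new
+ # query row copies the previous one and marks the freshly generated token as
+ # visible (False = attend).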
+ if "attention_mask" in model_kwargs:
+ attention_mask = model_kwargs["attention_mask"]
+ if attention_mask is not None and attention_mask.dtype == torch.bool:
+ attention_mask = torch.cat(
+ [attention_mask, attention_mask.new_ones((*attention_mask.shape[:3], 1))], dim=3)
+ new_attention_mask = attention_mask[:, :, -1:].clone()
+ new_attention_mask[..., -1] = False
+ model_kwargs["attention_mask"] = torch.cat(
+ [attention_mask, new_attention_mask], dim=2
+ )
+
+ # update position ids
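+ # With 2D position encoding, only the block-position channel (index 1) advances
+ # during generation; the absolute-position channel stays at the mask position.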
+ if "position_ids" in model_kwargs:
+ position_ids = model_kwargs["position_ids"]
+ new_position_id = position_ids[..., -1:].clone()
+ new_position_id[:, 1, :] += 1
+ model_kwargs["position_ids"] = torch.cat(
+ [position_ids, new_position_id], dim=-1
+ )
+
+ return model_kwargs
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids: torch.LongTensor,
+ past: Optional[torch.Tensor] = None,
+ past_key_values: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ **kwargs
+ ) -> dict:
+ batch_size, seq_length = input_ids.shape
+ MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
+ seqs = input_ids.tolist()
+ mask_positions, use_gmasks = [], []
+ for seq in seqs:
+ mask_token = gMASK if gMASK in seq else MASK
+ use_gmask = mask_token == gMASK
+ mask_positions.append(seq.index(mask_token))
+ use_gmasks.append(use_gmask)
+
+ # only last token for input_ids if past is not None
+ if past is not None or past_key_values is not None:
+ last_token = input_ids[:, -1].unsqueeze(-1)
+ if attention_mask is not None and attention_mask.dtype == torch.bool:
+ attention_mask = attention_mask[:, :, -1:]
+ else:
+ attention_mask = None
+ if position_ids is not None:
+ position_ids = position_ids[..., -1:]
+ else:
+ context_lengths = [seq.index(self.config.bos_token_id) for seq in seqs]
+ if self.position_encoding_2d:
+ position_ids = torch.tensor(
+ [[mask_position, seq_length - context_length] for mask_position, context_length in
+ zip(mask_positions, context_lengths)], dtype=torch.long, device=input_ids.device).unsqueeze(-1)
+ else:
+ position_ids = torch.tensor([mask_position for mask_position in mask_positions], dtype=torch.long,
+ device=input_ids.device).unsqueeze(-1)
+
+ if past is None:
+ past = past_key_values
+ return {
+ "input_ids": last_token,
+ "past_key_values": past,
+ "position_ids": position_ids,
+ "attention_mask": attention_mask
+ }
+ else:
+ if attention_mask is not None and attention_mask.dtype != torch.bool:
+ logger.warning_once(f"The dtype of attention mask ({attention_mask.dtype}) is not bool")
+ attention_mask = None
+ if attention_mask is None:
+ attention_mask = self.get_masks(
+ input_ids,
+ device=input_ids.device
+ )
+ if position_ids is None:
+ position_ids = self.get_position_ids(
+ input_ids,
+ device=input_ids.device,
+ mask_positions=mask_positions,
+ use_gmasks=use_gmasks
+ )
+
+ return {
+ "input_ids": input_ids,
+ "past_key_values": past,
+ "position_ids": position_ids,
+ "attention_mask": attention_mask
+ }
+
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = transformer_outputs[0]
+
+ lm_logits = self.lm_head(hidden_states).permute(1, 0, 2).contiguous()
+
+ loss = None
+ if labels is not None:
+ lm_logits = lm_logits.to(torch.float32)
+
+ # Shift so that tokens < n predict n
+ shift_logits = lm_logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss(ignore_index=-100)
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+ lm_logits = lm_logits.to(hidden_states.dtype)
+ loss = loss.to(hidden_states.dtype)
+
+ if not return_dict:
+ output = (lm_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=lm_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+ @staticmethod
+ def _reorder_cache(
+ past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
+ ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
+ """
+ This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
+ [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
+ beam_idx at every generation step.
+
+ Output shares the same memory storage as `past`.
+ """
+ return tuple(
+ (
+ layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
+ layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
+ )
+ for layer_past in past
+ )
+
+ def process_response(self, response):
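+ # Post-process the decoded reply: substitute the training-time date placeholder
+ # and convert ASCII punctuation to its full-width form when it borders CJK text.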
+ response = response.strip()
+ response = response.replace("[[训练时间]]", "2023年")
+ punkts = [
+ [",", ","],
+ ["!", "!"],
+ [":", ":"],
+ [";", ";"],
+ ["\?", "?"],
+ ]
+ for item in punkts:
+ response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
+ response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
+ return response
+
+ @torch.no_grad()
+ def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, num_beams=1,
+ do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
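+ """
+ Single-turn chat helper around `generate`: builds the multi-round prompt
+ ("[Round i]\n问:...\n答:...") from `history`, generates a reply, decodes only the
+ newly produced tokens and returns `(response, updated_history)`.
+ """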
+ if history is None:
+ history = []
+ if logits_processor is None:
+ logits_processor = LogitsProcessorList()
+ logits_processor.append(InvalidScoreLogitsProcessor())
+ gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
+ "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+ if not history:
+ prompt = query
+ else:
+ prompt = ""
+ for i, (old_query, response) in enumerate(history):
+ prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+ prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+ inputs = tokenizer([prompt], return_tensors="pt")
+ inputs = inputs.to(self.device)
+ outputs = self.generate(**inputs, **gen_kwargs)
+ outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+ response = tokenizer.decode(outputs)
+ response = self.process_response(response)
+ history = history + [(query, response)]
+ return response, history
+
+ @torch.no_grad()
+ def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048,
+ do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
+ if history is None:
+ history = []
+ if logits_processor is None:
+ logits_processor = LogitsProcessorList()
+ logits_processor.append(InvalidScoreLogitsProcessor())
+ gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
+ "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+ if not history:
+ prompt = query
+ else:
+ prompt = ""
+ for i, (old_query, response) in enumerate(history):
+ prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+ prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+ inputs = tokenizer([prompt], return_tensors="pt")
+ inputs = inputs.to(self.device)
+ for outputs in self.stream_generate(**inputs, **gen_kwargs):
+ outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+ response = tokenizer.decode(outputs)
+ response = self.process_response(response)
+ new_history = history + [(query, response)]
+ yield response, new_history
+
+ @torch.no_grad()
+ def stream_generate(
+ self,
+ input_ids,
+ generation_config: Optional[GenerationConfig] = None,
+ logits_processor: Optional[LogitsProcessorList] = None,
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
+ prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+ **kwargs,
+ ):
+ batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
+
+ if generation_config is None:
+ generation_config = self.generation_config
+ generation_config = copy.deepcopy(generation_config)
+ model_kwargs = generation_config.update(**kwargs)
+ bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
+
+ if isinstance(eos_token_id, int):
+ eos_token_id = [eos_token_id]
+
+ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+ if has_default_max_length and generation_config.max_new_tokens is None:
+ warnings.warn(
+ f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
+ "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
+ " recommend using `max_new_tokens` to control the maximum length of the generation.",
+ UserWarning,
+ )
+ elif generation_config.max_new_tokens is not None:
+ generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
+ if not has_default_max_length:
+ logger.warning(
+ f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+ f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+ "Please refer to the documentation for more information. "
+ "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+ )
+
+ if input_ids_seq_length >= generation_config.max_length:
+ input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+ logger.warning(
+ f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
+ f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+ " increasing `max_new_tokens`."
+ )
+
+ # 2. Set generation parameters if not already defined
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+ logits_processor = self._get_logits_processor(
+ generation_config=generation_config,
+ input_ids_seq_length=input_ids_seq_length,
+ encoder_input_ids=input_ids,
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+ logits_processor=logits_processor,
+ )
+
+ stopping_criteria = self._get_stopping_criteria(
+ generation_config=generation_config, stopping_criteria=stopping_criteria
+ )
+ logits_warper = self._get_logits_warper(generation_config)
+
+ unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+ scores = None
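+ # Sampling / greedy decode loop: after every step the full sequence generated so
+ # far is yielded, which lets callers stream partial responses.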
+ while True:
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+ # forward pass to get next token
+ outputs = self(
+ **model_inputs,
+ return_dict=True,
+ output_attentions=False,
+ output_hidden_states=False,
+ )
+
+ next_token_logits = outputs.logits[:, -1, :]
+
+ # pre-process distribution
+ next_token_scores = logits_processor(input_ids, next_token_logits)
+ next_token_scores = logits_warper(input_ids, next_token_scores)
+
+ # sample
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
+ if generation_config.do_sample:
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+ else:
+ next_tokens = torch.argmax(probs, dim=-1)
+
+ # update generated ids, model inputs, and length for next step
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+ model_kwargs = self._update_model_kwargs_for_generation(
+ outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+ )
+ unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
+
+ # stop when each sentence is finished, or if we exceed the maximum length
+ if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+ break
+ yield input_ids
+
+ def quantize(self, bits: int, empty_init=False, **kwargs):
+ if bits == 0:
+ return
+
+ from .quantization import quantize
+
+ if self.quantized:
+ logger.info("Already quantized.")
+ return self
+
+ self.quantized = True
+
+ self.config.quantization_bit = bits
+
+ self.transformer = quantize(self.transformer, bits, empty_init=empty_init, **kwargs)
+ return self
diff --git a/model_1/pytorch_model.bin b/model_1/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8353095005e5283f039c0f1d8ab9402a82bbd556
--- /dev/null
+++ b/model_1/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7454b5402bc3c7eea4ed1a3eaf32e7e7a835afc834c67188a387fb00994bb35
+size 117441341
diff --git a/model_1/quantization.py b/model_1/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f469f6a25a8233fe881608168daeba0bc809540
--- /dev/null
+++ b/model_1/quantization.py
@@ -0,0 +1,201 @@
+from torch.nn import Linear
+from torch.nn.parameter import Parameter
+
+import bz2
+import torch
+import base64
+import ctypes
+from transformers.utils import logging
+
+from typing import List
+from functools import partial
+
+logger = logging.get_logger(__name__)
+
+try:
+ from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up
+
+ class Kernel:
+ def __init__(self, code: bytes, function_names: List[str]):
+ self.code = code
+ self._function_names = function_names
+ self._cmodule = LazyKernelCModule(self.code)
+
+ for name in self._function_names:
+ setattr(self, name, KernelFunction(self._cmodule, name))
+
+ quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+t
e+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzfZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyH
pix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ"
+
+ kernels = Kernel(
+ bz2.decompress(base64.b64decode(quantization_code)),
+ [
+ "int4WeightCompression",
+ "int4WeightExtractionFloat",
+ "int4WeightExtractionHalf",
+ "int8WeightExtractionFloat",
+ "int8WeightExtractionHalf",
+ ],
+ )
+except Exception as exception:
+ kernels = None
+ logger.warning("Failed to load cpm_kernels:" + str(exception))
+
+
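+ # Weight-only quantization: int8/int4 weights are dequantized to fp16 on the fly
+ # in both forward and backward, so activations and gradients stay in half precision.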
+class W8A16Linear(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
+ ctx.inp_shape = inp.size()
+ ctx.weight_bit_width = weight_bit_width
+ out_features = quant_w.size(0)
+ inp = inp.contiguous().view(-1, inp.size(-1))
+ weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
+ ctx.weight_shape = weight.size()
+ output = inp.mm(weight.t())
+ ctx.save_for_backward(inp, quant_w, scale_w)
+ return output.view(*(ctx.inp_shape[:-1] + (out_features,)))
+
+ @staticmethod
+ def backward(ctx, grad_output: torch.Tensor):
+ inp, quant_w, scale_w = ctx.saved_tensors
+ weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
+ grad_output = grad_output.contiguous().view(-1, weight.size(0))
+ grad_input = grad_output.mm(weight)
+ grad_weight = grad_output.t().mm(inp)
+ return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None
+
+
+def compress_int4_weight(weight: torch.Tensor): # (n, m)
+ with torch.cuda.device(weight.device):
+ n, m = weight.size(0), weight.size(1)
+ assert m % 2 == 0
+ m = m // 2
+ out = torch.empty(n, m, dtype=torch.int8, device="cuda")
+ stream = torch.cuda.current_stream()
+
+ gridDim = (n, 1, 1)
+ blockDim = (min(round_up(m, 32), 1024), 1, 1)
+
+ kernels.int4WeightCompression(
+ gridDim,
+ blockDim,
+ 0,
+ stream,
+ [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
+ )
+ return out
+
+
+def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
+ if source_bit_width == 8:
+ func = kernels.int8WeightExtractionHalf
+ elif source_bit_width == 4:
+ func = kernels.int4WeightExtractionHalf
+ else:
+ assert False, "Unsupported bit-width"
+
+ with torch.cuda.device(weight.device):
+ n, m = weight.size(0), weight.size(1)
+ out = torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda")
+ stream = torch.cuda.current_stream()
+
+ gridDim = (n, 1, 1)
+ blockDim = (min(round_up(m, 32), 1024), 1, 1)
+
+ func(
+ gridDim,
+ blockDim,
+ 0,
+ stream,
+ [
+ ctypes.c_void_p(weight.data_ptr()),
+ ctypes.c_void_p(scale_list.data_ptr()),
+ ctypes.c_void_p(out.data_ptr()),
+ ctypes.c_int32(n),
+ ctypes.c_int32(m),
+ ],
+ )
+ return out
+
+
+class QuantizedLinear(Linear):
+ def __init__(self, weight_bit_width: int, weight_tensor=None, bias_tensor=None, empty_init=False, *args, **kwargs):
+ super(QuantizedLinear, self).__init__(*args, **kwargs)
+ self.weight_bit_width = weight_bit_width
+
+ shape = self.weight.shape
+ del self.weight
+
+ if weight_tensor is None or empty_init:
+ self.weight = torch.empty(
+ shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"]
+ )
+ self.weight_scale = torch.empty(shape[0], dtype=kwargs["dtype"], device=kwargs["device"])
+ else:
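+ # Per-output-channel absmax quantization: scale = max|w| / (2^(bits-1) - 1), weights
+ # are rounded to int8 and, for 4-bit, packed two values per byte.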
+ self.weight_scale = (weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
+ self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
+ if weight_bit_width == 4:
+ self.weight = compress_int4_weight(self.weight)
+
+ self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
+ self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False)
+ if bias_tensor is not None:
+ self.bias = Parameter(bias_tensor.to(kwargs["device"]), requires_grad=False)
+ else:
+ self.bias = None
+
+ def forward(self, input):
+ output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
+ if self.bias is not None:
+ output = output + self.bias
+ return output
+
+
+def quantize(model, weight_bit_width, empty_init=False, **kwargs):
+ """Replace fp16 linear with quantized linear"""
+
+ for layer in model.layers:
+ layer.attention.query_key_value = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.attention.query_key_value.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.attention.query_key_value.bias,
+ in_features=layer.attention.query_key_value.in_features,
+ out_features=layer.attention.query_key_value.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.attention.query_key_value.weight.device,
+ empty_init=empty_init
+ )
+ layer.attention.dense = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.attention.dense.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.attention.dense.bias,
+ in_features=layer.attention.dense.in_features,
+ out_features=layer.attention.dense.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.attention.dense.weight.device,
+ empty_init=empty_init
+ )
+ layer.mlp.dense_h_to_4h = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.mlp.dense_h_to_4h.bias,
+ in_features=layer.mlp.dense_h_to_4h.in_features,
+ out_features=layer.mlp.dense_h_to_4h.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.mlp.dense_h_to_4h.weight.device,
+ empty_init=empty_init
+ )
+ layer.mlp.dense_4h_to_h = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.mlp.dense_4h_to_h.bias,
+ in_features=layer.mlp.dense_4h_to_h.in_features,
+ out_features=layer.mlp.dense_4h_to_h.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.mlp.dense_4h_to_h.weight.device,
+ empty_init=empty_init
+ )
+ return model
diff --git a/model_1/rng_state.pth b/model_1/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..15d1f84440568c08a8a2591eb68a45fcc586607d
--- /dev/null
+++ b/model_1/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23863c8e7c35e271159f50a9500a14ac30ef072497ccfeddbd8ca0bc52d2fbb7
+size 14575
diff --git a/model_1/scheduler.pt b/model_1/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4e75eaff981543536e1a75045acfd7d21048c6a4
--- /dev/null
+++ b/model_1/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12f5a9ce3eab00e925e71149d3ef546598192dd352c87b5a13107f4b58027b27
+size 627
diff --git a/model_1/special_tokens_map.json b/model_1/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f897c919b758e64c56eb1a7b34b39b569040086
--- /dev/null
+++ b/model_1/special_tokens_map.json
@@ -0,0 +1,7 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "mask_token": "[MASK]",
+ "pad_token": "",
+ "unk_token": ""
+}
diff --git a/model_1/tokenization_chatglm.py b/model_1/tokenization_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4f0ba532543b6dbdacdd83d30324b7a6abfad3
--- /dev/null
+++ b/model_1/tokenization_chatglm.py
@@ -0,0 +1,430 @@
+"""Tokenization classes for ChatGLM."""
+from typing import List, Optional, Union
+import os
+
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+from typing import Dict
+import sentencepiece as spm
+import numpy as np
+
+logger = logging.get_logger(__name__)
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "THUDM/chatglm-6b": 2048,
+}
+
+
+class TextTokenizer:
+ def __init__(self, model_path):
+ self.sp = spm.SentencePieceProcessor()
+ self.sp.Load(model_path)
+ self.num_tokens = self.sp.vocab_size()
+
+ def encode(self, text):
+ return self.sp.EncodeAsIds(text)
+
+ def decode(self, ids: List[int]):
+ return self.sp.DecodeIds(ids)
+
+ def tokenize(self, text):
+ return self.sp.EncodeAsPieces(text)
+
+ def convert_tokens_to_ids(self, tokens):
+ return [self.sp.PieceToId(token) for token in tokens]
+
+ def convert_token_to_id(self, token):
+ return self.sp.PieceToId(token)
+
+ def convert_id_to_token(self, idx):
+ return self.sp.IdToPiece(idx)
+
+ def __len__(self):
+ return self.num_tokens
+
+
+class SPTokenizer:
+ def __init__(
+ self,
+ vocab_file,
+ num_image_tokens=20000,
+ max_blank_length=80,
+ byte_fallback=True,
+ ):
+ assert vocab_file is not None
+ self.vocab_file = vocab_file
+ self.num_image_tokens = num_image_tokens
+ self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "", "", "", "", ""]
+ self.max_blank_length = max_blank_length
+ self.byte_fallback = byte_fallback
+ self.text_tokenizer = TextTokenizer(vocab_file)
+
+ def _get_text_tokenizer(self):
+ return self.text_tokenizer
+
+ @staticmethod
+ def get_blank_token(length: int):
+ assert length >= 2
+ return f"<|blank_{length}|>"
+
+ @staticmethod
+ def get_tab_token():
+ return f"<|tab|>"
+
+ @property
+ def num_text_tokens(self):
+ return self.text_tokenizer.num_tokens
+
+ @property
+ def num_tokens(self):
+ return self.num_image_tokens + self.num_text_tokens
+
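+ # Tabs and runs of 2..max_len spaces are mapped to dedicated tokens (<|tab|>,
+ # <|blank_N|>) so that indentation-heavy text such as source code tokenizes compactly.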
+ @staticmethod
+ def _encode_whitespaces(text: str, max_len: int = 80):
+ text = text.replace("\t", SPTokenizer.get_tab_token())
+ for i in range(max_len, 1, -1):
+ text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
+ return text
+
+ def _preprocess(self, text: str, linebreak=True, whitespaces=True):
+ if linebreak:
+ text = text.replace("\n", "")
+ if whitespaces:
+ text = self._encode_whitespaces(text, max_len=self.max_blank_length)
+ return text
+
+ def encode(
+ self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+ ) -> List[int]:
+ """
+ @param text: Text to encode.
+ @param linebreak: Whether to encode newline (\n) in text.
+ @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+ @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
+ @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
+ """
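+ # Rough usage sketch, assuming `sp = SPTokenizer("ice_text.model")` with the
+ # vocab file present: sp.encode("hello world") returns sentencepiece ids
+ # shifted up by num_image_tokens.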
+ text = self._preprocess(text, linebreak, whitespaces)
+ if not add_dummy_prefix:
+ text = "<n>" + text
+ tmp = self._get_text_tokenizer().encode(text)
+ tokens = [x + self.num_image_tokens for x in tmp]
+ return tokens if add_dummy_prefix else tokens[2:]
+
+ def decode(self, text_ids: List[int]) -> str:
+ ids = [int(_id) - self.num_image_tokens for _id in text_ids]
+ ids = [_id for _id in ids if _id >= 0]
+ text = self._get_text_tokenizer().decode(ids)
+ text = text.replace("<n>", "\n")
+ text = text.replace(SPTokenizer.get_tab_token(), "\t")
+ for i in range(2, self.max_blank_length + 1):
+ text = text.replace(self.get_blank_token(i), " " * i)
+ return text
+
+ def tokenize(
+ self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+ ) -> List[str]:
+ """
+ @param text: Text to encode.
+ @param linebreak: Whether to encode newline (\n) in text.
+ @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+ @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
+ """
+ text = self._preprocess(text, linebreak, whitespaces)
+ if not add_dummy_prefix:
+ text = "<n>" + text
+ tokens = self._get_text_tokenizer().tokenize(text)
+ return tokens if add_dummy_prefix else tokens[2:]
+
+ def __getitem__(self, x: Union[int, str]):
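+ # Maps between the combined vocabulary and tokens: ids below num_image_tokens
+ # are "<image_{id}>" placeholders, the remaining ids index into the text vocab
+ # shifted by num_image_tokens; strings are converted in the opposite direction.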
+ if isinstance(x, int):
+ if x < self.num_image_tokens:
+ return "<image_{}>".format(x)
+ else:
+ return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
+ elif isinstance(x, str):
+ if x.startswith("<image_") and x[7:-1].isdigit():
+ return int(x[7:-1])
+ else:
+ return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
+ else:
+ raise ValueError("The key should be str or int.")
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+ """
+ Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ """
+
+ vocab_files_names = {"vocab_file": "ice_text.model"}
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+ def __init__(
+ self,
+ vocab_file,
+ do_lower_case=False,
+ remove_space=False,
+ bos_token='<sop>',
+ eos_token='<eop>',
+ end_token='</s>',
+ mask_token='[MASK]',
+ gmask_token='[gMASK]',
+ padding_side="left",
+ pad_token="<pad>",
+ unk_token="<unk>",
+ num_image_tokens=20000,
+ **kwargs
+ ) -> None:
+ super().__init__(
+ do_lower_case=do_lower_case,
+ remove_space=remove_space,
+ padding_side=padding_side,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ end_token=end_token,
+ mask_token=mask_token,
+ gmask_token=gmask_token,
+ pad_token=pad_token,
+ unk_token=unk_token,
+ num_image_tokens=num_image_tokens,
+ **kwargs
+ )
+
+ self.do_lower_case = do_lower_case
+ self.remove_space = remove_space
+ self.vocab_file = vocab_file
+
+ self.bos_token = bos_token
+ self.eos_token = eos_token
+ self.end_token = end_token
+ self.mask_token = mask_token
+ self.gmask_token = gmask_token
+
+ self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
+
+ """ Initialisation """
+
+ @property
+ def gmask_token_id(self) -> Optional[int]:
+ if self.gmask_token is None:
+ return None
+ return self.convert_tokens_to_ids(self.gmask_token)
+
+ @property
+ def end_token_id(self) -> Optional[int]:
+ """
+ `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
+ set.
+ """
+ if self.end_token is None:
+ return None
+ return self.convert_tokens_to_ids(self.end_token)
+
+ @property
+ def vocab_size(self):
+ """ Returns vocab size """
+ return self.sp_tokenizer.num_tokens
+
+ def get_vocab(self):
+ """ Returns vocab as a dict """
+ vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def preprocess_text(self, inputs):
+ if self.remove_space:
+ outputs = " ".join(inputs.strip().split())
+ else:
+ outputs = inputs
+
+ if self.do_lower_case:
+ outputs = outputs.lower()
+
+ return outputs
+
+ def _tokenize(self, text, **kwargs):
+ """ Returns a tokenized string. """
+ text = self.preprocess_text(text)
+
+ seq = self.sp_tokenizer.tokenize(text)
+
+ return seq
+
+ def _decode(
+ self,
+ token_ids: Union[int, List[int]],
+ skip_special_tokens: bool = False,
+ clean_up_tokenization_spaces: bool = True,
+ **kwargs
+ ) -> str:
+ if isinstance(token_ids, int):
+ token_ids = [token_ids]
+ if len(token_ids) == 0:
+ return ""
+ if self.pad_token_id in token_ids: # remove pad
+ token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
+ return self.sp_tokenizer.decode(token_ids)
+
+ def _convert_token_to_id(self, token):
+ """ Converts a token (str) in an id using the vocab. """
+ return self.sp_tokenizer[token]
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.sp_tokenizer[index]
+
+ def save_vocabulary(self, save_directory, filename_prefix=None):
+ """
+ Save the vocabulary and special tokens file to a directory.
+
+ Args:
+ save_directory (`str`):
+ The directory in which to save the vocabulary.
+ filename_prefix (`str`, *optional*):
+ An optional prefix to add to the names of the saved files.
+
+ Returns:
+ `Tuple(str)`: Paths to the files saved.
+ """
+ if os.path.isdir(save_directory):
+ vocab_file = os.path.join(
+ save_directory, self.vocab_files_names["vocab_file"]
+ )
+ else:
+ vocab_file = save_directory
+
+ with open(self.vocab_file, 'rb') as fin:
+ proto_str = fin.read()
+
+ with open(vocab_file, "wb") as writer:
+ writer.write(proto_str)
+
+ return (vocab_file,)
+
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Build model inputs from a sequence or a pair of sequences by concatenating and
+ adding special tokens. A ChatGLM sequence has the following format:
+
+ - single sequence: `X [gMASK] <sop>`
+ - pair of sequences: `A [gMASK] <sop> B <eop>`
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ gmask_id = self.sp_tokenizer[self.gmask_token]
+ eos_id = self.sp_tokenizer[self.eos_token]
+ token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
+ if token_ids_1 is not None:
+ token_ids_0 = token_ids_0 + token_ids_1 + [eos_id]
+ return token_ids_0
+
+ def _pad(
+ self,
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+ max_length: Optional[int] = None,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ pad_to_multiple_of: Optional[int] = None,
+ return_attention_mask: Optional[bool] = None,
+ ) -> dict:
+ """
+ Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+ Args:
+ encoded_inputs:
+ Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+ max_length: maximum length of the returned list and optionally padding length (see below).
+ Will truncate by taking into account the special tokens.
+ padding_strategy: PaddingStrategy to use for padding.
+
+ - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+ - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+ - PaddingStrategy.DO_NOT_PAD: Do not pad
+ The tokenizer padding sides are defined in self.padding_side:
+
+ - 'left': pads on the left of the sequences
+ - 'right': pads on the right of the sequences
+ pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+ `>= 7.5` (Volta).
+ return_attention_mask:
+ (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+ """
+ # Load from model defaults
+ bos_token_id = self.sp_tokenizer[self.bos_token]
+ mask_token_id = self.sp_tokenizer[self.mask_token]
+ gmask_token_id = self.sp_tokenizer[self.gmask_token]
+ assert self.padding_side == "left"
+
+ required_input = encoded_inputs[self.model_input_names[0]]
+ seq_length = len(required_input)
+
+ if padding_strategy == PaddingStrategy.LONGEST:
+ max_length = len(required_input)
+
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+ needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+ # Initialize attention mask if not present.
+ if max_length is not None:
+ if "attention_mask" not in encoded_inputs:
+ if bos_token_id in required_input:
+ context_length = required_input.index(bos_token_id)
+ else:
+ context_length = seq_length
+ attention_mask = np.ones((1, seq_length, seq_length))
+ attention_mask = np.tril(attention_mask)
+ attention_mask[:, :, :context_length] = 1
+ attention_mask = np.bool_(attention_mask < 0.5)
+ encoded_inputs["attention_mask"] = attention_mask
+
+ if "position_ids" not in encoded_inputs:
+ if bos_token_id in required_input:
+ context_length = required_input.index(bos_token_id)
+ else:
+ context_length = seq_length
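+ # 2D position encoding: row 0 carries absolute positions (fixed at the
+ # [MASK]/[gMASK] position after the context), row 1 carries block positions
+ # counting up from 1 for the generated part.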
+ position_ids = np.arange(seq_length, dtype=np.int64)
+ mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
+ if mask_token in required_input:
+ mask_position = required_input.index(mask_token)
+ position_ids[context_length:] = mask_position
+ block_position_ids = np.concatenate(
+ [np.zeros(context_length, dtype=np.int64),
+ np.arange(1, seq_length - context_length + 1, dtype=np.int64)])
+ encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
+
+ if needs_to_be_padded:
+ difference = max_length - len(required_input)
+
+ if "attention_mask" in encoded_inputs:
+ encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+ pad_width=[(0, 0), (difference, 0), (difference, 0)],
+ mode='constant', constant_values=True)
+ if "token_type_ids" in encoded_inputs:
+ encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+ "token_type_ids"
+ ]
+ if "special_tokens_mask" in encoded_inputs:
+ encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+ if "position_ids" in encoded_inputs:
+ encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+ pad_width=[(0, 0), (difference, 0)])
+ encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+
+ return encoded_inputs
diff --git a/model_1/tokenizer_config.json b/model_1/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3f8e1c935cc40c270ff6ac75c05b4208533688a
--- /dev/null
+++ b/model_1/tokenizer_config.json
@@ -0,0 +1,22 @@
+{
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_chatglm.ChatGLMTokenizer",
+ null
+ ]
+ },
+ "bos_token": "<sop>",
+ "do_lower_case": false,
+ "end_token": "</s>",
+ "eos_token": "<eop>",
+ "gmask_token": "[gMASK]",
+ "mask_token": "[MASK]",
+ "model_max_length": 1000000000000000019884624838656,
+ "num_image_tokens": 0,
+ "pad_token": "<pad>",
+ "padding_side": "left",
+ "remove_space": false,
+ "special_tokens_map_file": null,
+ "tokenizer_class": "ChatGLMTokenizer",
+ "unk_token": "<unk>"
+}
diff --git a/model_1/trainer_state.json b/model_1/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..728a7496ef1820821871133b77ed9778084df408
--- /dev/null
+++ b/model_1/trainer_state.json
@@ -0,0 +1,136 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 45.714285714285715,
+ "global_step": 200,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 2.29,
+ "learning_rate": 0.019933333333333334,
+ "loss": 1.6174,
+ "step": 10
+ },
+ {
+ "epoch": 4.57,
+ "learning_rate": 0.019866666666666668,
+ "loss": 0.3005,
+ "step": 20
+ },
+ {
+ "epoch": 6.86,
+ "learning_rate": 0.0198,
+ "loss": 0.0988,
+ "step": 30
+ },
+ {
+ "epoch": 9.14,
+ "learning_rate": 0.019733333333333335,
+ "loss": 0.0462,
+ "step": 40
+ },
+ {
+ "epoch": 11.43,
+ "learning_rate": 0.019666666666666666,
+ "loss": 0.0193,
+ "step": 50
+ },
+ {
+ "epoch": 13.71,
+ "learning_rate": 0.0196,
+ "loss": 0.0096,
+ "step": 60
+ },
+ {
+ "epoch": 16.0,
+ "learning_rate": 0.019533333333333333,
+ "loss": 0.0052,
+ "step": 70
+ },
+ {
+ "epoch": 18.29,
+ "learning_rate": 0.019466666666666667,
+ "loss": 0.0037,
+ "step": 80
+ },
+ {
+ "epoch": 20.57,
+ "learning_rate": 0.0194,
+ "loss": 0.0028,
+ "step": 90
+ },
+ {
+ "epoch": 22.86,
+ "learning_rate": 0.019333333333333334,
+ "loss": 0.0019,
+ "step": 100
+ },
+ {
+ "epoch": 25.14,
+ "learning_rate": 0.019266666666666668,
+ "loss": 0.0029,
+ "step": 110
+ },
+ {
+ "epoch": 27.43,
+ "learning_rate": 0.0192,
+ "loss": 0.002,
+ "step": 120
+ },
+ {
+ "epoch": 29.71,
+ "learning_rate": 0.019133333333333332,
+ "loss": 0.0016,
+ "step": 130
+ },
+ {
+ "epoch": 32.0,
+ "learning_rate": 0.01906666666666667,
+ "loss": 0.002,
+ "step": 140
+ },
+ {
+ "epoch": 34.29,
+ "learning_rate": 0.019,
+ "loss": 0.0008,
+ "step": 150
+ },
+ {
+ "epoch": 36.57,
+ "learning_rate": 0.018933333333333333,
+ "loss": 0.0008,
+ "step": 160
+ },
+ {
+ "epoch": 38.86,
+ "learning_rate": 0.018866666666666667,
+ "loss": 0.0007,
+ "step": 170
+ },
+ {
+ "epoch": 41.14,
+ "learning_rate": 0.0188,
+ "loss": 0.0006,
+ "step": 180
+ },
+ {
+ "epoch": 43.43,
+ "learning_rate": 0.018733333333333334,
+ "loss": 0.0005,
+ "step": 190
+ },
+ {
+ "epoch": 45.71,
+ "learning_rate": 0.018666666666666668,
+ "loss": 0.0005,
+ "step": 200
+ }
+ ],
+ "max_steps": 3000,
+ "num_train_epochs": 750,
+ "total_flos": 1.05080467488768e+16,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/model_1/training_args.bin b/model_1/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c7dfebf081440a9a6595dc13249bc141f78c0154
--- /dev/null
+++ b/model_1/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:777a01801032451e89d19d85ab197e2c8a9843464e14bbbc3856d5c6a049f1a6
+size 3707
diff --git a/model_2/config.json b/model_2/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e1e88038f22e000d97c0ccc84802e76dfef343f6
--- /dev/null
+++ b/model_2/config.json
@@ -0,0 +1,31 @@
+{
+ "_name_or_path": "chatglm-6b",
+ "architectures": [
+ "ChatGLMForConditionalGeneration"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_chatglm.ChatGLMConfig",
+ "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
+ "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
+ },
+ "bos_token_id": 130004,
+ "eos_token_id": 130005,
+ "gmask_token_id": 130001,
+ "hidden_size": 4096,
+ "inner_hidden_size": 16384,
+ "layernorm_epsilon": 1e-05,
+ "mask_token_id": 130000,
+ "max_sequence_length": 2048,
+ "model_type": "chatglm",
+ "num_attention_heads": 32,
+ "num_layers": 28,
+ "pad_token_id": 3,
+ "position_encoding_2d": true,
+ "pre_seq_len": 128,
+ "prefix_projection": false,
+ "quantization_bit": 4,
+ "torch_dtype": "float16",
+ "transformers_version": "4.27.1",
+ "use_cache": true,
+ "vocab_size": 130528
+}
diff --git a/model_2/configuration_chatglm.py b/model_2/configuration_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..78f3425d5f63ad43f31b092b8d62b44d28d52f15
--- /dev/null
+++ b/model_2/configuration_chatglm.py
@@ -0,0 +1,103 @@
+""" ChatGLM model configuration """
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class ChatGLMConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`~ChatGLMModel`].
+ It is used to instantiate a ChatGLM model according to the specified arguments, defining the model
+ architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+ the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used
+ to control the model outputs. Read the documentation from [`PretrainedConfig`]
+ for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 150528):
+ Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`~ChatGLMModel`] or
+ [`~TFChatGLMModel`].
+ hidden_size (`int`, *optional*, defaults to 4096):
+ Dimension of the encoder layers and the pooler layer.
+ num_layers (`int`, *optional*, defaults to 28):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ inner_hidden_size (`int`, *optional*, defaults to 16384):
+ Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ max_sequence_length (`int`, *optional*, defaults to 2048):
+ The maximum sequence length that this model might ever be used with.
+ Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+ layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ use_cache (`bool`, *optional*, defaults to `False`):
+ Whether the model should return the last key/values attentions (not used by all models).
+ Example:
+
+ ```python
+ >>> from configuration_chatglm import ChatGLMConfig
+ >>> from modeling_chatglm import ChatGLMModel
+
+ >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration
+ >>> configuration = ChatGLMConfig()
+
+ >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration
+ >>> model = ChatGLMModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+"""
+ model_type = "chatglm"
+
+ def __init__(
+ self,
+ vocab_size=150528,
+ hidden_size=4096,
+ num_layers=28,
+ num_attention_heads=32,
+ layernorm_epsilon=1e-5,
+ use_cache=False,
+ bos_token_id=150004,
+ eos_token_id=150005,
+ mask_token_id=150000,
+ gmask_token_id=150001,
+ pad_token_id=0,
+ max_sequence_length=2048,
+ inner_hidden_size=16384,
+ position_encoding_2d=True,
+ quantization_bit=0,
+ pre_seq_len=None,
+ prefix_projection=False,
+ **kwargs
+ ):
+ self.num_layers = num_layers
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_attention_heads = num_attention_heads
+ self.max_sequence_length = max_sequence_length
+ self.layernorm_epsilon = layernorm_epsilon
+ self.inner_hidden_size = inner_hidden_size
+ self.use_cache = use_cache
+ self.bos_token_id = bos_token_id
+ self.eos_token_id = eos_token_id
+ self.pad_token_id = pad_token_id
+ self.mask_token_id = mask_token_id
+ self.gmask_token_id = gmask_token_id
+ self.position_encoding_2d = position_encoding_2d
+ self.quantization_bit = quantization_bit
+ self.pre_seq_len = pre_seq_len
+ self.prefix_projection = prefix_projection
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ **kwargs
+ )
diff --git a/model_2/generation_config.json b/model_2/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6191613b8cca2cd0d91cc92e90f2a353388ec3e
--- /dev/null
+++ b/model_2/generation_config.json
@@ -0,0 +1,7 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 130004,
+ "eos_token_id": 130005,
+ "pad_token_id": 3,
+ "transformers_version": "4.27.1"
+}
diff --git a/model_2/ice_text.model b/model_2/ice_text.model
new file mode 100644
index 0000000000000000000000000000000000000000..0dcfe31e02ad0767e0c80a469340bf97f58e777a
--- /dev/null
+++ b/model_2/ice_text.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e974d9a69c242ce014c88c2b26089270f6198f3c0b700a887666cd3e816f17e
+size 2706249
diff --git a/model_2/modeling_chatglm.py b/model_2/modeling_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bef958fb33db5f65827ad44b1370656bd8d2f1b
--- /dev/null
+++ b/model_2/modeling_chatglm.py
@@ -0,0 +1,1435 @@
+""" PyTorch ChatGLM model. """
+
+import math
+import copy
+import os
+import warnings
+import re
+import sys
+
+import torch
+import torch.utils.checkpoint
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import CrossEntropyLoss, LayerNorm
+from torch.nn.utils import skip_init
+from typing import Optional, Tuple, Union, List, Callable, Dict, Any
+
+from transformers.utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+)
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+ BaseModelOutputWithPastAndCrossAttentions,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
+
+from .configuration_chatglm import ChatGLMConfig
+
+# flags required to enable jit fusion kernels
+
+if sys.platform != 'darwin':
+ torch._C._jit_set_profiling_mode(False)
+ torch._C._jit_set_profiling_executor(False)
+ torch._C._jit_override_can_fuse_on_cpu(True)
+ torch._C._jit_override_can_fuse_on_gpu(True)
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM-6B"
+_CONFIG_FOR_DOC = "ChatGLM6BConfig"
+
+CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ "THUDM/chatglm-6b",
+ # See all ChatGLM-6B models at https://huggingface.co/models?filter=chatglm
+]
+
+
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
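+ # If fp16 overflow produced NaN/Inf logits, reset all scores and put the
+ # probability mass on a fixed token id so sampling cannot crash.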
+ if torch.isnan(scores).any() or torch.isinf(scores).any():
+ scores.zero_()
+ scores[..., 5] = 5e4
+ return scores
+
+
+def load_tf_weights_in_chatglm_6b(model, config, tf_checkpoint_path):
+ """Load tf checkpoints in a pytorch model."""
+ try:
+ import re
+
+ import numpy as np
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array)
+
+ for name, array in zip(names, arrays):
+ name = name.split("/")
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+ # which are not required for using pretrained model
+ if any(
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+ for n in name
+ ):
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+ scope_names = re.split(r"_(\d+)", m_name)
+ else:
+ scope_names = [m_name]
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+ pointer = getattr(pointer, "bias")
+ elif scope_names[0] == "output_weights":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "squad":
+ pointer = getattr(pointer, "classifier")
+ else:
+ try:
+ pointer = getattr(pointer, scope_names[0])
+ except AttributeError:
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ if len(scope_names) >= 2:
+ num = int(scope_names[1])
+ pointer = pointer[num]
+ if m_name[-11:] == "_embeddings":
+ pointer = getattr(pointer, "weight")
+ elif m_name == "kernel":
+ array = np.transpose(array)
+ try:
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+ except AssertionError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ return model
+
+
+class PrefixEncoder(torch.nn.Module):
+ """
+ The torch.nn model to encode the prefix
+ Input shape: (batch-size, prefix-length)
+ Output shape: (batch-size, prefix-length, 2*layers*hidden)
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.prefix_projection = config.prefix_projection
+ if self.prefix_projection:
+ # Use a two-layer MLP to encode the prefix
+ self.embedding = torch.nn.Embedding(config.pre_seq_len, config.hidden_size)
+ self.trans = torch.nn.Sequential(
+ torch.nn.Linear(config.hidden_size, config.hidden_size),
+ torch.nn.Tanh(),
+ torch.nn.Linear(config.hidden_size, config.num_layers * config.hidden_size * 2)
+ )
+ else:
+ self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_layers * config.hidden_size * 2)
+
+ def forward(self, prefix: torch.Tensor):
+ if self.prefix_projection:
+ prefix_tokens = self.embedding(prefix)
+ past_key_values = self.trans(prefix_tokens)
+ else:
+ past_key_values = self.embedding(prefix)
+ return past_key_values
+
+
+@torch.jit.script
+def gelu_impl(x):
+ """OpenAI's gelu implementation."""
+ return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
+ (1.0 + 0.044715 * x * x)))
+
+
+def gelu(x):
+ return gelu_impl(x)
+
+
+class RotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, base=10000, precision=torch.half, learnable=False):
+ super().__init__()
+ inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
+ inv_freq = inv_freq.half()
+ self.learnable = learnable
+ if learnable:
+ self.inv_freq = torch.nn.Parameter(inv_freq)
+ self.max_seq_len_cached = None
+ else:
+ self.register_buffer('inv_freq', inv_freq)
+ self.max_seq_len_cached = None
+ self.cos_cached = None
+ self.sin_cached = None
+ self.precision = precision
+
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
+ error_msgs):
+ pass
+
+ def forward(self, x, seq_dim=1, seq_len=None):
+ if seq_len is None:
+ seq_len = x.shape[seq_dim]
+ if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
+ self.max_seq_len_cached = None if self.learnable else seq_len
+ t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+ if self.precision == torch.bfloat16:
+ emb = emb.float()
+
+ # [sx, 1 (b * np), hn]
+ cos_cached = emb.cos()[:, None, :]
+ sin_cached = emb.sin()[:, None, :]
+ if self.precision == torch.bfloat16:
+ cos_cached = cos_cached.bfloat16()
+ sin_cached = sin_cached.bfloat16()
+ if self.learnable:
+ return cos_cached, sin_cached
+ self.cos_cached, self.sin_cached = cos_cached, sin_cached
+ return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
+
+ def _apply(self, fn):
+ if self.cos_cached is not None:
+ self.cos_cached = fn(self.cos_cached)
+ if self.sin_cached is not None:
+ self.sin_cached = fn(self.sin_cached)
+ return super()._apply(fn)
+
+
+def rotate_half(x):
+ x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
+ return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in earlier torch versions
+
+
+@torch.jit.script
+def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):
+ # position_id: [sq, b], q, k: [sq, b, np, hn], cos: [sq, 1, hn] -> [sq, b, 1, hn]
+ cos, sin = F.embedding(position_id, cos.squeeze(1)).unsqueeze(2), \
+ F.embedding(position_id, sin.squeeze(1)).unsqueeze(2)
+ q, k = (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
+ return q, k
+
+
+def attention_fn(
+ self,
+ query_layer,
+ key_layer,
+ value_layer,
+ attention_mask,
+ hidden_size_per_partition,
+ layer_id,
+ layer_past=None,
+ scaling_attention_score=True,
+ use_cache=False,
+):
+ if layer_past is not None:
+ past_key, past_value = layer_past[0], layer_past[1]
+ key_layer = torch.cat((past_key, key_layer), dim=0)
+ value_layer = torch.cat((past_value, value_layer), dim=0)
+
+ # seqlen, batch, num_attention_heads, hidden_size_per_attention_head
+ seq_len, b, nh, hidden_size = key_layer.shape
+
+ if use_cache:
+ present = (key_layer, value_layer)
+ else:
+ present = None
+
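+ # Per-layer scaling: queries are additionally divided by (layer_id + 1) here
+ # and the scores are multiplied back before the softmax, which keeps the fp16
+ # attention scores in a numerically safe range.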
+ query_key_layer_scaling_coeff = float(layer_id + 1)
+ if scaling_attention_score:
+ query_layer = query_layer / (math.sqrt(hidden_size) * query_key_layer_scaling_coeff)
+
+ # ===================================
+ # Raw attention scores. [b, np, s, s]
+ # ===================================
+
+ # [b, np, sq, sk]
+ output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
+
+ # [sq, b, np, hn] -> [sq, b * np, hn]
+ query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
+ # [sk, b, np, hn] -> [sk, b * np, hn]
+ key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+
+ matmul_result = torch.zeros(
+ 1, 1, 1,
+ dtype=query_layer.dtype,
+ device=query_layer.device,
+ )
+
+ matmul_result = torch.baddbmm(
+ matmul_result,
+ query_layer.transpose(0, 1), # [b * np, sq, hn]
+ key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
+ beta=0.0,
+ alpha=1.0,
+ )
+
+ # change view to [b, np, sq, sk]
+ attention_scores = matmul_result.view(*output_size)
+
+ if self.scale_mask_softmax:
+ self.scale_mask_softmax.scale = query_key_layer_scaling_coeff
+ attention_probs = self.scale_mask_softmax(attention_scores, attention_mask.contiguous())
+ else:
+ if not (attention_mask == 0).all():
+ # if auto-regressive, skip
+ attention_scores.masked_fill_(attention_mask, -10000.0)
+ dtype = attention_scores.dtype
+ attention_scores = attention_scores.float()
+ attention_scores = attention_scores * query_key_layer_scaling_coeff
+
+ attention_probs = F.softmax(attention_scores, dim=-1)
+
+ attention_probs = attention_probs.type(dtype)
+
+ # =========================
+ # Context layer. [sq, b, hp]
+ # =========================
+
+ # value_layer -> context layer.
+ # [sk, b, np, hn] --> [b, np, sq, hn]
+
+ # context layer shape: [b, np, sq, hn]
+ output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
+
+ # change view [sk, b * np, hn]
+ value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
+
+ # change view [b * np, sq, sk]
+ attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
+
+ # matmul: [b * np, sq, hn]
+ context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+
+ # change view [b, np, sq, hn]
+ context_layer = context_layer.view(*output_size)
+
+ # [b, np, sq, hn] --> [sq, b, np, hn]
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+
+ # [sq, b, np, hn] --> [sq, b, hp]
+ new_context_layer_shape = context_layer.size()[:-2] + (hidden_size_per_partition,)
+ context_layer = context_layer.view(*new_context_layer_shape)
+
+ outputs = (context_layer, present, attention_probs)
+
+ return outputs
+
+
+def default_init(cls, *args, **kwargs):
+ return cls(*args, **kwargs)
+
+
+class SelfAttention(torch.nn.Module):
+ def __init__(self, hidden_size, num_attention_heads,
+ layer_id, hidden_size_per_attention_head=None, bias=True,
+ params_dtype=torch.float, position_encoding_2d=True, empty_init=True):
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+ super(SelfAttention, self).__init__()
+
+ self.layer_id = layer_id
+ self.hidden_size = hidden_size
+ self.hidden_size_per_partition = hidden_size
+ self.num_attention_heads = num_attention_heads
+ self.num_attention_heads_per_partition = num_attention_heads
+ self.position_encoding_2d = position_encoding_2d
+ self.rotary_emb = RotaryEmbedding(
+ self.hidden_size // (self.num_attention_heads * 2)
+ if position_encoding_2d
+ else self.hidden_size // self.num_attention_heads,
+ base=10000,
+ precision=torch.half,
+ learnable=False,
+ )
+
+ self.scale_mask_softmax = None
+
+ if hidden_size_per_attention_head is None:
+ self.hidden_size_per_attention_head = hidden_size // num_attention_heads
+ else:
+ self.hidden_size_per_attention_head = hidden_size_per_attention_head
+
+ self.inner_hidden_size = num_attention_heads * self.hidden_size_per_attention_head
+
+ # Strided linear layer.
+ self.query_key_value = init_method(
+ torch.nn.Linear,
+ hidden_size,
+ 3 * self.inner_hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+
+ self.dense = init_method(
+ torch.nn.Linear,
+ self.inner_hidden_size,
+ hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+
+ @staticmethod
+ def attention_mask_func(attention_scores, attention_mask):
+ attention_scores.masked_fill_(attention_mask, -10000.0)
+ return attention_scores
+
+ def split_tensor_along_last_dim(self, tensor, num_partitions,
+ contiguous_split_chunks=False):
+ """Split a tensor along its last dimension.
+ Arguments:
+ tensor: input tensor.
+ num_partitions: number of partitions to split the tensor
+ contiguous_split_chunks: If True, make each chunk contiguous
+ in memory.
+ """
+ # Get the size and dimension.
+ last_dim = tensor.dim() - 1
+ last_dim_size = tensor.size()[last_dim] // num_partitions
+ # Split.
+ tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+ # Note: torch.split does not create contiguous tensors by default.
+ if contiguous_split_chunks:
+ return tuple(chunk.contiguous() for chunk in tensor_list)
+
+ return tensor_list
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids,
+ attention_mask: torch.Tensor,
+ layer_id,
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: bool = False,
+ output_attentions: bool = False,
+ ):
+ """
+ hidden_states: [seq_len, batch, hidden_size]
+ attention_mask: [(1, 1), seq_len, seq_len]
+ """
+
+ # [seq_len, batch, 3 * hidden_size]
+ mixed_raw_layer = self.query_key_value(hidden_states)
+
+ # [seq_len, batch, 3 * hidden_size] --> [seq_len, batch, num_attention_heads, 3 * hidden_size_per_attention_head]
+ new_tensor_shape = mixed_raw_layer.size()[:-1] + (
+ self.num_attention_heads_per_partition,
+ 3 * self.hidden_size_per_attention_head,
+ )
+ mixed_raw_layer = mixed_raw_layer.view(*new_tensor_shape)
+
+ # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head]
+ (query_layer, key_layer, value_layer) = self.split_tensor_along_last_dim(mixed_raw_layer, 3)
+
+ if self.position_encoding_2d:
+ q1, q2 = query_layer.chunk(2, dim=(query_layer.ndim - 1))
+ k1, k2 = key_layer.chunk(2, dim=(key_layer.ndim - 1))
+ cos, sin = self.rotary_emb(q1, seq_len=position_ids.max() + 1)
+ position_ids, block_position_ids = position_ids[:, 0, :].transpose(0, 1).contiguous(), \
+ position_ids[:, 1, :].transpose(0, 1).contiguous()
+ q1, k1 = apply_rotary_pos_emb_index(q1, k1, cos, sin, position_ids)
+ q2, k2 = apply_rotary_pos_emb_index(q2, k2, cos, sin, block_position_ids)
+ query_layer = torch.concat([q1, q2], dim=(q1.ndim - 1))
+ key_layer = torch.concat([k1, k2], dim=(k1.ndim - 1))
+ else:
+ position_ids = position_ids.transpose(0, 1)
+ cos, sin = self.rotary_emb(value_layer, seq_len=position_ids.max() + 1)
+ # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head]
+ query_layer, key_layer = apply_rotary_pos_emb_index(query_layer, key_layer, cos, sin, position_ids)
+
+ # [seq_len, batch, hidden_size]
+ context_layer, present, attention_probs = attention_fn(
+ self=self,
+ query_layer=query_layer,
+ key_layer=key_layer,
+ value_layer=value_layer,
+ attention_mask=attention_mask,
+ hidden_size_per_partition=self.hidden_size_per_partition,
+ layer_id=layer_id,
+ layer_past=layer_past,
+ use_cache=use_cache
+ )
+
+ output = self.dense(context_layer)
+
+ outputs = (output, present)
+
+ if output_attentions:
+ outputs += (attention_probs,)
+
+ return outputs # output, present, attention_probs
+
+
+class GEGLU(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.activation_fn = F.gelu
+
+ def forward(self, x):
+ # dim=-1 breaks in jit for pt<1.10
+ x1, x2 = x.chunk(2, dim=(x.ndim - 1))
+ return x1 * self.activation_fn(x2)
+
+
+class GLU(torch.nn.Module):
+ def __init__(self, hidden_size, inner_hidden_size=None,
+ layer_id=None, bias=True, activation_func=gelu, params_dtype=torch.float, empty_init=True):
+ super(GLU, self).__init__()
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+ self.layer_id = layer_id
+ self.activation_func = activation_func
+
+ # Project to 4h.
+ self.hidden_size = hidden_size
+ if inner_hidden_size is None:
+ inner_hidden_size = 4 * hidden_size
+ self.inner_hidden_size = inner_hidden_size
+ self.dense_h_to_4h = init_method(
+ torch.nn.Linear,
+ self.hidden_size,
+ self.inner_hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+ # Project back to h.
+ self.dense_4h_to_h = init_method(
+ torch.nn.Linear,
+ self.inner_hidden_size,
+ self.hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+
+ def forward(self, hidden_states):
+ """
+ hidden_states: [seq_len, batch, hidden_size]
+ """
+
+ # [seq_len, batch, inner_hidden_size]
+ intermediate_parallel = self.dense_h_to_4h(hidden_states)
+
+ intermediate_parallel = self.activation_func(intermediate_parallel)
+
+ output = self.dense_4h_to_h(intermediate_parallel)
+
+ return output
+
+
+class GLMBlock(torch.nn.Module):
+ def __init__(
+ self,
+ hidden_size,
+ num_attention_heads,
+ layernorm_epsilon,
+ layer_id,
+ inner_hidden_size=None,
+ hidden_size_per_attention_head=None,
+ layernorm=LayerNorm,
+ use_bias=True,
+ params_dtype=torch.float,
+ num_layers=28,
+ position_encoding_2d=True,
+ empty_init=True
+ ):
+ super(GLMBlock, self).__init__()
+ # Set output layer initialization if not provided.
+
+ self.layer_id = layer_id
+
+ # Layernorm on the input data.
+ self.input_layernorm = layernorm(hidden_size, eps=layernorm_epsilon)
+
+ self.position_encoding_2d = position_encoding_2d
+
+ # Self attention.
+ self.attention = SelfAttention(
+ hidden_size,
+ num_attention_heads,
+ layer_id,
+ hidden_size_per_attention_head=hidden_size_per_attention_head,
+ bias=use_bias,
+ params_dtype=params_dtype,
+ position_encoding_2d=self.position_encoding_2d,
+ empty_init=empty_init
+ )
+
+ # Layernorm on the input data.
+ self.post_attention_layernorm = layernorm(hidden_size, eps=layernorm_epsilon)
+
+ self.num_layers = num_layers
+
+ # GLU
+ self.mlp = GLU(
+ hidden_size,
+ inner_hidden_size=inner_hidden_size,
+ bias=use_bias,
+ layer_id=layer_id,
+ params_dtype=params_dtype,
+ empty_init=empty_init
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids,
+ attention_mask: torch.Tensor,
+ layer_id,
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: bool = False,
+ output_attentions: bool = False,
+ ):
+ """
+ hidden_states: [seq_len, batch, hidden_size]
+ attention_mask: [(1, 1), seq_len, seq_len]
+ """
+
+ # Layer norm at the beginning of the transformer layer.
+ # [seq_len, batch, hidden_size]
+ attention_input = self.input_layernorm(hidden_states)
+
+ # Self attention.
+ attention_outputs = self.attention(
+ attention_input,
+ position_ids,
+ attention_mask=attention_mask,
+ layer_id=layer_id,
+ layer_past=layer_past,
+ use_cache=use_cache,
+ output_attentions=output_attentions
+ )
+
+ attention_output = attention_outputs[0]
+
+ outputs = attention_outputs[1:]
+
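+ # DeepNorm-style residual: the sub-layer *input* is scaled by
+ # alpha = sqrt(2 * num_layers) before adding the sub-layer output; the same
+ # pattern is applied to the MLP branch below.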
+ # Residual connection.
+ alpha = (2 * self.num_layers) ** 0.5
+ hidden_states = attention_input * alpha + attention_output
+
+ mlp_input = self.post_attention_layernorm(hidden_states)
+
+ # MLP.
+ mlp_output = self.mlp(mlp_input)
+
+ # Second residual connection.
+ output = mlp_input * alpha + mlp_output
+
+ if use_cache:
+ outputs = (output,) + outputs
+ else:
+ outputs = (output,) + outputs[1:]
+
+ return outputs # hidden_states, present, attentions
+
+
+class ChatGLMPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and
+ a simple interface for downloading and loading pretrained models.
+ """
+
+ is_parallelizable = False
+ supports_gradient_checkpointing = True
+ config_class = ChatGLMConfig
+ base_model_prefix = "transformer"
+ _no_split_modules = ["GLMBlock"]
+
+ def __init__(self, *inputs, **kwargs):
+ super().__init__(*inputs, **kwargs)
+
+ def _init_weights(self, module: nn.Module):
+ """Initialize the weights."""
+ return
+
+ def get_masks(self, input_ids, device):
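+ # Context tokens (everything before the bos/<sop> token) attend bidirectionally,
+ # the generated part is causal; True marks positions that must not be attended to.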
+ batch_size, seq_length = input_ids.shape
+ context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
+ attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)
+ attention_mask.tril_()
+ for i, context_length in enumerate(context_lengths):
+ attention_mask[i, :, :context_length] = 1
+ attention_mask.unsqueeze_(1)
+ attention_mask = (attention_mask < 0.5).bool()
+
+ return attention_mask
+
+ def get_position_ids(self, input_ids, mask_positions, device, use_gmasks=None):
+ batch_size, seq_length = input_ids.shape
+ if use_gmasks is None:
+ use_gmasks = [False] * batch_size
+ context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
+ if self.position_encoding_2d:
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
+ for i, context_length in enumerate(context_lengths):
+ position_ids[i, context_length:] = mask_positions[i]
+ block_position_ids = [torch.cat((
+ torch.zeros(context_length, dtype=torch.long, device=device),
+ torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1
+ )) for context_length in context_lengths]
+ block_position_ids = torch.stack(block_position_ids, dim=0)
+ position_ids = torch.stack((position_ids, block_position_ids), dim=1)
+ else:
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
+ for i, context_length in enumerate(context_lengths):
+ if not use_gmasks[i]:
+ position_ids[i, context_length:] = mask_positions[i]
+
+ return position_ids
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, ChatGLMModel):
+ module.gradient_checkpointing = value
+
+
+CHATGLM_6B_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
+ usage and behavior.
+
+ Parameters:
+ config ([`~ChatGLM6BConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the configuration.
+ Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CHATGLM_6B_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`ChatGLM6BTokenizer`].
+ See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings.
+ Selected in the range `[0, config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert *input_ids* indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare ChatGLM-6B Model transformer outputting raw hidden-states without any specific head on top.",
+ CHATGLM_6B_START_DOCSTRING,
+)
+class ChatGLMModel(ChatGLMPreTrainedModel):
+ """
+
+ The model can behave as an encoder (with only self-attention) as well
+ as a decoder, in which case a layer of cross-attention is added between
+ the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani,
+ Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+ To behave as a decoder the model needs to be initialized with the
+ `is_decoder` argument of the configuration set to `True`.
+ To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder`
+ argument and `add_cross_attention` set to `True`; an
+ `encoder_hidden_states` is then expected as an input to the forward pass.
+ """
+
+ def __init__(self, config: ChatGLMConfig, empty_init=True):
+ super().__init__(config)
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+ # recording parameters
+ self.max_sequence_length = config.max_sequence_length
+ self.hidden_size = config.hidden_size
+ self.params_dtype = torch.half
+ self.num_attention_heads = config.num_attention_heads
+ self.vocab_size = config.vocab_size
+ self.num_layers = config.num_layers
+ self.layernorm_epsilon = config.layernorm_epsilon
+ self.inner_hidden_size = config.inner_hidden_size
+ self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
+ self.position_encoding_2d = config.position_encoding_2d
+ self.pre_seq_len = config.pre_seq_len
+ self.prefix_projection = config.prefix_projection
+
+ self.word_embeddings = init_method(
+ torch.nn.Embedding,
+ num_embeddings=self.vocab_size, embedding_dim=self.hidden_size,
+ dtype=self.params_dtype
+ )
+ self.gradient_checkpointing = False
+
+ def get_layer(layer_id):
+ return GLMBlock(
+ self.hidden_size,
+ self.num_attention_heads,
+ self.layernorm_epsilon,
+ layer_id,
+ inner_hidden_size=self.inner_hidden_size,
+ hidden_size_per_attention_head=self.hidden_size_per_attention_head,
+ layernorm=LayerNorm,
+ use_bias=True,
+ params_dtype=self.params_dtype,
+ position_encoding_2d=self.position_encoding_2d,
+ empty_init=empty_init
+ )
+
+ self.layers = torch.nn.ModuleList(
+ [get_layer(layer_id) for layer_id in range(self.num_layers)]
+ )
+
+ # Final layer norm before output.
+ self.final_layernorm = LayerNorm(self.hidden_size, eps=self.layernorm_epsilon)
+
+ if self.pre_seq_len is not None:
+ for param in self.parameters():
+ param.requires_grad = False
+ self.prefix_tokens = torch.arange(self.pre_seq_len).long()
+ self.prefix_encoder = PrefixEncoder(config)
+ self.dropout = torch.nn.Dropout(0.1)
+
+ # total_params = sum(p.numel() for p in self.parameters())
+ # trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
+ # print("Using p-tuning v2: # trainable_params = {} / {}".format(trainable_params, total_params))
+
+ def get_input_embeddings(self):
+ return self.word_embeddings
+
+ def set_input_embeddings(self, new_embeddings: torch.Tensor):
+ self.word_embeddings = new_embeddings
+
+ def get_prompt(self, batch_size, device, dtype=torch.half):
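+ # P-Tuning v2: learned prefix token embeddings are expanded into per-layer
+ # key/value tensors and injected as past_key_values for every block.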
+ prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
+ past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
+ past_key_values = past_key_values.view(
+ batch_size,
+ self.pre_seq_len,
+ self.num_layers * 2,
+ self.num_attention_heads,
+ self.hidden_size // self.num_attention_heads
+ )
+ # seq_len, b, nh, hidden_size
+ past_key_values = self.dropout(past_key_values)
+ past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
+ # past_key_values = [(v[0], v[1]) for v in past_key_values]
+ return past_key_values
+
+ @add_start_docstrings_to_model_forward(CHATGLM_6B_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+ inputs_embeds: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape[:2]
+ elif inputs_embeds is not None:
+ batch_size, seq_length = inputs_embeds.shape[:2]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+
+ if past_key_values is None:
+ if self.pre_seq_len is not None:
+ past_key_values = self.get_prompt(batch_size=input_ids.shape[0], device=input_ids.device,
+ dtype=inputs_embeds.dtype)
+ else:
+ past_key_values = tuple([None] * len(self.layers))
+
+ if attention_mask is None:
+ attention_mask = self.get_masks(
+ input_ids,
+ device=input_ids.device
+ )
+
+
+ if position_ids is None:
+ MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
+ seqs = input_ids.tolist()
+
+ mask_positions, use_gmasks = [], []
+ for seq in seqs:
+ mask_token = gMASK if gMASK in seq else MASK
+ use_gmask = mask_token == gMASK
+ mask_positions.append(seq.index(mask_token))
+ use_gmasks.append(use_gmask)
+
+ position_ids = self.get_position_ids(
+ input_ids,
+ mask_positions=mask_positions,
+ device=input_ids.device,
+ use_gmasks=use_gmasks
+ )
+
+ if self.pre_seq_len is not None and attention_mask is not None:
+ prefix_attention_mask = torch.ones(batch_size, 1, input_ids.size(-1), self.pre_seq_len).to(
+ attention_mask.device)
+ prefix_attention_mask = (prefix_attention_mask < 0.5).bool()
+ attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=3)
+
+ # [seq_len, batch, hidden_size]
+ hidden_states = inputs_embeds.transpose(0, 1)
+
+ presents = () if use_cache else None
+ all_self_attentions = () if output_attentions else None
+ all_hidden_states = () if output_hidden_states else None
+
+ if attention_mask is None:
+ attention_mask = torch.zeros(1, 1, device=input_ids.device).bool()
+ else:
+ attention_mask = attention_mask.to(hidden_states.device)
+
+ for i, layer in enumerate(self.layers):
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+ layer_past = past_key_values[i]
+
+ if self.gradient_checkpointing and self.training:
+ layer_ret = torch.utils.checkpoint.checkpoint(
+ layer,
+ hidden_states,
+ position_ids,
+ attention_mask,
+ torch.tensor(i),
+ layer_past,
+ use_cache,
+ output_attentions
+ )
+ else:
+ layer_ret = layer(
+ hidden_states,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ layer_id=torch.tensor(i),
+ layer_past=layer_past,
+ use_cache=use_cache,
+ output_attentions=output_attentions
+ )
+
+ hidden_states = layer_ret[0]
+
+ if use_cache:
+ presents = presents + (layer_ret[1],)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_ret[2 if use_cache else 1],)
+
+ # Final layer norm.
+ hidden_states = self.final_layernorm(hidden_states)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=presents,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
+ def __init__(self, config: ChatGLMConfig, empty_init=True):
+ super().__init__(config)
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+
+ # self.hidden_size = config.hidden_size
+ # self.params_dtype = torch.half
+ # self.vocab_size = config.vocab_size
+ self.max_sequence_length = config.max_sequence_length
+
+ self.position_encoding_2d = config.position_encoding_2d
+
+ self.transformer = ChatGLMModel(config, empty_init=empty_init)
+
+ self.lm_head = init_method(
+ nn.Linear,
+ config.hidden_size,
+ config.vocab_size,
+ bias=False,
+ dtype=torch.half
+ )
+
+ self.config = config
+
+ self.quantized = False
+
+ if self.config.quantization_bit:
+ self.quantize(self.config.quantization_bit, empty_init=True)
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def _update_model_kwargs_for_generation(
+ self,
+ outputs: ModelOutput,
+ model_kwargs: Dict[str, Any],
+ is_encoder_decoder: bool = False,
+ standardize_cache_format: bool = False,
+ ) -> Dict[str, Any]:
+ # update past_key_values
+ model_kwargs["past_key_values"] = self._extract_past_from_model_output(
+ outputs, standardize_cache_format=standardize_cache_format
+ )
+
+ # update attention mask
+ if "attention_mask" in model_kwargs:
+ attention_mask = model_kwargs["attention_mask"]
+ if attention_mask is not None and attention_mask.dtype == torch.bool:
+ attention_mask = torch.cat(
+ [attention_mask, attention_mask.new_ones((*attention_mask.shape[:3], 1))], dim=3)
+ new_attention_mask = attention_mask[:, :, -1:].clone()
+ new_attention_mask[..., -1] = False
+ model_kwargs["attention_mask"] = torch.cat(
+ [attention_mask, new_attention_mask], dim=2
+ )
+
+ # update position ids
+ if "position_ids" in model_kwargs:
+ position_ids = model_kwargs["position_ids"]
+ new_position_id = position_ids[..., -1:].clone()
+ new_position_id[:, 1, :] += 1
+ model_kwargs["position_ids"] = torch.cat(
+ [position_ids, new_position_id], dim=-1
+ )
+
+ return model_kwargs
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids: torch.LongTensor,
+ past: Optional[torch.Tensor] = None,
+ past_key_values: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ **kwargs
+ ) -> dict:
+ batch_size, seq_length = input_ids.shape
+ MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
+ seqs = input_ids.tolist()
+ mask_positions, use_gmasks = [], []
+ for seq in seqs:
+ mask_token = gMASK if gMASK in seq else MASK
+ use_gmask = mask_token == gMASK
+ mask_positions.append(seq.index(mask_token))
+ use_gmasks.append(use_gmask)
+
+ # only last token for input_ids if past is not None
+ if past is not None or past_key_values is not None:
+ last_token = input_ids[:, -1].unsqueeze(-1)
+ if attention_mask is not None and attention_mask.dtype == torch.bool:
+ attention_mask = attention_mask[:, :, -1:]
+ else:
+ attention_mask = None
+ if position_ids is not None:
+ position_ids = position_ids[..., -1:]
+ else:
+ context_lengths = [seq.index(self.config.bos_token_id) for seq in seqs]
+ if self.position_encoding_2d:
+ position_ids = torch.tensor(
+ [[mask_position, seq_length - context_length] for mask_position, context_length in
+ zip(mask_positions, context_lengths)], dtype=torch.long, device=input_ids.device).unsqueeze(-1)
+ else:
+ position_ids = torch.tensor([mask_position for mask_position in mask_positions], dtype=torch.long,
+ device=input_ids.device).unsqueeze(-1)
+
+ if past is None:
+ past = past_key_values
+ return {
+ "input_ids": last_token,
+ "past_key_values": past,
+ "position_ids": position_ids,
+ "attention_mask": attention_mask
+ }
+ else:
+ if attention_mask is not None and attention_mask.dtype != torch.bool:
+ logger.warning_once(f"The dtype of attention mask ({attention_mask.dtype}) is not bool")
+ attention_mask = None
+ if attention_mask is None:
+ attention_mask = self.get_masks(
+ input_ids,
+ device=input_ids.device
+ )
+ if position_ids is None:
+ position_ids = self.get_position_ids(
+ input_ids,
+ device=input_ids.device,
+ mask_positions=mask_positions,
+ use_gmasks=use_gmasks
+ )
+
+ return {
+ "input_ids": input_ids,
+ "past_key_values": past,
+ "position_ids": position_ids,
+ "attention_mask": attention_mask
+ }
+
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = transformer_outputs[0]
+
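+ # The transformer works in [seq_len, batch, hidden]; permute the logits to [batch, seq_len, vocab]
+ # for loss computation and the generation utilities.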
+ lm_logits = self.lm_head(hidden_states).permute(1, 0, 2).contiguous()
+
+ loss = None
+ if labels is not None:
+ lm_logits = lm_logits.to(torch.float32)
+
+ # Shift so that tokens < n predict n
+ shift_logits = lm_logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss(ignore_index=-100)
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+ lm_logits = lm_logits.to(hidden_states.dtype)
+ loss = loss.to(hidden_states.dtype)
+
+ if not return_dict:
+ output = (lm_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=lm_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+ @staticmethod
+ def _reorder_cache(
+ past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
+ ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
+ """
+ This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
+ [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
+ beam_idx at every generation step.
+
+ Output shares the same memory storage as `past`.
+ """
+ return tuple(
+ (
+ layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
+ layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
+ )
+ for layer_past in past
+ )
+
+ def process_response(self, response):
+ response = response.strip()
+ response = response.replace("[[训练时间]]", "2023年")
+ punkts = [
+ [",", ","],
+ ["!", "!"],
+ [":", ":"],
+ [";", ";"],
+ ["\?", "?"],
+ ]
+ for item in punkts:
+ response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
+ response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
+ return response
+
+ @torch.no_grad()
+ def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, num_beams=1,
+ do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
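+ """Single-round chat helper: builds the multi-round prompt, runs `generate`, and post-processes the reply.
+
+ A minimal usage sketch (assuming the model and tokenizer have already been loaded from this repo):
+
+ >>> response, history = model.chat(tokenizer, "Hello", history=[])
+ >>> response, history = model.chat(tokenizer, "Tell me more", history=history)
+ """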
+ if history is None:
+ history = []
+ if logits_processor is None:
+ logits_processor = LogitsProcessorList()
+ logits_processor.append(InvalidScoreLogitsProcessor())
+ gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
+ "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+ if not history:
+ prompt = query
+ else:
+ prompt = ""
+ for i, (old_query, response) in enumerate(history):
+ prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+ prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+ inputs = tokenizer([prompt], return_tensors="pt")
+ inputs = inputs.to(self.device)
+ outputs = self.generate(**inputs, **gen_kwargs)
+ outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+ response = tokenizer.decode(outputs)
+ response = self.process_response(response)
+ history = history + [(query, response)]
+ return response, history
+
+ @torch.no_grad()
+ def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048,
+ do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
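+ """Streaming variant of `chat`: yields (partial_response, updated_history) after every generated token.
+
+ A minimal usage sketch (same assumptions as `chat`):
+
+ >>> for response, history in model.stream_chat(tokenizer, "Hello", history=[]):
+ ... print(response)
+ """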
+ if history is None:
+ history = []
+ if logits_processor is None:
+ logits_processor = LogitsProcessorList()
+ logits_processor.append(InvalidScoreLogitsProcessor())
+ gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
+ "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+ if not history:
+ prompt = query
+ else:
+ prompt = ""
+ for i, (old_query, response) in enumerate(history):
+ prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+ prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+ inputs = tokenizer([prompt], return_tensors="pt")
+ inputs = inputs.to(self.device)
+ for outputs in self.stream_generate(**inputs, **gen_kwargs):
+ outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+ response = tokenizer.decode(outputs)
+ response = self.process_response(response)
+ new_history = history + [(query, response)]
+ yield response, new_history
+
+ @torch.no_grad()
+ def stream_generate(
+ self,
+ input_ids,
+ generation_config: Optional[GenerationConfig] = None,
+ logits_processor: Optional[LogitsProcessorList] = None,
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
+ prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+ **kwargs,
+ ):
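+ """Simplified decoding loop used by `stream_chat`: supports sampling and greedy decoding (no beam
+ search) and yields the full, growing `input_ids` tensor after each generated token."""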
+ batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
+
+ if generation_config is None:
+ generation_config = self.generation_config
+ generation_config = copy.deepcopy(generation_config)
+ model_kwargs = generation_config.update(**kwargs)
+ bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
+
+ if isinstance(eos_token_id, int):
+ eos_token_id = [eos_token_id]
+
+ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+ if has_default_max_length and generation_config.max_new_tokens is None:
+ warnings.warn(
+ f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
+ "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
+ " recommend using `max_new_tokens` to control the maximum length of the generation.",
+ UserWarning,
+ )
+ elif generation_config.max_new_tokens is not None:
+ generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
+ if not has_default_max_length:
+ logger.warning(
+ f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length` (="
+ f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+ "Please refer to the documentation for more information. "
+ "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+ )
+
+ if input_ids_seq_length >= generation_config.max_length:
+ input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+ logger.warning(
+ f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
+ f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+ " increasing `max_new_tokens`."
+ )
+
+ # 2. Set generation parameters if not already defined
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+ logits_processor = self._get_logits_processor(
+ generation_config=generation_config,
+ input_ids_seq_length=input_ids_seq_length,
+ encoder_input_ids=input_ids,
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+ logits_processor=logits_processor,
+ )
+
+ stopping_criteria = self._get_stopping_criteria(
+ generation_config=generation_config, stopping_criteria=stopping_criteria
+ )
+ logits_warper = self._get_logits_warper(generation_config)
+
+ unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+ scores = None
+ while True:
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+ # forward pass to get next token
+ outputs = self(
+ **model_inputs,
+ return_dict=True,
+ output_attentions=False,
+ output_hidden_states=False,
+ )
+
+ next_token_logits = outputs.logits[:, -1, :]
+
+ # pre-process distribution
+ next_token_scores = logits_processor(input_ids, next_token_logits)
+ next_token_scores = logits_warper(input_ids, next_token_scores)
+
+ # sample
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
+ if generation_config.do_sample:
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+ else:
+ next_tokens = torch.argmax(probs, dim=-1)
+
+ # update generated ids, model inputs, and length for next step
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+ model_kwargs = self._update_model_kwargs_for_generation(
+ outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+ )
+ unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
+
+ # stop when each sentence is finished, or if we exceed the maximum length
+ if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+ break
+ yield input_ids
+
+ def quantize(self, bits: int, empty_init=False, **kwargs):
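+ """Quantize the transformer's linear layers in place to `bits` (4 or 8); `bits == 0` leaves the model untouched."""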
+ if bits == 0:
+ return
+
+ from .quantization import quantize
+
+ if self.quantized:
+ logger.info("Already quantized.")
+ return self
+
+ self.quantized = True
+
+ self.config.quantization_bit = bits
+
+ self.transformer = quantize(self.transformer, bits, empty_init=empty_init, **kwargs)
+ return self
diff --git a/model_2/pytorch_model.bin b/model_2/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..626a1afab82c1a5e5434124af95947a6e894b0c3
--- /dev/null
+++ b/model_2/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7ae4b23eeb186ad78a4a2af5ea05f67cd074e0750696916dd14b5f6ef68a521
+size 117441341
diff --git a/model_2/quantization.py b/model_2/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f469f6a25a8233fe881608168daeba0bc809540
--- /dev/null
+++ b/model_2/quantization.py
@@ -0,0 +1,201 @@
+from torch.nn import Linear
+from torch.nn.parameter import Parameter
+
+import bz2
+import torch
+import base64
+import ctypes
+from transformers.utils import logging
+
+from typing import List
+from functools import partial
+
+logger = logging.get_logger(__name__)
+
+try:
+ from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up
+
+ class Kernel:
+ def __init__(self, code: bytes, function_names: List[str]):
+ self.code = code
+ self._function_names = function_names
+ self._cmodule = LazyKernelCModule(self.code)
+
+ for name in self._function_names:
+ setattr(self, name, KernelFunction(self._cmodule, name))
+
+ quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+t
e+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzfZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyH
pix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ"
+
+ kernels = Kernel(
+ bz2.decompress(base64.b64decode(quantization_code)),
+ [
+ "int4WeightCompression",
+ "int4WeightExtractionFloat",
+ "int4WeightExtractionHalf",
+ "int8WeightExtractionFloat",
+ "int8WeightExtractionHalf",
+ ],
+ )
+except Exception as exception:
+ kernels = None
+ logger.warning("Failed to load cpm_kernels:" + str(exception))
+
+
+class W8A16Linear(torch.autograd.Function):
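+ """Autograd function for a linear layer with quantized weights: the stored integer weights are
+ dequantized to fp16 on the fly for both the forward matmul and the backward gradients, while
+ activations remain in fp16 throughout."""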
+ @staticmethod
+ def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
+ ctx.inp_shape = inp.size()
+ ctx.weight_bit_width = weight_bit_width
+ out_features = quant_w.size(0)
+ inp = inp.contiguous().view(-1, inp.size(-1))
+ weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
+ ctx.weight_shape = weight.size()
+ output = inp.mm(weight.t())
+ ctx.save_for_backward(inp, quant_w, scale_w)
+ return output.view(*(ctx.inp_shape[:-1] + (out_features,)))
+
+ @staticmethod
+ def backward(ctx, grad_output: torch.Tensor):
+ inp, quant_w, scale_w = ctx.saved_tensors
+ weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
+ grad_output = grad_output.contiguous().view(-1, weight.size(0))
+ grad_input = grad_output.mm(weight)
+ grad_weight = grad_output.t().mm(inp)
+ return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None
+
+
+def compress_int4_weight(weight: torch.Tensor): # (n, m)
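+ """Pack an int8 matrix that holds 4-bit values into half as many bytes (two 4-bit weights per int8)."""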
+ with torch.cuda.device(weight.device):
+ n, m = weight.size(0), weight.size(1)
+ assert m % 2 == 0
+ m = m // 2
+ out = torch.empty(n, m, dtype=torch.int8, device="cuda")
+ stream = torch.cuda.current_stream()
+
+ gridDim = (n, 1, 1)
+ blockDim = (min(round_up(m, 32), 1024), 1, 1)
+
+ kernels.int4WeightCompression(
+ gridDim,
+ blockDim,
+ 0,
+ stream,
+ [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
+ )
+ return out
+
+
+def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
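+ """Dequantize an int4/int8 weight matrix back to fp16 on the GPU using the per-row scales."""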
+ if source_bit_width == 8:
+ func = kernels.int8WeightExtractionHalf
+ elif source_bit_width == 4:
+ func = kernels.int4WeightExtractionHalf
+ else:
+ assert False, "Unsupported bit-width"
+
+ with torch.cuda.device(weight.device):
+ n, m = weight.size(0), weight.size(1)
+ out = torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda")
+ stream = torch.cuda.current_stream()
+
+ gridDim = (n, 1, 1)
+ blockDim = (min(round_up(m, 32), 1024), 1, 1)
+
+ func(
+ gridDim,
+ blockDim,
+ 0,
+ stream,
+ [
+ ctypes.c_void_p(weight.data_ptr()),
+ ctypes.c_void_p(scale_list.data_ptr()),
+ ctypes.c_void_p(out.data_ptr()),
+ ctypes.c_int32(n),
+ ctypes.c_int32(m),
+ ],
+ )
+ return out
+
+
+class QuantizedLinear(Linear):
+ def __init__(self, weight_bit_width: int, weight_tensor=None, bias_tensor=None, empty_init=False, *args, **kwargs):
+ super(QuantizedLinear, self).__init__(*args, **kwargs)
+ self.weight_bit_width = weight_bit_width
+
+ shape = self.weight.shape
+ del self.weight
+
+ if weight_tensor is None or empty_init:
+ self.weight = torch.empty(
+ shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"]
+ )
+ self.weight_scale = torch.empty(shape[0], dtype=kwargs["dtype"], device=kwargs["device"])
+ else:
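+ # Symmetric per-output-channel quantization: each row is scaled by max|w| / (2**(bits-1) - 1) and
+ # rounded to int8; 4-bit weights are then packed two per byte by compress_int4_weight.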
+ self.weight_scale = (weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
+ self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
+ if weight_bit_width == 4:
+ self.weight = compress_int4_weight(self.weight)
+
+ self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
+ self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False)
+ if bias_tensor is not None:
+ self.bias = Parameter(bias_tensor.to(kwargs["device"]), requires_grad=False)
+ else:
+ self.bias = None
+
+ def forward(self, input):
+ output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
+ if self.bias is not None:
+ output = output + self.bias
+ return output
+
+
+def quantize(model, weight_bit_width, empty_init=False, **kwargs):
+ """Replace fp16 linear with quantized linear"""
+
+ for layer in model.layers:
+ layer.attention.query_key_value = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.attention.query_key_value.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.attention.query_key_value.bias,
+ in_features=layer.attention.query_key_value.in_features,
+ out_features=layer.attention.query_key_value.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.attention.query_key_value.weight.device,
+ empty_init=empty_init
+ )
+ layer.attention.dense = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.attention.dense.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.attention.dense.bias,
+ in_features=layer.attention.dense.in_features,
+ out_features=layer.attention.dense.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.attention.dense.weight.device,
+ empty_init=empty_init
+ )
+ layer.mlp.dense_h_to_4h = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.mlp.dense_h_to_4h.bias,
+ in_features=layer.mlp.dense_h_to_4h.in_features,
+ out_features=layer.mlp.dense_h_to_4h.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.mlp.dense_h_to_4h.weight.device,
+ empty_init=empty_init
+ )
+ layer.mlp.dense_4h_to_h = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.mlp.dense_4h_to_h.bias,
+ in_features=layer.mlp.dense_4h_to_h.in_features,
+ out_features=layer.mlp.dense_4h_to_h.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.mlp.dense_4h_to_h.weight.device,
+ empty_init=empty_init
+ )
+ return model
diff --git a/model_2/rng_state.pth b/model_2/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1f67278504202fe62600920844e0fc451ce17a2d
--- /dev/null
+++ b/model_2/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18ba03ebdfcb4a727622518507288a616afd143124c792ac0228534fb883d9a4
+size 14575
diff --git a/model_2/scheduler.pt b/model_2/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2ac00758b9a37603ea42d87e34c46ab0a172112b
--- /dev/null
+++ b/model_2/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d2b2ce76eead8b52a95c7fef146bc8312dd69f9e4a2b748d2ffe7770457ad11
+size 627
diff --git a/model_2/special_tokens_map.json b/model_2/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f897c919b758e64c56eb1a7b34b39b569040086
--- /dev/null
+++ b/model_2/special_tokens_map.json
@@ -0,0 +1,7 @@
+{
+ "bos_token": "",
+ "eos_token": "",
+ "mask_token": "[MASK]",
+ "pad_token": "",
+ "unk_token": ""
+}
diff --git a/model_2/tokenization_chatglm.py b/model_2/tokenization_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4f0ba532543b6dbdacdd83d30324b7a6abfad3
--- /dev/null
+++ b/model_2/tokenization_chatglm.py
@@ -0,0 +1,430 @@
+"""Tokenization classes for ChatGLM."""
+from typing import List, Optional, Union
+import os
+
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+from typing import Dict
+import sentencepiece as spm
+import numpy as np
+
+logger = logging.get_logger(__name__)
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "THUDM/chatglm-6b": 2048,
+}
+
+
+class TextTokenizer:
+ def __init__(self, model_path):
+ self.sp = spm.SentencePieceProcessor()
+ self.sp.Load(model_path)
+ self.num_tokens = self.sp.vocab_size()
+
+ def encode(self, text):
+ return self.sp.EncodeAsIds(text)
+
+ def decode(self, ids: List[int]):
+ return self.sp.DecodeIds(ids)
+
+ def tokenize(self, text):
+ return self.sp.EncodeAsPieces(text)
+
+ def convert_tokens_to_ids(self, tokens):
+ return [self.sp.PieceToId(token) for token in tokens]
+
+ def convert_token_to_id(self, token):
+ return self.sp.PieceToId(token)
+
+ def convert_id_to_token(self, idx):
+ return self.sp.IdToPiece(idx)
+
+ def __len__(self):
+ return self.num_tokens
+
+
+class SPTokenizer:
+ def __init__(
+ self,
+ vocab_file,
+ num_image_tokens=20000,
+ max_blank_length=80,
+ byte_fallback=True,
+ ):
+ assert vocab_file is not None
+ self.vocab_file = vocab_file
+ self.num_image_tokens = num_image_tokens
+ self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
+ self.max_blank_length = max_blank_length
+ self.byte_fallback = byte_fallback
+ self.text_tokenizer = TextTokenizer(vocab_file)
+
+ def _get_text_tokenizer(self):
+ return self.text_tokenizer
+
+ @staticmethod
+ def get_blank_token(length: int):
+ assert length >= 2
+ return f"<|blank_{length}|>"
+
+ @staticmethod
+ def get_tab_token():
+ return f"<|tab|>"
+
+ @property
+ def num_text_tokens(self):
+ return self.text_tokenizer.num_tokens
+
+ @property
+ def num_tokens(self):
+ return self.num_image_tokens + self.num_text_tokens
+
+ @staticmethod
+ def _encode_whitespaces(text: str, max_len: int = 80):
+ text = text.replace("\t", SPTokenizer.get_tab_token())
+ for i in range(max_len, 1, -1):
+ text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
+ return text
+
+ def _preprocess(self, text: str, linebreak=True, whitespaces=True):
+ if linebreak:
+ text = text.replace("\n", "")
+ if whitespaces:
+ text = self._encode_whitespaces(text, max_len=self.max_blank_length)
+ return text
+
+ def encode(
+ self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+ ) -> List[int]:
+ """
+ @param text: Text to encode.
+ @param linebreak: Whether to encode newline (\n) in text.
+ @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+ @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
+ """
+ text = self._preprocess(text, linebreak, whitespaces)
+ if not add_dummy_prefix:
+ text = "" + text
+ tmp = self._get_text_tokenizer().encode(text)
+ tokens = [x + self.num_image_tokens for x in tmp]
+ return tokens if add_dummy_prefix else tokens[2:]
+
+ def decode(self, text_ids: List[int]) -> str:
+ ids = [int(_id) - self.num_image_tokens for _id in text_ids]
+ ids = [_id for _id in ids if _id >= 0]
+ text = self._get_text_tokenizer().decode(ids)
+ text = text.replace("", "\n")
+ text = text.replace(SPTokenizer.get_tab_token(), "\t")
+ for i in range(2, self.max_blank_length + 1):
+ text = text.replace(self.get_blank_token(i), " " * i)
+ return text
+
+ def tokenize(
+ self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+ ) -> List[str]:
+ """
+ @param text: Text to encode.
+ @param linebreak: Whether to encode newline (\n) in text.
+ @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+ @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
+ """
+ text = self._preprocess(text, linebreak, whitespaces)
+ if not add_dummy_prefix:
+ text = "" + text
+ tokens = self._get_text_tokenizer().tokenize(text)
+ return tokens if add_dummy_prefix else tokens[2:]
+
+ def __getitem__(self, x: Union[int, str]):
+ if isinstance(x, int):
+ if x < self.num_image_tokens:
+ return "".format(x)
+ else:
+ return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
+ elif isinstance(x, str):
+ if x.startswith("") and x[7:-1].isdigit():
+ return int(x[7:-1])
+ else:
+ return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
+ else:
+ raise ValueError("The key should be str or int.")
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+ """
+ Construct a ChatGLM tokenizer. Based on SentencePiece.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ """
+
+ vocab_files_names = {"vocab_file": "ice_text.model"}
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+ def __init__(
+ self,
+ vocab_file,
+ do_lower_case=False,
+ remove_space=False,
+ bos_token='<sop>',
+ eos_token='<eop>',
+ end_token='</s>',
+ mask_token='[MASK]',
+ gmask_token='[gMASK]',
+ padding_side="left",
+ pad_token="<pad>",
+ num_image_tokens=20000,
+ **kwargs
+ ) -> None:
+ super().__init__(
+ do_lower_case=do_lower_case,
+ remove_space=remove_space,
+ padding_side=padding_side,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ end_token=end_token,
+ mask_token=mask_token,
+ gmask_token=gmask_token,
+ pad_token=pad_token,
+ unk_token=unk_token,
+ num_image_tokens=num_image_tokens,
+ **kwargs
+ )
+
+ self.do_lower_case = do_lower_case
+ self.remove_space = remove_space
+ self.vocab_file = vocab_file
+
+ self.bos_token = bos_token
+ self.eos_token = eos_token
+ self.end_token = end_token
+ self.mask_token = mask_token
+ self.gmask_token = gmask_token
+
+ self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
+
+ """ Initialisation """
+
+ @property
+ def gmask_token_id(self) -> Optional[int]:
+ if self.gmask_token is None:
+ return None
+ return self.convert_tokens_to_ids(self.gmask_token)
+
+ @property
+ def end_token_id(self) -> Optional[int]:
+ """
+ `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
+ set.
+ """
+ if self.end_token is None:
+ return None
+ return self.convert_tokens_to_ids(self.end_token)
+
+ @property
+ def vocab_size(self):
+ """ Returns vocab size """
+ return self.sp_tokenizer.num_tokens
+
+ def get_vocab(self):
+ """ Returns vocab as a dict """
+ vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def preprocess_text(self, inputs):
+ if self.remove_space:
+ outputs = " ".join(inputs.strip().split())
+ else:
+ outputs = inputs
+
+ if self.do_lower_case:
+ outputs = outputs.lower()
+
+ return outputs
+
+ def _tokenize(self, text, **kwargs):
+ """ Returns a tokenized string. """
+ text = self.preprocess_text(text)
+
+ seq = self.sp_tokenizer.tokenize(text)
+
+ return seq
+
+ def _decode(
+ self,
+ token_ids: Union[int, List[int]],
+ skip_special_tokens: bool = False,
+ clean_up_tokenization_spaces: bool = True,
+ **kwargs
+ ) -> str:
+ if isinstance(token_ids, int):
+ token_ids = [token_ids]
+ if len(token_ids) == 0:
+ return ""
+ if self.pad_token_id in token_ids: # remove pad
+ token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
+ return self.sp_tokenizer.decode(token_ids)
+
+ def _convert_token_to_id(self, token):
+ """ Converts a token (str) in an id using the vocab. """
+ return self.sp_tokenizer[token]
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.sp_tokenizer[index]
+
+ def save_vocabulary(self, save_directory, filename_prefix=None):
+ """
+ Save the vocabulary and special tokens file to a directory.
+
+ Args:
+ save_directory (`str`):
+ The directory in which to save the vocabulary.
+ filename_prefix (`str`, *optional*):
+ An optional prefix to add to the names of the saved files.
+
+ Returns:
+ `Tuple(str)`: Paths to the files saved.
+ """
+ if os.path.isdir(save_directory):
+ vocab_file = os.path.join(
+ save_directory, self.vocab_files_names["vocab_file"]
+ )
+ else:
+ vocab_file = save_directory
+
+ with open(self.vocab_file, 'rb') as fin:
+ proto_str = fin.read()
+
+ with open(vocab_file, "wb") as writer:
+ writer.write(proto_str)
+
+ return (vocab_file,)
+
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Build model inputs from a sequence or a pair of sequences by appending the special tokens used by
+ ChatGLM. A ChatGLM sequence has the following format:
+
+ - single sequence: `X [gMASK] <sop>`
+ - pair of sequences: `A [gMASK] <sop> B <eop>`
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ gmask_id = self.sp_tokenizer[self.gmask_token]
+ eos_id = self.sp_tokenizer[self.eos_token]
+ token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
+ if token_ids_1 is not None:
+ token_ids_0 = token_ids_0 + token_ids_1 + [eos_id]
+ return token_ids_0
+
+ def _pad(
+ self,
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+ max_length: Optional[int] = None,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ pad_to_multiple_of: Optional[int] = None,
+ return_attention_mask: Optional[bool] = None,
+ ) -> dict:
+ """
+ Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+ Args:
+ encoded_inputs:
+ Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+ max_length: maximum length of the returned list and optionally padding length (see below).
+ Will truncate by taking into account the special tokens.
+ padding_strategy: PaddingStrategy to use for padding.
+
+ - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
+ - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+ - PaddingStrategy.DO_NOT_PAD: Do not pad
+ The tokenizer padding sides are defined in self.padding_side:
+
+ - 'left': pads on the left of the sequences
+ - 'right': pads on the right of the sequences
+ pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+ `>= 7.5` (Volta).
+ return_attention_mask:
+ (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+ """
+ # Load from model defaults
+ bos_token_id = self.sp_tokenizer[self.bos_token]
+ mask_token_id = self.sp_tokenizer[self.mask_token]
+ gmask_token_id = self.sp_tokenizer[self.gmask_token]
+ assert self.padding_side == "left"
+
+ required_input = encoded_inputs[self.model_input_names[0]]
+ seq_length = len(required_input)
+
+ if padding_strategy == PaddingStrategy.LONGEST:
+ max_length = len(required_input)
+
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+ needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+ # Initialize attention mask if not present.
+ if max_length is not None:
+ if "attention_mask" not in encoded_inputs:
+ if bos_token_id in required_input:
+ context_length = required_input.index(bos_token_id)
+ else:
+ context_length = seq_length
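+ # Build ChatGLM's prefix attention mask: every position may attend to the full prompt before <sop>,
+ # and only causally afterwards; entries set to True are masked out.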
+ attention_mask = np.ones((1, seq_length, seq_length))
+ attention_mask = np.tril(attention_mask)
+ attention_mask[:, :, :context_length] = 1
+ attention_mask = np.bool_(attention_mask < 0.5)
+ encoded_inputs["attention_mask"] = attention_mask
+
+ if "position_ids" not in encoded_inputs:
+ if bos_token_id in required_input:
+ context_length = required_input.index(bos_token_id)
+ else:
+ context_length = seq_length
+ position_ids = np.arange(seq_length, dtype=np.int64)
+ mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
+ if mask_token in required_input:
+ mask_position = required_input.index(mask_token)
+ position_ids[context_length:] = mask_position
+ block_position_ids = np.concatenate(
+ [np.zeros(context_length, dtype=np.int64),
+ np.arange(1, seq_length - context_length + 1, dtype=np.int64)])
+ encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
+
+ if needs_to_be_padded:
+ difference = max_length - len(required_input)
+
+ if "attention_mask" in encoded_inputs:
+ encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+ pad_width=[(0, 0), (difference, 0), (difference, 0)],
+ mode='constant', constant_values=True)
+ if "token_type_ids" in encoded_inputs:
+ encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+ "token_type_ids"
+ ]
+ if "special_tokens_mask" in encoded_inputs:
+ encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+ if "position_ids" in encoded_inputs:
+ encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+ pad_width=[(0, 0), (difference, 0)])
+ encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+
+ return encoded_inputs
diff --git a/model_2/tokenizer_config.json b/model_2/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3f8e1c935cc40c270ff6ac75c05b4208533688a
--- /dev/null
+++ b/model_2/tokenizer_config.json
@@ -0,0 +1,22 @@
+{
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_chatglm.ChatGLMTokenizer",
+ null
+ ]
+ },
+ "bos_token": "",
+ "do_lower_case": false,
+ "end_token": "",
+ "eos_token": "",
+ "gmask_token": "[gMASK]",
+ "mask_token": "[MASK]",
+ "model_max_length": 1000000000000000019884624838656,
+ "num_image_tokens": 0,
+ "pad_token": "",
+ "padding_side": "left",
+ "remove_space": false,
+ "special_tokens_map_file": null,
+ "tokenizer_class": "ChatGLMTokenizer",
+ "unk_token": ""
+}
diff --git a/model_2/trainer_state.json b/model_2/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f8f799e4aa5ed8f5ebc04236e92fc32de209be2
--- /dev/null
+++ b/model_2/trainer_state.json
@@ -0,0 +1,196 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 63.1578947368421,
+ "global_step": 300,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 2.11,
+ "learning_rate": 0.019333333333333334,
+ "loss": 1.1331,
+ "step": 10
+ },
+ {
+ "epoch": 4.21,
+ "learning_rate": 0.018666666666666668,
+ "loss": 0.2373,
+ "step": 20
+ },
+ {
+ "epoch": 6.32,
+ "learning_rate": 0.018000000000000002,
+ "loss": 0.1028,
+ "step": 30
+ },
+ {
+ "epoch": 8.42,
+ "learning_rate": 0.017333333333333336,
+ "loss": 0.0617,
+ "step": 40
+ },
+ {
+ "epoch": 10.53,
+ "learning_rate": 0.016666666666666666,
+ "loss": 0.0309,
+ "step": 50
+ },
+ {
+ "epoch": 12.63,
+ "learning_rate": 0.016,
+ "loss": 0.015,
+ "step": 60
+ },
+ {
+ "epoch": 14.74,
+ "learning_rate": 0.015333333333333334,
+ "loss": 0.0086,
+ "step": 70
+ },
+ {
+ "epoch": 16.84,
+ "learning_rate": 0.014666666666666666,
+ "loss": 0.0048,
+ "step": 80
+ },
+ {
+ "epoch": 18.95,
+ "learning_rate": 0.013999999999999999,
+ "loss": 0.0043,
+ "step": 90
+ },
+ {
+ "epoch": 21.05,
+ "learning_rate": 0.013333333333333332,
+ "loss": 0.0032,
+ "step": 100
+ },
+ {
+ "epoch": 23.16,
+ "learning_rate": 0.012666666666666666,
+ "loss": 0.003,
+ "step": 110
+ },
+ {
+ "epoch": 25.26,
+ "learning_rate": 0.012,
+ "loss": 0.0029,
+ "step": 120
+ },
+ {
+ "epoch": 27.37,
+ "learning_rate": 0.011333333333333332,
+ "loss": 0.0027,
+ "step": 130
+ },
+ {
+ "epoch": 29.47,
+ "learning_rate": 0.010666666666666666,
+ "loss": 0.0024,
+ "step": 140
+ },
+ {
+ "epoch": 31.58,
+ "learning_rate": 0.01,
+ "loss": 0.0024,
+ "step": 150
+ },
+ {
+ "epoch": 33.68,
+ "learning_rate": 0.009333333333333334,
+ "loss": 0.0022,
+ "step": 160
+ },
+ {
+ "epoch": 35.79,
+ "learning_rate": 0.008666666666666668,
+ "loss": 0.002,
+ "step": 170
+ },
+ {
+ "epoch": 37.89,
+ "learning_rate": 0.008,
+ "loss": 0.0021,
+ "step": 180
+ },
+ {
+ "epoch": 40.0,
+ "learning_rate": 0.007333333333333333,
+ "loss": 0.0018,
+ "step": 190
+ },
+ {
+ "epoch": 42.11,
+ "learning_rate": 0.006666666666666666,
+ "loss": 0.0018,
+ "step": 200
+ },
+ {
+ "epoch": 44.21,
+ "learning_rate": 0.006,
+ "loss": 0.002,
+ "step": 210
+ },
+ {
+ "epoch": 46.32,
+ "learning_rate": 0.005333333333333333,
+ "loss": 0.0019,
+ "step": 220
+ },
+ {
+ "epoch": 48.42,
+ "learning_rate": 0.004666666666666667,
+ "loss": 0.0021,
+ "step": 230
+ },
+ {
+ "epoch": 50.53,
+ "learning_rate": 0.004,
+ "loss": 0.0016,
+ "step": 240
+ },
+ {
+ "epoch": 52.63,
+ "learning_rate": 0.003333333333333333,
+ "loss": 0.0019,
+ "step": 250
+ },
+ {
+ "epoch": 54.74,
+ "learning_rate": 0.0026666666666666666,
+ "loss": 0.0019,
+ "step": 260
+ },
+ {
+ "epoch": 56.84,
+ "learning_rate": 0.002,
+ "loss": 0.0018,
+ "step": 270
+ },
+ {
+ "epoch": 58.95,
+ "learning_rate": 0.0013333333333333333,
+ "loss": 0.0021,
+ "step": 280
+ },
+ {
+ "epoch": 61.05,
+ "learning_rate": 0.0006666666666666666,
+ "loss": 0.0017,
+ "step": 290
+ },
+ {
+ "epoch": 63.16,
+ "learning_rate": 0.0,
+ "loss": 0.0018,
+ "step": 300
+ }
+ ],
+ "max_steps": 300,
+ "num_train_epochs": 75,
+ "total_flos": 3.47740516122624e+16,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/model_2/training_args.bin b/model_2/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..eaa08a1142bb4e5aba38a047160f38fad70079a2
--- /dev/null
+++ b/model_2/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e93f8f5f3215846cf060255f2b122e6b3dfa6780798e083bc4e8f5f4a490839
+size 3707
diff --git a/modeling_chatglm.py b/modeling_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc291119053cdf164b697ad022735f916fb3e8b1
--- /dev/null
+++ b/modeling_chatglm.py
@@ -0,0 +1,1435 @@
+""" PyTorch ChatGLM model. """
+
+import math
+import copy
+import os
+import warnings
+import re
+import sys
+
+import torch
+import torch.utils.checkpoint
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import CrossEntropyLoss, LayerNorm
+from torch.nn.utils import skip_init
+from typing import Optional, Tuple, Union, List, Callable, Dict, Any
+
+from transformers.utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+)
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+ BaseModelOutputWithPastAndCrossAttentions,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
+
+from .configuration_chatglm import ChatGLMConfig
+
+# flags required to enable jit fusion kernels
+
+if sys.platform != 'darwin':
+ torch._C._jit_set_profiling_mode(False)
+ torch._C._jit_set_profiling_executor(False)
+ torch._C._jit_override_can_fuse_on_cpu(True)
+ torch._C._jit_override_can_fuse_on_gpu(True)
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM-6B"
+_CONFIG_FOR_DOC = "ChatGLM6BConfig"
+
+CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ "THUDM/chatglm-6b",
+ # See all ChatGLM-6B models at https://huggingface.co/models?filter=chatglm
+]
+
+
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+ if torch.isnan(scores).any() or torch.isinf(scores).any():
+ scores.zero_()
+ scores[..., 5] = 5e4
+ return scores
+
+
+def load_tf_weights_in_chatglm_6b(model, config, tf_checkpoint_path):
+ """Load tf checkpoints in a pytorch model."""
+ try:
+ import re
+
+ import numpy as np
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array)
+
+ for name, array in zip(names, arrays):
+ name = name.split("/")
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+ # which are not required for using pretrained model
+ if any(
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+ for n in name
+ ):
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+ scope_names = re.split(r"_(\d+)", m_name)
+ else:
+ scope_names = [m_name]
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+ pointer = getattr(pointer, "bias")
+ elif scope_names[0] == "output_weights":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "squad":
+ pointer = getattr(pointer, "classifier")
+ else:
+ try:
+ pointer = getattr(pointer, scope_names[0])
+ except AttributeError:
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ if len(scope_names) >= 2:
+ num = int(scope_names[1])
+ pointer = pointer[num]
+ if m_name[-11:] == "_embeddings":
+ pointer = getattr(pointer, "weight")
+ elif m_name == "kernel":
+ array = np.transpose(array)
+ try:
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+ except AssertionError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ return model
+
+
+class PrefixEncoder(torch.nn.Module):
+ """
+ The torch.nn model to encode the prefix
+ Input shape: (batch-size, prefix-length)
+ Output shape: (batch-size, prefix-length, 2*layers*hidden)
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.prefix_projection = config.prefix_projection
+ if self.prefix_projection:
+ # Use a two-layer MLP to encode the prefix
+ self.embedding = torch.nn.Embedding(config.pre_seq_len, config.hidden_size)
+ self.trans = torch.nn.Sequential(
+ torch.nn.Linear(config.hidden_size, config.hidden_size),
+ torch.nn.Tanh(),
+ torch.nn.Linear(config.hidden_size, config.num_layers * config.hidden_size * 2)
+ )
+ else:
+ self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_layers * config.hidden_size * 2)
+
+ def forward(self, prefix: torch.Tensor):
+ if self.prefix_projection:
+ prefix_tokens = self.embedding(prefix)
+ past_key_values = self.trans(prefix_tokens)
+ else:
+ past_key_values = self.embedding(prefix)
+ return past_key_values
+
+
+@torch.jit.script
+def gelu_impl(x):
+ """OpenAI's gelu implementation."""
+ return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
+ (1.0 + 0.044715 * x * x)))
+
+
+def gelu(x):
+ return gelu_impl(x)
+
+
+class RotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, base=10000, precision=torch.half, learnable=False):
+ super().__init__()
+ inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
+ inv_freq = inv_freq.half()
+ self.learnable = learnable
+ if learnable:
+ self.inv_freq = torch.nn.Parameter(inv_freq)
+ self.max_seq_len_cached = None
+ else:
+ self.register_buffer('inv_freq', inv_freq)
+ self.max_seq_len_cached = None
+ self.cos_cached = None
+ self.sin_cached = None
+ self.precision = precision
+
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
+ error_msgs):
+ pass
+
+ def forward(self, x, seq_dim=1, seq_len=None):
+ if seq_len is None:
+ seq_len = x.shape[seq_dim]
+ if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
+ self.max_seq_len_cached = None if self.learnable else seq_len
+ t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
+ # Different from the paper: a different permutation is used, but it yields the same computation
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+ if self.precision == torch.bfloat16:
+ emb = emb.float()
+
+ # [sx, 1 (b * np), hn]
+ cos_cached = emb.cos()[:, None, :]
+ sin_cached = emb.sin()[:, None, :]
+ if self.precision == torch.bfloat16:
+ cos_cached = cos_cached.bfloat16()
+ sin_cached = sin_cached.bfloat16()
+ if self.learnable:
+ return cos_cached, sin_cached
+ self.cos_cached, self.sin_cached = cos_cached, sin_cached
+ return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
+
+ def _apply(self, fn):
+ if self.cos_cached is not None:
+ self.cos_cached = fn(self.cos_cached)
+ if self.sin_cached is not None:
+ self.sin_cached = fn(self.sin_cached)
+ return super()._apply(fn)
+
+
+def rotate_half(x):
+ x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
+ return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in earlier torch versions
+
+
+@torch.jit.script
+def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):
+ # position_id: [sq, b], q, k: [sq, b, np, hn], cos: [sq, 1, hn] -> [sq, b, 1, hn]
+ cos, sin = F.embedding(position_id, cos.squeeze(1)).unsqueeze(2), \
+ F.embedding(position_id, sin.squeeze(1)).unsqueeze(2)
+ q, k = (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
+ return q, k
+
+
+def attention_fn(
+ self,
+ query_layer,
+ key_layer,
+ value_layer,
+ attention_mask,
+ hidden_size_per_partition,
+ layer_id,
+ layer_past=None,
+ scaling_attention_score=True,
+ use_cache=False,
+):
+ if layer_past is not None:
+ past_key, past_value = layer_past[0], layer_past[1]
+ key_layer = torch.cat((past_key, key_layer), dim=0)
+ value_layer = torch.cat((past_value, value_layer), dim=0)
+
+ # seqlen, batch, num_attention_heads, hidden_size_per_attention_head
+ seq_len, b, nh, hidden_size = key_layer.shape
+
+ if use_cache:
+ present = (key_layer, value_layer)
+ else:
+ present = None
+
+ query_key_layer_scaling_coeff = float(layer_id + 1)
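+ # fp16-stability trick: queries are pre-divided by (layer_id + 1) here and the attention scores are
+ # multiplied by the same coefficient again before the softmax, so the result is mathematically
+ # unchanged while intermediate values stay small.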
+ if scaling_attention_score:
+ query_layer = query_layer / (math.sqrt(hidden_size) * query_key_layer_scaling_coeff)
+
+ # ===================================
+ # Raw attention scores. [b, np, s, s]
+ # ===================================
+
+ # [b, np, sq, sk]
+ output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
+
+ # [sq, b, np, hn] -> [sq, b * np, hn]
+ query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
+ # [sk, b, np, hn] -> [sk, b * np, hn]
+ key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+
+ matmul_result = torch.zeros(
+ 1, 1, 1,
+ dtype=query_layer.dtype,
+ device=query_layer.device,
+ )
+
+ matmul_result = torch.baddbmm(
+ matmul_result,
+ query_layer.transpose(0, 1), # [b * np, sq, hn]
+ key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
+ beta=0.0,
+ alpha=1.0,
+ )
+
+ # change view to [b, np, sq, sk]
+ attention_scores = matmul_result.view(*output_size)
+
+ if self.scale_mask_softmax:
+ self.scale_mask_softmax.scale = query_key_layer_scaling_coeff
+ attention_probs = self.scale_mask_softmax(attention_scores, attention_mask.contiguous())
+ else:
+ if not (attention_mask == 0).all():
+ # if auto-regressive, skip
+ attention_scores.masked_fill_(attention_mask, -10000.0)
+ dtype = attention_scores.dtype
+ attention_scores = attention_scores.float()
+ attention_scores = attention_scores * query_key_layer_scaling_coeff
+
+ attention_probs = F.softmax(attention_scores, dim=-1)
+
+ attention_probs = attention_probs.type(dtype)
+
+ # =========================
+ # Context layer. [sq, b, hp]
+ # =========================
+
+ # value_layer -> context layer.
+ # [sk, b, np, hn] --> [b, np, sq, hn]
+
+ # context layer shape: [b, np, sq, hn]
+ output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
+
+ # change view [sk, b * np, hn]
+ value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
+
+ # change view [b * np, sq, sk]
+ attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
+
+ # matmul: [b * np, sq, hn]
+ context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+
+ # change view [b, np, sq, hn]
+ context_layer = context_layer.view(*output_size)
+
+ # [b, np, sq, hn] --> [sq, b, np, hn]
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+
+ # [sq, b, np, hn] --> [sq, b, hp]
+ new_context_layer_shape = context_layer.size()[:-2] + (hidden_size_per_partition,)
+ context_layer = context_layer.view(*new_context_layer_shape)
+
+ outputs = (context_layer, present, attention_probs)
+
+ return outputs
+
+
+def default_init(cls, *args, **kwargs):
+ return cls(*args, **kwargs)
+
+
+class SelfAttention(torch.nn.Module):
+ def __init__(self, hidden_size, num_attention_heads,
+ layer_id, hidden_size_per_attention_head=None, bias=True,
+ params_dtype=torch.float, position_encoding_2d=True, empty_init=True):
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+ super(SelfAttention, self).__init__()
+
+ self.layer_id = layer_id
+ self.hidden_size = hidden_size
+ self.hidden_size_per_partition = hidden_size
+ self.num_attention_heads = num_attention_heads
+ self.num_attention_heads_per_partition = num_attention_heads
+ self.position_encoding_2d = position_encoding_2d
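+ # With 2D position encoding, each head dimension is split into two halves in forward();
+ # each half gets its own rotary stream (token position vs. block position), so the rotary
+ # dimension below is half the per-head size.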
+ self.rotary_emb = RotaryEmbedding(
+ self.hidden_size // (self.num_attention_heads * 2)
+ if position_encoding_2d
+ else self.hidden_size // self.num_attention_heads,
+ base=10000,
+ precision=torch.half,
+ learnable=False,
+ )
+
+ self.scale_mask_softmax = None
+
+ if hidden_size_per_attention_head is None:
+ self.hidden_size_per_attention_head = hidden_size // num_attention_heads
+ else:
+ self.hidden_size_per_attention_head = hidden_size_per_attention_head
+
+ self.inner_hidden_size = num_attention_heads * self.hidden_size_per_attention_head
+
+ # Strided linear layer.
+ self.query_key_value = init_method(
+ torch.nn.Linear,
+ hidden_size,
+ 3 * self.inner_hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+
+ self.dense = init_method(
+ torch.nn.Linear,
+ self.inner_hidden_size,
+ hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+
+ @staticmethod
+ def attention_mask_func(attention_scores, attention_mask):
+ attention_scores.masked_fill_(attention_mask, -10000.0)
+ return attention_scores
+
+ def split_tensor_along_last_dim(self, tensor, num_partitions,
+ contiguous_split_chunks=False):
+ """Split a tensor along its last dimension.
+ Arguments:
+ tensor: input tensor.
+ num_partitions: number of partitions to split the tensor
+ contiguous_split_chunks: If True, make each chunk contiguous
+ in memory.
+ """
+ # Get the size and dimension.
+ last_dim = tensor.dim() - 1
+ last_dim_size = tensor.size()[last_dim] // num_partitions
+ # Split.
+ tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+ # Note: torch.split does not create contiguous tensors by default.
+ if contiguous_split_chunks:
+ return tuple(chunk.contiguous() for chunk in tensor_list)
+
+ return tensor_list
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids,
+ attention_mask: torch.Tensor,
+ layer_id,
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: bool = False,
+ output_attentions: bool = False,
+ ):
+ """
+ hidden_states: [seq_len, batch, hidden_size]
+ attention_mask: [(1, 1), seq_len, seq_len]
+ """
+
+ # [seq_len, batch, 3 * hidden_size]
+ mixed_raw_layer = self.query_key_value(hidden_states)
+
+ # [seq_len, batch, 3 * hidden_size] --> [seq_len, batch, num_attention_heads, 3 * hidden_size_per_attention_head]
+ new_tensor_shape = mixed_raw_layer.size()[:-1] + (
+ self.num_attention_heads_per_partition,
+ 3 * self.hidden_size_per_attention_head,
+ )
+ mixed_raw_layer = mixed_raw_layer.view(*new_tensor_shape)
+
+ # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head]
+ (query_layer, key_layer, value_layer) = self.split_tensor_along_last_dim(mixed_raw_layer, 3)
+
+ if self.position_encoding_2d:
+ q1, q2 = query_layer.chunk(2, dim=(query_layer.ndim - 1))
+ k1, k2 = key_layer.chunk(2, dim=(key_layer.ndim - 1))
+ cos, sin = self.rotary_emb(q1, seq_len=position_ids.max() + 1)
+ position_ids, block_position_ids = position_ids[:, 0, :].transpose(0, 1).contiguous(), \
+ position_ids[:, 1, :].transpose(0, 1).contiguous()
+ q1, k1 = apply_rotary_pos_emb_index(q1, k1, cos, sin, position_ids)
+ q2, k2 = apply_rotary_pos_emb_index(q2, k2, cos, sin, block_position_ids)
+ query_layer = torch.concat([q1, q2], dim=(q1.ndim - 1))
+ key_layer = torch.concat([k1, k2], dim=(k1.ndim - 1))
+ else:
+ position_ids = position_ids.transpose(0, 1)
+ cos, sin = self.rotary_emb(value_layer, seq_len=position_ids.max() + 1)
+ # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head]
+ query_layer, key_layer = apply_rotary_pos_emb_index(query_layer, key_layer, cos, sin, position_ids)
+
+ # [seq_len, batch, hidden_size]
+ context_layer, present, attention_probs = attention_fn(
+ self=self,
+ query_layer=query_layer,
+ key_layer=key_layer,
+ value_layer=value_layer,
+ attention_mask=attention_mask,
+ hidden_size_per_partition=self.hidden_size_per_partition,
+ layer_id=layer_id,
+ layer_past=layer_past,
+ use_cache=use_cache
+ )
+
+ output = self.dense(context_layer)
+
+ outputs = (output, present)
+
+ if output_attentions:
+ outputs += (attention_probs,)
+
+ return outputs # output, present, attention_probs
+
+
+class GEGLU(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.activation_fn = F.gelu
+
+ def forward(self, x):
+ # dim=-1 breaks in jit for pt<1.10
+ x1, x2 = x.chunk(2, dim=(x.ndim - 1))
+ return x1 * self.activation_fn(x2)
+
+
+class GLU(torch.nn.Module):
+ def __init__(self, hidden_size, inner_hidden_size=None,
+ layer_id=None, bias=True, activation_func=gelu, params_dtype=torch.float, empty_init=True):
+ super(GLU, self).__init__()
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+ self.layer_id = layer_id
+ self.activation_func = activation_func
+
+ # Project to 4h.
+ self.hidden_size = hidden_size
+ if inner_hidden_size is None:
+ inner_hidden_size = 4 * hidden_size
+ self.inner_hidden_size = inner_hidden_size
+ self.dense_h_to_4h = init_method(
+ torch.nn.Linear,
+ self.hidden_size,
+ self.inner_hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+ # Project back to h.
+ self.dense_4h_to_h = init_method(
+ torch.nn.Linear,
+ self.inner_hidden_size,
+ self.hidden_size,
+ bias=bias,
+ dtype=params_dtype,
+ )
+
+ def forward(self, hidden_states):
+ """
+ hidden_states: [seq_len, batch, hidden_size]
+ """
+
+ # [seq_len, batch, inner_hidden_size]
+ intermediate_parallel = self.dense_h_to_4h(hidden_states)
+
+ intermediate_parallel = self.activation_func(intermediate_parallel)
+
+ output = self.dense_4h_to_h(intermediate_parallel)
+
+ return output
+
+
+class GLMBlock(torch.nn.Module):
+ def __init__(
+ self,
+ hidden_size,
+ num_attention_heads,
+ layernorm_epsilon,
+ layer_id,
+ inner_hidden_size=None,
+ hidden_size_per_attention_head=None,
+ layernorm=LayerNorm,
+ use_bias=True,
+ params_dtype=torch.float,
+ num_layers=28,
+ position_encoding_2d=True,
+ empty_init=True
+ ):
+ super(GLMBlock, self).__init__()
+ # Set output layer initialization if not provided.
+
+ self.layer_id = layer_id
+
+ # Layernorm on the input data.
+ self.input_layernorm = layernorm(hidden_size, eps=layernorm_epsilon)
+
+ self.position_encoding_2d = position_encoding_2d
+
+ # Self attention.
+ self.attention = SelfAttention(
+ hidden_size,
+ num_attention_heads,
+ layer_id,
+ hidden_size_per_attention_head=hidden_size_per_attention_head,
+ bias=use_bias,
+ params_dtype=params_dtype,
+ position_encoding_2d=self.position_encoding_2d,
+ empty_init=empty_init
+ )
+
+ # Layernorm after the self-attention block (applied before the MLP).
+ self.post_attention_layernorm = layernorm(hidden_size, eps=layernorm_epsilon)
+
+ self.num_layers = num_layers
+
+ # GLU
+ self.mlp = GLU(
+ hidden_size,
+ inner_hidden_size=inner_hidden_size,
+ bias=use_bias,
+ layer_id=layer_id,
+ params_dtype=params_dtype,
+ empty_init=empty_init
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids,
+ attention_mask: torch.Tensor,
+ layer_id,
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: bool = False,
+ output_attentions: bool = False,
+ ):
+ """
+ hidden_states: [seq_len, batch, hidden_size]
+ attention_mask: [(1, 1), seq_len, seq_len]
+ """
+
+ # Layer norm at the beginning of the transformer layer.
+ # [seq_len, batch, hidden_size]
+ attention_input = self.input_layernorm(hidden_states)
+
+ # Self attention.
+ attention_outputs = self.attention(
+ attention_input,
+ position_ids,
+ attention_mask=attention_mask,
+ layer_id=layer_id,
+ layer_past=layer_past,
+ use_cache=use_cache,
+ output_attentions=output_attentions
+ )
+
+ attention_output = attention_outputs[0]
+
+ outputs = attention_outputs[1:]
+
+ # Residual connection.
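+ # GLM-style scaled residual: the layernorm output (not the raw block input) is used as the
+ # residual branch, scaled by alpha = sqrt(2 * num_layers).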
+ alpha = (2 * self.num_layers) ** 0.5
+ hidden_states = attention_input * alpha + attention_output
+
+ mlp_input = self.post_attention_layernorm(hidden_states)
+
+ # MLP.
+ mlp_output = self.mlp(mlp_input)
+
+ # Second residual connection.
+ output = mlp_input * alpha + mlp_output
+
+ if use_cache:
+ outputs = (output,) + outputs
+ else:
+ outputs = (output,) + outputs[1:]
+
+ return outputs # hidden_states, present, attentions
+
+
+class ChatGLMPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and
+ a simple interface for downloading and loading pretrained models.
+ """
+
+ is_parallelizable = False
+ supports_gradient_checkpointing = True
+ config_class = ChatGLMConfig
+ base_model_prefix = "transformer"
+ _no_split_modules = ["GLMBlock"]
+
+ def __init__(self, *inputs, **kwargs):
+ super().__init__(*inputs, **kwargs)
+
+ def _init_weights(self, module: nn.Module):
+ """Initialize the weights."""
+ return
+
+ def get_masks(self, input_ids, device):
+ batch_size, seq_length = input_ids.shape
+ context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
+ attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)
+ attention_mask.tril_()
+ for i, context_length in enumerate(context_lengths):
+ attention_mask[i, :, :context_length] = 1
+ attention_mask.unsqueeze_(1)
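+ # After the comparison below, True marks positions that must NOT be attended to
+ # (everything outside the causal lower triangle and the bidirectional context prefix).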
+ attention_mask = (attention_mask < 0.5).bool()
+
+ return attention_mask
+
+ def get_position_ids(self, input_ids, mask_positions, device, use_gmasks=None):
+ batch_size, seq_length = input_ids.shape
+ if use_gmasks is None:
+ use_gmasks = [False] * batch_size
+ context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
+ if self.position_encoding_2d:
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
+ for i, context_length in enumerate(context_lengths):
+ position_ids[i, context_length:] = mask_positions[i]
+ block_position_ids = [torch.cat((
+ torch.zeros(context_length, dtype=torch.long, device=device),
+ torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1
+ )) for context_length in context_lengths]
+ block_position_ids = torch.stack(block_position_ids, dim=0)
+ position_ids = torch.stack((position_ids, block_position_ids), dim=1)
+ else:
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
+ for i, context_length in enumerate(context_lengths):
+ if not use_gmasks[i]:
+ position_ids[i, context_length:] = mask_positions[i]
+
+ return position_ids
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, ChatGLMModel):
+ module.gradient_checkpointing = value
+
+
+CHATGLM_6B_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
+ usage and behavior.
+
+ Parameters:
+ config ([`~ChatGLM6BConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the configuration.
+ Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CHATGLM_6B_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`ChatGLM6BTokenizer`].
+ See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings.
+ Selected in the range `[0, config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert *input_ids* indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare ChatGLM-6B Model transformer outputting raw hidden-states without any specific head on top.",
+ CHATGLM_6B_START_DOCSTRING,
+)
+class ChatGLMModel(ChatGLMPreTrainedModel):
+ """
+
+ The model can behave as an encoder (with only self-attention) as well
+ as a decoder, in which case a layer of cross-attention is added between
+ the self-attention layers, following the architecture described in [Attention is
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani,
+ Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+ To behave as a decoder the model needs to be initialized with the
+ `is_decoder` argument of the configuration set to `True`.
+ To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder`
+ argument and `add_cross_attention` set to `True`; an
+ `encoder_hidden_states` is then expected as an input to the forward pass.
+ """
+
+ def __init__(self, config: ChatGLMConfig, empty_init=True):
+ super().__init__(config)
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+ # recording parameters
+ self.max_sequence_length = config.max_sequence_length
+ self.hidden_size = config.hidden_size
+ self.params_dtype = torch.half
+ self.num_attention_heads = config.num_attention_heads
+ self.vocab_size = config.vocab_size
+ self.num_layers = config.num_layers
+ self.layernorm_epsilon = config.layernorm_epsilon
+ self.inner_hidden_size = config.inner_hidden_size
+ self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
+ self.position_encoding_2d = config.position_encoding_2d
+ self.pre_seq_len = config.pre_seq_len
+ self.prefix_projection = config.prefix_projection
+
+ self.word_embeddings = init_method(
+ torch.nn.Embedding,
+ num_embeddings=self.vocab_size, embedding_dim=self.hidden_size,
+ dtype=self.params_dtype
+ )
+ self.gradient_checkpointing = False
+
+ def get_layer(layer_id):
+ return GLMBlock(
+ self.hidden_size,
+ self.num_attention_heads,
+ self.layernorm_epsilon,
+ layer_id,
+ inner_hidden_size=self.inner_hidden_size,
+ hidden_size_per_attention_head=self.hidden_size_per_attention_head,
+ layernorm=LayerNorm,
+ use_bias=True,
+ params_dtype=self.params_dtype,
+ position_encoding_2d=self.position_encoding_2d,
+ empty_init=empty_init
+ )
+
+ self.layers = torch.nn.ModuleList(
+ [get_layer(layer_id) for layer_id in range(self.num_layers)]
+ )
+
+ # Final layer norm before output.
+ self.final_layernorm = LayerNorm(self.hidden_size, eps=self.layernorm_epsilon)
+
+ if self.pre_seq_len is not None:
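+ # P-tuning v2: freeze the whole backbone; only the PrefixEncoder created below remains trainable.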
+ for param in self.parameters():
+ param.requires_grad = False
+ self.prefix_tokens = torch.arange(self.pre_seq_len).long()
+ self.prefix_encoder = PrefixEncoder(config)
+ self.dropout = torch.nn.Dropout(0.1)
+
+ # total_params = sum(p.numel() for p in self.parameters())
+ # trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
+ # print("Using p-tuning v2: # trainable_params = {} / {}".format(trainable_params, total_params))
+
+ def get_input_embeddings(self):
+ return self.word_embeddings
+
+ def set_input_embeddings(self, new_embeddings: torch.Tensor):
+ self.word_embeddings = new_embeddings
+
+ def get_prompt(self, batch_size, device, dtype=torch.half):
+ prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
+ past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
+ past_key_values = past_key_values.view(
+ batch_size,
+ self.pre_seq_len,
+ self.num_layers * 2,
+ self.num_attention_heads,
+ self.hidden_size // self.num_attention_heads
+ )
+ # -> [num_layers * 2, pre_seq_len, batch, nh, head_dim] after the permute below,
+ # then split into per-layer (key, value) pairs of shape [pre_seq_len, batch, nh, head_dim].
+ past_key_values = self.dropout(past_key_values)
+ past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
+ # past_key_values = [(v[0], v[1]) for v in past_key_values]
+ return past_key_values
+
+ @add_start_docstrings_to_model_forward(CHATGLM_6B_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+ inputs_embeds: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape[:2]
+ elif inputs_embeds is not None:
+ batch_size, seq_length = inputs_embeds.shape[:2]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+
+ if past_key_values is None:
+ if self.pre_seq_len is not None:
+ past_key_values = self.get_prompt(batch_size=input_ids.shape[0], device=input_ids.device,
+ dtype=inputs_embeds.dtype)
+ else:
+ past_key_values = tuple([None] * len(self.layers))
+
+ if attention_mask is None:
+ attention_mask = self.get_masks(
+ input_ids,
+ device=input_ids.device
+ )
+
+ if position_ids is None:
+ MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
+ seqs = input_ids.tolist()
+
+ mask_positions, use_gmasks = [], []
+ for seq in seqs:
+ mask_token = gMASK if gMASK in seq else MASK
+ use_gmask = mask_token == gMASK
+ mask_positions.append(seq.index(mask_token))
+ use_gmasks.append(use_gmask)
+
+ position_ids = self.get_position_ids(
+ input_ids,
+ mask_positions=mask_positions,
+ device=input_ids.device,
+ use_gmasks=use_gmasks
+ )
+
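+ # Prepend an always-visible mask for the p-tuning prefix: the all-ones tensor becomes all
+ # False after the (< 0.5) comparison, and False means "attend" in this boolean mask.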
+ if self.pre_seq_len is not None and attention_mask is not None:
+ prefix_attention_mask = torch.ones(batch_size, 1, input_ids.size(-1), self.pre_seq_len).to(
+ attention_mask.device)
+ prefix_attention_mask = (prefix_attention_mask < 0.5).bool()
+ attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=3)
+
+ # [seq_len, batch, hidden_size]
+ hidden_states = inputs_embeds.transpose(0, 1)
+
+ presents = () if use_cache else None
+ all_self_attentions = () if output_attentions else None
+ all_hidden_states = () if output_hidden_states else None
+
+ if attention_mask is None:
+ attention_mask = torch.zeros(1, 1, device=input_ids.device).bool()
+ else:
+ attention_mask = attention_mask.to(hidden_states.device)
+
+ for i, layer in enumerate(self.layers):
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+ layer_past = past_key_values[i]
+
+ if self.gradient_checkpointing and self.training:
+ layer_ret = torch.utils.checkpoint.checkpoint(
+ layer,
+ hidden_states,
+ position_ids,
+ attention_mask,
+ torch.tensor(i),
+ layer_past,
+ use_cache,
+ output_attentions
+ )
+ else:
+ layer_ret = layer(
+ hidden_states,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ layer_id=torch.tensor(i),
+ layer_past=layer_past,
+ use_cache=use_cache,
+ output_attentions=output_attentions
+ )
+
+ hidden_states = layer_ret[0]
+
+ if use_cache:
+ presents = presents + (layer_ret[1],)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_ret[2 if use_cache else 1],)
+
+ # Final layer norm.
+ hidden_states = self.final_layernorm(hidden_states)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=presents,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
+ def __init__(self, config: ChatGLMConfig, empty_init=True):
+ super().__init__(config)
+ if empty_init:
+ init_method = skip_init
+ else:
+ init_method = default_init
+
+ # self.hidden_size = config.hidden_size
+ # self.params_dtype = torch.half
+ # self.vocab_size = config.vocab_size
+ self.max_sequence_length = config.max_sequence_length
+
+ self.position_encoding_2d = config.position_encoding_2d
+
+ self.transformer = ChatGLMModel(config, empty_init=empty_init)
+
+ self.lm_head = init_method(
+ nn.Linear,
+ config.hidden_size,
+ config.vocab_size,
+ bias=False,
+ dtype=torch.half
+ )
+
+ self.config = config
+
+ self.quantized = False
+
+ if self.config.quantization_bit:
+ self.quantize(self.config.quantization_bit, empty_init=True)
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def _update_model_kwargs_for_generation(
+ self,
+ outputs: ModelOutput,
+ model_kwargs: Dict[str, Any],
+ is_encoder_decoder: bool = False,
+ standardize_cache_format: bool = False,
+ ) -> Dict[str, Any]:
+ # update past_key_values
+ model_kwargs["past_key_values"] = self._extract_past_from_model_output(
+ outputs, standardize_cache_format=standardize_cache_format
+ )
+
+ # update attention mask
+ if "attention_mask" in model_kwargs:
+ attention_mask = model_kwargs["attention_mask"]
+ if attention_mask is not None and attention_mask.dtype == torch.bool:
+ attention_mask = torch.cat(
+ [attention_mask, attention_mask.new_ones((*attention_mask.shape[:3], 1))], dim=3)
+ new_attention_mask = attention_mask[:, :, -1:].clone()
+ new_attention_mask[..., -1] = False
+ model_kwargs["attention_mask"] = torch.cat(
+ [attention_mask, new_attention_mask], dim=2
+ )
+
+ # update position ids
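+ # With 2D position encoding, only the block position (channel 1) advances during incremental
+ # decoding; the token position (channel 0) stays fixed at the mask position.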
+ if "position_ids" in model_kwargs:
+ position_ids = model_kwargs["position_ids"]
+ new_position_id = position_ids[..., -1:].clone()
+ new_position_id[:, 1, :] += 1
+ model_kwargs["position_ids"] = torch.cat(
+ [position_ids, new_position_id], dim=-1
+ )
+
+ return model_kwargs
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids: torch.LongTensor,
+ past: Optional[torch.Tensor] = None,
+ past_key_values: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ **kwargs
+ ) -> dict:
+ batch_size, seq_length = input_ids.shape
+ MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
+ seqs = input_ids.tolist()
+ mask_positions, use_gmasks = [], []
+ for seq in seqs:
+ mask_token = gMASK if gMASK in seq else MASK
+ use_gmask = mask_token == gMASK
+ mask_positions.append(seq.index(mask_token))
+ use_gmasks.append(use_gmask)
+
+ # only last token for input_ids if past is not None
+ if past is not None or past_key_values is not None:
+ last_token = input_ids[:, -1].unsqueeze(-1)
+ if attention_mask is not None and attention_mask.dtype == torch.bool:
+ attention_mask = attention_mask[:, :, -1:]
+ else:
+ attention_mask = None
+ if position_ids is not None:
+ position_ids = position_ids[..., -1:]
+ else:
+ context_lengths = [seq.index(self.config.bos_token_id) for seq in seqs]
+ if self.position_encoding_2d:
+ position_ids = torch.tensor(
+ [[mask_position, seq_length - context_length] for mask_position, context_length in
+ zip(mask_positions, context_lengths)], dtype=torch.long, device=input_ids.device).unsqueeze(-1)
+ else:
+ position_ids = torch.tensor([mask_position for mask_position in mask_positions], dtype=torch.long,
+ device=input_ids.device).unsqueeze(-1)
+
+ if past is None:
+ past = past_key_values
+ return {
+ "input_ids": last_token,
+ "past_key_values": past,
+ "position_ids": position_ids,
+ "attention_mask": attention_mask
+ }
+ else:
+ if attention_mask is not None and attention_mask.dtype != torch.bool:
+ logger.warning_once(f"The dtype of attention mask ({attention_mask.dtype}) is not bool")
+ attention_mask = None
+ if attention_mask is None:
+ attention_mask = self.get_masks(
+ input_ids,
+ device=input_ids.device
+ )
+ if position_ids is None:
+ position_ids = self.get_position_ids(
+ input_ids,
+ device=input_ids.device,
+ mask_positions=mask_positions,
+ use_gmasks=use_gmasks
+ )
+
+ return {
+ "input_ids": input_ids,
+ "past_key_values": past,
+ "position_ids": position_ids,
+ "attention_mask": attention_mask
+ }
+
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = transformer_outputs[0]
+
+ lm_logits = self.lm_head(hidden_states).permute(1, 0, 2).contiguous()
+
+ loss = None
+ if labels is not None:
+ lm_logits = lm_logits.to(torch.float32)
+
+ # Shift so that tokens < n predict n
+ shift_logits = lm_logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss(ignore_index=-100)
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+ lm_logits = lm_logits.to(hidden_states.dtype)
+ loss = loss.to(hidden_states.dtype)
+
+ if not return_dict:
+ output = (lm_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=lm_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+ @staticmethod
+ def _reorder_cache(
+ past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
+ ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
+ """
+ This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
+ [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
+ beam_idx at every generation step.
+
+ Output shares the same memory storage as `past`.
+ """
+ return tuple(
+ (
+ layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
+ layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
+ )
+ for layer_past in past
+ )
+
+ def process_response(self, response):
+ response = response.strip()
+ response = response.replace("[[训练时间]]", "2023年")
+ punkts = [
+ [",", ","],
+ ["!", "!"],
+ [":", ":"],
+ [";", ";"],
+ ["\?", "?"],
+ ]
+ for item in punkts:
+ response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
+ response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
+ return response
+
+ @torch.no_grad()
+ def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, num_beams=1,
+ do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
+ if history is None:
+ history = []
+ if logits_processor is None:
+ logits_processor = LogitsProcessorList()
+ logits_processor.append(InvalidScoreLogitsProcessor())
+ gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
+ "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+ if not history:
+ prompt = query
+ else:
+ prompt = ""
+ for i, (old_query, response) in enumerate(history):
+ prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+ prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+ inputs = tokenizer([prompt], return_tensors="pt")
+ inputs = inputs.to(self.device)
+ outputs = self.generate(**inputs, **gen_kwargs)
+ outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+ response = tokenizer.decode(outputs)
+ response = self.process_response(response)
+ history = history + [(query, response)]
+ return response, history
+
+ @torch.no_grad()
+ def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048,
+ do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
+ if history is None:
+ history = []
+ if logits_processor is None:
+ logits_processor = LogitsProcessorList()
+ logits_processor.append(InvalidScoreLogitsProcessor())
+ gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
+ "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+ if not history:
+ prompt = query
+ else:
+ prompt = ""
+ for i, (old_query, response) in enumerate(history):
+ prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+ prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+ inputs = tokenizer([prompt], return_tensors="pt")
+ inputs = inputs.to(self.device)
+ for outputs in self.stream_generate(**inputs, **gen_kwargs):
+ outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+ response = tokenizer.decode(outputs)
+ response = self.process_response(response)
+ new_history = history + [(query, response)]
+ yield response, new_history
+
+ @torch.no_grad()
+ def stream_generate(
+ self,
+ input_ids,
+ generation_config: Optional[GenerationConfig] = None,
+ logits_processor: Optional[LogitsProcessorList] = None,
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
+ prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+ **kwargs,
+ ):
+ batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
+
+ if generation_config is None:
+ generation_config = self.generation_config
+ generation_config = copy.deepcopy(generation_config)
+ model_kwargs = generation_config.update(**kwargs)
+ bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
+
+ if isinstance(eos_token_id, int):
+ eos_token_id = [eos_token_id]
+
+ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+ if has_default_max_length and generation_config.max_new_tokens is None:
+ warnings.warn(
+ f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
+ "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
+ " recommend using `max_new_tokens` to control the maximum length of the generation.",
+ UserWarning,
+ )
+ elif generation_config.max_new_tokens is not None:
+ generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
+ if not has_default_max_length:
+ logger.warning(
+ f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length` (="
+ f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+ "Please refer to the documentation for more information. "
+ "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+ )
+
+ if input_ids_seq_length >= generation_config.max_length:
+ input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+ logger.warning(
+ f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
+ f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+ " increasing `max_new_tokens`."
+ )
+
+ # 2. Set generation parameters if not already defined
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+ logits_processor = self._get_logits_processor(
+ generation_config=generation_config,
+ input_ids_seq_length=input_ids_seq_length,
+ encoder_input_ids=input_ids,
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+ logits_processor=logits_processor,
+ )
+
+ stopping_criteria = self._get_stopping_criteria(
+ generation_config=generation_config, stopping_criteria=stopping_criteria
+ )
+ logits_warper = self._get_logits_warper(generation_config)
+
+ unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+ scores = None
+ while True:
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+ # forward pass to get next token
+ outputs = self(
+ **model_inputs,
+ return_dict=True,
+ output_attentions=False,
+ output_hidden_states=False,
+ )
+
+ next_token_logits = outputs.logits[:, -1, :]
+
+ # pre-process distribution
+ next_token_scores = logits_processor(input_ids, next_token_logits)
+ next_token_scores = logits_warper(input_ids, next_token_scores)
+
+ # sample
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
+ if generation_config.do_sample:
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+ else:
+ next_tokens = torch.argmax(probs, dim=-1)
+
+ # update generated ids, model inputs, and length for next step
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+ model_kwargs = self._update_model_kwargs_for_generation(
+ outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+ )
+ # a sequence is finished once it has produced any of the EOS tokens
+ for eos in eos_token_id:
+ unfinished_sequences = unfinished_sequences.mul((next_tokens != eos).long())
+
+ # stop when each sentence is finished, or if we exceed the maximum length
+ if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+ break
+ yield input_ids
+
+ def quantize(self, bits: int, empty_init=False, **kwargs):
+ if bits == 0:
+ return
+
+ from .quantization import quantize
+
+ if self.quantized:
+ logger.info("Already quantized.")
+ return self
+
+ self.quantized = True
+
+ self.config.quantization_bit = bits
+
+ self.transformer = quantize(self.transformer, bits, empty_init=empty_init, **kwargs)
+ return self
diff --git a/ptuning/README.md b/ptuning/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e3339ce65fc91dd46c027f36d4038e97355cb535
--- /dev/null
+++ b/ptuning/README.md
@@ -0,0 +1,248 @@
+# ChatGLM-6B-PT
+This repository implements fine-tuning of the ChatGLM-6B model based on [P-Tuning v2](https://github.com/THUDM/P-tuning-v2). P-Tuning v2 reduces the number of trainable parameters to 0.1% of the full model; combined with model quantization, gradient checkpointing and other techniques, fine-tuning can run with as little as 7GB of GPU memory.
+
+The [ADGEN](https://aclanthology.org/D19-1321.pdf) (advertisement generation) dataset is used below as an example to show how to use the code.
+
+*Read this in [English](README_en.md).*
+
+## Software dependencies
+Fine-tuning requires `transformers` version 4.27.1. In addition to the ChatGLM-6B dependencies, install the following packages:
+```
+pip install rouge_chinese nltk jieba datasets
+```
+## Usage
+
+### Download the dataset
+The ADGEN task is to generate a piece of advertising copy (summary) from the input attributes (content).
+
+```json
+{
+ "content": "类型#上衣*版型#宽松*版型#显瘦*图案#线条*衣样式#衬衫*衣袖型#泡泡袖*衣款式#抽绳",
+ "summary": "这件衬衫的款式非常的宽松,利落的线条可以很好的隐藏身材上的小缺点,穿在身上有着很好的显瘦效果。领口装饰了一个可爱的抽绳,漂亮的绳结展现出了十足的个性,配合时尚的泡泡袖型,尽显女性甜美可爱的气息。"
+}
+```
+
+Download the processed ADGEN dataset from [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing) or [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1), and place the extracted `AdvertiseGen` directory under this directory.
+
+### Training
+
+#### P-tuning v2
+
+Run the following command to start training:
+```shell
+bash train.sh
+```
+`PRE_SEQ_LEN` and `LR` in `train.sh` are the soft prompt length and the learning rate; tune them for the best results. P-Tuning v2 freezes all of the original model's parameters; `quantization_bit` sets the quantization level at which the original model is loaded, and omitting it loads the model at FP16 precision.
+
+With the default configuration `quantization_bit=4`, `per_device_train_batch_size=1`, `gradient_accumulation_steps=16`, the INT4 model parameters are frozen and one training step runs 16 accumulated forward/backward passes with a batch size of 1, i.e. an effective batch size of 16; this needs as little as 6.7GB of GPU memory. To train more efficiently at the same effective batch size, increase `per_device_train_batch_size` while keeping the product of the two unchanged, at the cost of more GPU memory; adjust it to your hardware.
+
+If you want to [load the model from a local path](https://github.com/THUDM/ChatGLM-6B#%E4%BB%8E%E6%9C%AC%E5%9C%B0%E5%8A%A0%E8%BD%BD%E6%A8%A1%E5%9E%8B), change `THUDM/chatglm-6b` in `train.sh` to your local model path.
+
+#### Finetune
+
+For full-parameter fine-tuning, install [Deepspeed](https://github.com/microsoft/DeepSpeed) and then run:
+
+```shell
+bash ds_train_finetune.sh
+```
+
+### Inference
+
+Change `CHECKPOINT` in `evaluate.sh` to the checkpoint name saved during training, then run the following command for model inference and evaluation:
+```shell
+bash evaluate.sh
+```
+**[Update 2023/04/10]** During P-Tuning v2 training only the PrefixEncoder parameters are saved, so at inference time both the original ChatGLM-6B weights and the PrefixEncoder weights need to be loaded; specify the following arguments (`evaluate.sh` has been updated accordingly):
+
+```shell
+--model_name_or_path THUDM/chatglm-6b
+--ptuning_checkpoint $CHECKPOINT_PATH
+```
+
+Old checkpoints that contain all model parameters are still supported; just set `model_name_or_path` as before:
+
+```shell
+--model_name_or_path $CHECKPOINT_PATH
+```
+
+The evaluation metrics are the Chinese Rouge score and BLEU-4. The generated results are saved in
+`./output/adgen-chatglm-6b-pt-8-1e-2/generated_predictions.txt`.
+
+### Examples
+#### Example 1
+* Input: 类型#上衣\*材质#牛仔布\*颜色#白色\*风格#简约\*图案#刺绣\*衣样式#外套\*衣款式#破洞
+* Label: 简约而不简单的牛仔外套,白色的衣身十分百搭。衣身多处有做旧破洞设计,打破单调乏味,增加一丝造型看点。衣身后背处有趣味刺绣装饰,丰富层次感,彰显别样时尚。
+* Output[微调前]: 这件上衣的材质是牛仔布,颜色是白色,风格是简约,图案是刺绣,衣样式是外套,衣款式是破洞。
+* Output[微调后]: 这是一款简约的牛仔外套,破洞设计,将牛仔布破洞,带来一种随意与个性。破洞的牛仔外套,展现出时尚气息,带来一种休闲感。同时,刺绣图案,让整件外套更加立体。
+
+#### Example 2
+
+* Input: 类型#裙\*版型#显瘦\*风格#文艺\*风格#简约\*图案#印花\*图案#撞色\*裙下摆#压褶\*裙长#连衣裙\*裙领型#圆领
+* Label: 文艺个性的印花连衣裙,藏青色底蕴,低调又大气,撞色太阳花分布整个裙身,绚丽而美好,带来时尚减龄的气质。基础款的舒适圆领,简约不失大方,勾勒精致脸庞。领后是一粒包布扣固定,穿脱十分方便。前片立体的打褶设计,搭配后片压褶的做工,增添层次和空间感,显瘦又有型。
+* Output[微调前]: 类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领 1\. 连衣裙:简约风格,裙长为膝盖以上,裙领型为圆领。2\. 裙下摆:压褶设计,使裙摆呈现出流畅的褶皱效果。3\. 裙领型:裙领型为圆领,使穿上连衣裙后更加有型。4\. 版型:采用显瘦设计,让连衣裙看起来更加苗条。5\. 风格:文艺风格,让连衣裙更加有内涵和品味。6\. 图案:印花设计,在连衣裙上印有独特的图案。7\. 撞色:采用撞色设计,让连衣裙在色彩上更加鲜明、富有层次感。
+* Output[微调后]: 这是一款文艺范的连衣裙,以印花为元素,采用简约的印花,既能够突出文艺气质,又能够展现简约风。在印花的同时又有领子和裙摆的压褶设计,更加凸显文艺气质。简约而不会过于单调,搭配出街,穿着十分舒适。
+
+### Evaluation results
+
+| | Finetune | P-tuning v2 | LoRA |
+| ------------- | ----------- | ----- | ------------- |
+| BLEU-4 | 8.01 | 8.10 | 7.62 |
+| Rouge-1 | 31.23 | 31.12 | 30.60 |
+| Rouge-2 | 7.36 | 7.11 | 6.96 |
+| Rouge-l | 25.08 | 24.97 | 24.80 |
+| Training Loss | 3.00 | 3.74 | 3.32 |
+
+
+
+#### Experiment settings
+
+ ```
+max_source_length=64
+max_target_length=64
+max_steps=3000
+ ```
+
+##### P-tuning v2
+
+```
+pre_seq_len=128
+learning_rate=2e-2
+quantization_bit=4
+per_device_train_batch_size=16
+gradient_accumulation_steps=1
+```
+
+##### Finetune
+
+```
+learning_rate=1e-4
+fp16
+num_gpus=4
+per_device_train_batch_size=4
+gradient_accumulation_steps=1
+```
+
+##### LoRA
+
+The implementation follows [simple_thu_chatglm6b](https://github.com/yuanzhoulvpi2017/zero_nlp/tree/main/simple_thu_chatglm6b).
+
+```
+learning_rate=5e-4
+per_device_train_batch_size=16
+gradient_accumulation_steps=1
+```
+
+
+
+## Model deployment
+First, load the tokenizer:
+
+```python
+import os
+import torch
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+```
+
+1. To load a new checkpoint (containing only the PrefixEncoder parameters):
+
+```python
+config = AutoConfig.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, pre_seq_len=128)
+model = AutoModel.from_pretrained("THUDM/chatglm-6b", config=config, trust_remote_code=True)
+prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
+new_prefix_state_dict = {}
+for k, v in prefix_state_dict.items():
+ if k.startswith("transformer.prefix_encoder."):
+ new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
+model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
+```
+Note that you may need to change `pre_seq_len` to the value actually used during training. If you [load the model from a local path](https://github.com/THUDM/ChatGLM-6B#%E4%BB%8E%E6%9C%AC%E5%9C%B0%E5%8A%A0%E8%BD%BD%E6%A8%A1%E5%9E%8B), change `THUDM/chatglm-6b` to the local model path (note: not the checkpoint path).
+
+2. To load an old checkpoint (containing both the ChatGLM-6B and the PrefixEncoder parameters), or a checkpoint from full-parameter fine-tuning, load the whole checkpoint directly:
+
+```python
+model = AutoModel.from_pretrained(CHECKPOINT_PATH, trust_remote_code=True)
+```
+
+Afterwards the model can be quantized as needed, or used directly:
+
+```python
+# Comment out the following line if you don't use quantization
+model = model.quantize(4)
+model = model.half().cuda()
+model.transformer.prefix_encoder.float()
+model = model.eval()
+
+response, history = model.chat(tokenizer, "你好", history=[])
+```
+
+## Use your own dataset
+Change `train_file`, `validation_file` and `test_file` in `train.sh` and `evaluate.sh` to the paths of your own JSON-format dataset, and set `prompt_column` and `response_column` to the keys of the input text and output text in the JSON records. You may also need to increase `max_source_length` and `max_target_length` to match the maximum input and output lengths in your dataset. A minimal example of building such a dataset file is sketched below.
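+
+A minimal sketch, assuming a JSON Lines file with hypothetical keys `question`/`answer` and a hypothetical path `my_data/train.json`; point `--train_file`, `--prompt_column` and `--response_column` at whatever your data actually uses:
+
+```python
+import json
+
+# Hypothetical records; replace with your own data and key names.
+records = [
+    {"question": "example input text", "answer": "example output text"},
+    {"question": "another input", "answer": "another output"},
+]
+
+# train.sh / evaluate.sh would then be pointed at this file, e.g.
+#   --train_file my_data/train.json --prompt_column question --response_column answer
+with open("my_data/train.json", "w", encoding="utf-8") as f:
+    for record in records:
+        f.write(json.dumps(record, ensure_ascii=False) + "\n")
+```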
+
+## Multi-turn chat dataset
+
+To fine-tune the model on multi-turn conversation data, provide the chat history as well, for example:
+
+```json
+{
+ "prompt": "是的。上下水管都好的",
+ "response": "那就要检查线路了,一般风扇继电器是由电脑控制吸合的,如果电路存在断路,或者电脑坏了的话会出现继电器不吸合的情况!",
+ "history": [
+ [
+ "长城h3风扇不转。继电器好的。保险丝好的传感器新的风扇也新的这是为什么。就是继电器缺一个信号线",
+ "用电脑能读数据流吗?水温多少"
+ ],
+ [
+ "95",
+ "上下水管温差怎么样啊?空气是不是都排干净了呢?"
+ ]
+ ]
+}
+```
+
+During training, set `--history_column` to the key of the chat history in the data (`history` in this example); the chat history will then be concatenated automatically, as in the example and the sketch below:
+
+- Input
+
+ ```
+ [Round 0]
+ 问:长城h3风扇不转。继电器好的。保险丝好的传感器新的风扇也新的这是为什么。就是继电器缺一个信号线
+ 答:用电脑能读数据流吗?水温多少
+ [Round 1]
+ 问:95
+ 答:上下水管温差怎么样啊?空气是不是都排干净了呢?
+ [Round 2]
+ 问:是的。上下水管都好的
+ 答:
+ ```
+
+- Label
+
+ ```
+ 那就要检查线路了,一般风扇继电器是由电脑控制吸合的,如果电路存在断路,或者电脑坏了的话会出现继电器不吸合的情况!
+ ```
+
+Note that content exceeding the input length `max_source_length` will be truncated.
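+
+A minimal sketch, using a hypothetical helper `build_prompt` (not part of this repo), of how the chat history is concatenated into a single prompt; the format mirrors the one used by `ChatGLMForConditionalGeneration.chat` in the modeling code:
+
+```python
+def build_prompt(query, history):
+    """history is a list of (question, answer) pairs from previous rounds."""
+    if not history:
+        return query
+    prompt = ""
+    for i, (old_query, response) in enumerate(history):
+        prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+    prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+    return prompt
+```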
+
+You can refer to the following command:
+
+```shell
+bash train_chat.sh
+```
+
+## Citation
+
+```
+@inproceedings{liu2022p,
+ title={P-tuning: Prompt tuning can be comparable to fine-tuning across scales and tasks},
+ author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
+ booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
+ pages={61--68},
+ year={2022}
+}
+```
+
+
+
diff --git a/ptuning/README_en.md b/ptuning/README_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..9282da32c467eb17316c05b65a5522f99d149340
--- /dev/null
+++ b/ptuning/README_en.md
@@ -0,0 +1,115 @@
+# ChatGLM-6B-PT
+This repository implements fine-tuning of the ChatGLM-6B model based on [P-Tuning v2](https://github.com/THUDM/P-tuning-v2). P-Tuning v2 reduces the number of trainable parameters to 0.1% of full fine-tuning; combined with model quantization, gradient checkpointing and other techniques, it can run with as little as 7GB of GPU memory.
+
+The following uses the [ADGEN](https://aclanthology.org/D19-1321.pdf) (advertisement generation) dataset as an example to show how to use the code.
+
+## Software dependencies
+Running P-Tuning requires `transformers` version 4.27.1. In addition to the ChatGLM-6B dependencies, install the following packages:
+```
+pip install rouge_chinese nltk jieba datasets
+```
+## Instructions
+
+### Download the dataset
+The ADGEN task is to generate a piece of advertising copy (summary) from the input attributes (content).
+
+```json
+{
+ "content": "类型#上衣*版型#宽松*版型#显瘦*图案#线条*衣样式#衬衫*衣袖型#泡泡袖*衣款式#抽绳",
+ "summary": "这件衬衫的款式非常的宽松,利落的线条可以很好的隐藏身材上的小缺点,穿在身上有着很好的显瘦效果。领口装饰了一个可爱的抽绳,漂亮的绳结展现出了十足的个性,配合时尚的泡泡袖型,尽显女性甜美可爱的气息。"
+}
+```
+
+Download the processed ADGEN dataset from [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing) or [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1), and place the extracted `AdvertiseGen` directory under this directory.
+
+### Training
+Run the following commands for training:
+```shell
+bash train.sh
+```
+`PRE_SEQ_LEN` and `LR` in `train.sh` are the soft prompt length and the training learning rate; tune them for the best results. P-Tuning v2 freezes all of the original model's parameters; `quantization_bit` sets the quantization level at which the original model is loaded, and omitting it loads the model at FP16 precision.
+
+Under the default configuration `per_device_train_batch_size=1`, `gradient_accumulation_steps=16` with `quantization_bit=4`, the INT4 model parameters are frozen and one training step runs 16 accumulated forward/backward passes with a batch size of 1, i.e. an effective batch size of 16; only 6.7GB of GPU memory is required in this setting. To train more efficiently at the same effective batch size, increase `per_device_train_batch_size` while keeping the product of the two unchanged, at the cost of more GPU memory; adjust it to your hardware.
+
+### Inference
+
+Change `CHECKPOINT` in `evaluate.sh` to the checkpoint name saved during training, and run the following commands for model inference and evaluation:
+```shell
+bash evaluate.sh
+```
+
+The evaluation indicators are Chinese Rouge score and BLEU-4. The generated results are saved in
+`./output/adgen-chatglm-6b-pt-8-1e-2/generated_predictions.txt`.
+
+### Example
+#### Example 1
+* Input: 类型#上衣\*材质#牛仔布\*颜色#白色\*风格#简约\*图案#刺绣\*衣样式#外套\*衣款式#破洞
+* Label: 简约而不简单的牛仔外套,白色的衣身十分百搭。衣身多处有做旧破洞设计,打破单调乏味,增加一丝造型看点。衣身后背处有趣味刺绣装饰,丰富层次感,彰显别样时尚。
+* Output[微调前]: 这件上衣的材质是牛仔布,颜色是白色,风格是简约,图案是刺绣,衣样式是外套,衣款式是破洞。
+* Output[微调后]: 这是一款简约的牛仔外套,破洞设计,将牛仔布破洞,带来一种随意与个性。破洞的牛仔外套,展现出时尚气息,带来一种休闲感。同时,刺绣图案,让整件外套更加立体。
+
+#### Example 2
+
+* Input: 类型#裙\*版型#显瘦\*风格#文艺\*风格#简约\*图案#印花\*图案#撞色\*裙下摆#压褶\*裙长#连衣裙\*裙领型#圆领
+* Label: 文艺个性的印花连衣裙,藏青色底蕴,低调又大气,撞色太阳花分布整个裙身,绚丽而美好,带来时尚减龄的气质。基础款的舒适圆领,简约不失大方,勾勒精致脸庞。领后是一粒包布扣固定,穿脱十分方便。前片立体的打褶设计,搭配后片压褶的做工,增添层次和空间感,显瘦又有型。
+* Output[微调前]: 类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领 1\. 连衣裙:简约风格,裙长为膝盖以上,裙领型为圆领。2\. 裙下摆:压褶设计,使裙摆呈现出流畅的褶皱效果。3\. 裙领型:裙领型为圆领,使穿上连衣裙后更加有型。4\. 版型:采用显瘦设计,让连衣裙看起来更加苗条。5\. 风格:文艺风格,让连衣裙更加有内涵和品味。6\. 图案:印花设计,在连衣裙上印有独特的图案。7\. 撞色:采用撞色设计,让连衣裙在色彩上更加鲜明、富有层次感。
+* Output[微调后]: 这是一款文艺范的连衣裙,以印花为元素,采用简约的印花,既能够突出文艺气质,又能够展现简约风。在印花的同时又有领子和裙摆的压褶设计,更加凸显文艺气质。简约而不会过于单调,搭配出街,穿着十分舒适。
+
+### Evaluation results
+
+| | P-tuning v2 | LoRA |
+| ------- | ----------- | ----- |
+| BLEU-4 | 7.71 | 6.13 |
+| Rouge-1 | 31.35 | 28.36 |
+| Rouge-2 | 7.19 | 4.38 |
+| Rouge-l | 25.17 | 17.54 |
+
+#### Experiment Settings
+
+ ```
+max_source_length=64
+max_target_length=64
+per_device_train_batch_size=1
+gradient_accumulation_steps=16
+max_steps=3000
+ ```
+
+##### P-tuning v2
+
+```
+pre_seq_len=128
+learning_rate=2e-2
+quantization_bit=4
+```
+
+##### LoRA
+
+```
+learning_rate=5e-4
+```
+
+The implementation uses [simple_thu_chatglm6b](https://github.com/yuanzhoulvpi2017/zero_nlp/tree/main/simple_thu_chatglm6b)
+
+
+
+## Model Deployment
+Replace `THUDM/chatglm-6b` in the corresponding demo or code with the path of the P-Tuned checkpoint (in the example, `./output/adgen-chatglm-6b-pt-8-1e-2/checkpoint-3000`). Note that the current fine-tuning does not support multi-turn data, so only the response from the first round of a conversation is used for fine-tuning. A loading sketch is shown below.
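+
+A minimal loading sketch under the assumption above (the checkpoint contains the full model weights); `CHECKPOINT_PATH` is a placeholder. If your checkpoint only contains the PrefixEncoder weights, load the original `THUDM/chatglm-6b` model first and then load the prefix weights on top, as shown in the P-Tuning loading snippet in `README.md`.
+
+```python
+from transformers import AutoModel, AutoTokenizer
+
+CHECKPOINT_PATH = "./output/adgen-chatglm-6b-pt-8-1e-2/checkpoint-3000"  # placeholder
+
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+model = AutoModel.from_pretrained(CHECKPOINT_PATH, trust_remote_code=True).half().cuda()
+model = model.eval()
+
+response, history = model.chat(tokenizer, "你好", history=[])
+print(response)
+```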
+
+## Use your own dataset
+Modify `train_file`, `validation_file` and `test_file` in `train.sh` and `evaluate.sh` to your own JSON format dataset paths, and change `prompt_column` and `response_column` to the keys in the JSON file corresponding to input text and output text.
+
+## TODO
+* [ ] Support for chat data
+* [ ] Support for full finetuning
+
+## Citation
+
+```
+@inproceedings{liu2022p,
+ title={P-tuning: Prompt tuning can be comparable to fine-tuning across scales and tasks},
+ author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
+ booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
+ pages={61--68},
+ year={2022}
+}
+```
\ No newline at end of file
diff --git a/ptuning/__pycache__/arguments.cpython-39.pyc b/ptuning/__pycache__/arguments.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45a3858465d9743b73792b89a21d775d7e2f6030
Binary files /dev/null and b/ptuning/__pycache__/arguments.cpython-39.pyc differ
diff --git a/ptuning/__pycache__/trainer.cpython-39.pyc b/ptuning/__pycache__/trainer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef0baee25dc423ef00d450802449d5acf3940600
Binary files /dev/null and b/ptuning/__pycache__/trainer.cpython-39.pyc differ
diff --git a/ptuning/__pycache__/trainer_seq2seq.cpython-39.pyc b/ptuning/__pycache__/trainer_seq2seq.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..204f727f1d08d4b684c91e9f686e4bfad9291830
Binary files /dev/null and b/ptuning/__pycache__/trainer_seq2seq.cpython-39.pyc differ
diff --git a/ptuning/arguments.py b/ptuning/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..fda1f3522261f50768984402d9ac691557ea63f3
--- /dev/null
+++ b/ptuning/arguments.py
@@ -0,0 +1,224 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+@dataclass
+class ModelArguments:
+ """
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+ """
+
+ model_name_or_path: str = field(
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+ )
+ ptuning_checkpoint: str = field(
+ default=None, metadata={"help": "Path to p-tuning v2 checkpoints"}
+ )
+ config_name: Optional[str] = field(
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+ )
+ tokenizer_name: Optional[str] = field(
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+ )
+ cache_dir: Optional[str] = field(
+ default=None,
+ metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+ )
+ use_fast_tokenizer: bool = field(
+ default=True,
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+ )
+ model_revision: str = field(
+ default="main",
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+ )
+ use_auth_token: bool = field(
+ default=False,
+ metadata={
+ "help": (
+ "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
+ "with private models)."
+ )
+ },
+ )
+ resize_position_embeddings: Optional[bool] = field(
+ default=None,
+ metadata={
+ "help": (
+ "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
+ "the model's position embeddings."
+ )
+ },
+ )
+ quantization_bit: Optional[int] = field(
+ default=None
+ )
+ pre_seq_len: Optional[int] = field(
+ default=None
+ )
+ prefix_projection: bool = field(
+ default=False
+ )
+
+
+@dataclass
+class DataTrainingArguments:
+ """
+ Arguments pertaining to what data we are going to input our model for training and eval.
+ """
+
+ lang: Optional[str] = field(default=None, metadata={"help": "Language id for summarization."})
+
+ dataset_name: Optional[str] = field(
+ default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+ )
+ dataset_config_name: Optional[str] = field(
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+ )
+ prompt_column: Optional[str] = field(
+ default=None,
+ metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
+ )
+ response_column: Optional[str] = field(
+ default=None,
+ metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
+ )
+ history_column: Optional[str] = field(
+ default=None,
+ metadata={"help": "The name of the column in the datasets containing the history of chat."},
+ )
+ train_file: Optional[str] = field(
+ default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
+ )
+ validation_file: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": (
+ "An optional input evaluation data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
+ )
+ },
+ )
+ test_file: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "An optional input test data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
+ },
+ )
+ overwrite_cache: bool = field(
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+ )
+ preprocessing_num_workers: Optional[int] = field(
+ default=None,
+ metadata={"help": "The number of processes to use for the preprocessing."},
+ )
+ max_source_length: Optional[int] = field(
+ default=1024,
+ metadata={
+ "help": (
+ "The maximum total input sequence length after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ )
+ },
+ )
+ max_target_length: Optional[int] = field(
+ default=128,
+ metadata={
+ "help": (
+ "The maximum total sequence length for target text after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ )
+ },
+ )
+ val_max_target_length: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "The maximum total sequence length for validation target text after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
+ "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
+ "during ``evaluate`` and ``predict``."
+ )
+ },
+ )
+ pad_to_max_length: bool = field(
+ default=False,
+ metadata={
+ "help": (
+ "Whether to pad all samples to model maximum sentence length. "
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
+ "efficient on GPU but very bad for TPU."
+ )
+ },
+ )
+ max_train_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
+ "value if set."
+ )
+ },
+ )
+ max_eval_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+ "value if set."
+ )
+ },
+ )
+ max_predict_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+ "value if set."
+ )
+ },
+ )
+ num_beams: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
+ "which is used during ``evaluate`` and ``predict``."
+ )
+ },
+ )
+ ignore_pad_token_for_loss: bool = field(
+ default=True,
+ metadata={
+ "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
+ },
+ )
+ source_prefix: Optional[str] = field(
+ default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
+ )
+
+ forced_bos_token: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": (
+ "The token to force as the first generated token after the decoder_start_token_id. "
+ "Useful for multilingual models like mBART where the first generated token needs to be "
+ "the target language token."
+ )
+ },
+ )
+
+ def __post_init__(self):
+ if self.dataset_name is None and self.train_file is None and self.validation_file is None and self.test_file is None:
+ raise ValueError("Need either a dataset name or a training/validation/test file.")
+ else:
+ if self.train_file is not None:
+ extension = self.train_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+ if self.validation_file is not None:
+ extension = self.validation_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+ if self.val_max_target_length is None:
+ self.val_max_target_length = self.max_target_length
+
diff --git a/ptuning/deepspeed.json b/ptuning/deepspeed.json
new file mode 100644
index 0000000000000000000000000000000000000000..798932966f38b2df8a468c72a4b41d8b47033ccc
--- /dev/null
+++ b/ptuning/deepspeed.json
@@ -0,0 +1,21 @@
+{
+ "train_micro_batch_size_per_gpu": "auto",
+ "zero_allow_untested_optimizer": true,
+ "fp16": {
+ "enabled": "auto",
+ "loss_scale": 0,
+ "initial_scale_power": 16,
+ "loss_scale_window": 1000,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "zero_optimization": {
+ "stage": 2,
+ "allgather_partitions": true,
+ "allgather_bucket_size": 5e8,
+ "overlap_comm": false,
+ "reduce_scatter": true,
+ "reduce_bucket_size": 5e8,
+ "contiguous_gradients" : true
+ }
+}
\ No newline at end of file
diff --git a/ptuning/ds_train_finetune.sh b/ptuning/ds_train_finetune.sh
new file mode 100644
index 0000000000000000000000000000000000000000..531a8004dbed00819aa767c420cdc483e7c0abed
--- /dev/null
+++ b/ptuning/ds_train_finetune.sh
@@ -0,0 +1,28 @@
+
+LR=1e-4
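+# Full-parameter fine-tuning (no --pre_seq_len / --quantization_bit): runs with DeepSpeed ZeRO stage 2 (see deepspeed.json) on 4 GPUs.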
+
+MASTER_PORT=$(shuf -n 1 -i 10000-65535)
+
+deepspeed --num_gpus=4 --master_port $MASTER_PORT main.py \
+ --deepspeed deepspeed.json \
+ --do_train \
+ --train_file AdvertiseGen/train.json \
+ --test_file AdvertiseGen/dev.json \
+ --prompt_column content \
+ --response_column summary \
+ --overwrite_cache \
+ --model_name_or_path THUDM/chatglm-6b \
+ --output_dir ./output/adgen-chatglm-6b-ft-$LR \
+ --overwrite_output_dir \
+ --max_source_length 64 \
+ --max_target_length 64 \
+ --per_device_train_batch_size 4 \
+ --per_device_eval_batch_size 1 \
+ --gradient_accumulation_steps 1 \
+ --predict_with_generate \
+ --max_steps 5000 \
+ --logging_steps 10 \
+ --save_steps 1000 \
+ --learning_rate $LR \
+ --fp16
+
diff --git a/ptuning/evaluate.sh b/ptuning/evaluate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ab855367009f472c84d095b62b3c3d49a0c5518c
--- /dev/null
+++ b/ptuning/evaluate.sh
@@ -0,0 +1,21 @@
+PRE_SEQ_LEN=128
+CHECKPOINT=adgen-chatglm-6b-pt-128-2e-2
+STEP=3000
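+# Runs prediction with a P-tuning v2 checkpoint: the base THUDM/chatglm-6b weights plus the trained prefix encoder loaded via --ptuning_checkpoint, quantized to 4 bits.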
+
+CUDA_VISIBLE_DEVICES=0 python3 main.py \
+ --do_predict \
+ --validation_file AdvertiseGen/dev.json \
+ --test_file AdvertiseGen/dev.json \
+ --overwrite_cache \
+ --prompt_column content \
+ --response_column summary \
+ --model_name_or_path THUDM/chatglm-6b \
+ --ptuning_checkpoint ./output/$CHECKPOINT/checkpoint-$STEP \
+ --output_dir ./output/$CHECKPOINT \
+ --overwrite_output_dir \
+ --max_source_length 64 \
+ --max_target_length 64 \
+ --per_device_eval_batch_size 1 \
+ --predict_with_generate \
+ --pre_seq_len $PRE_SEQ_LEN \
+ --quantization_bit 4
diff --git a/ptuning/evaluate_finetune.sh b/ptuning/evaluate_finetune.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e275c3cbbec9ee65ad5e4a958a0ea52c248964c4
--- /dev/null
+++ b/ptuning/evaluate_finetune.sh
@@ -0,0 +1,18 @@
+CHECKPOINT=adgen-chatglm-6b-ft-1e-4
+STEP=3000
+
+CUDA_VISIBLE_DEVICES=0 python3 main.py \
+ --do_predict \
+ --validation_file AdvertiseGen/dev.json \
+ --test_file AdvertiseGen/dev.json \
+ --overwrite_cache \
+ --prompt_column content \
+ --response_column summary \
+ --model_name_or_path ./output/$CHECKPOINT/checkpoint-$STEP \
+ --output_dir ./output/$CHECKPOINT \
+ --overwrite_output_dir \
+ --max_source_length 256 \
+ --max_target_length 256 \
+ --per_device_eval_batch_size 1 \
+ --predict_with_generate \
+ --fp16_full_eval
diff --git a/ptuning/main.py b/ptuning/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..43ecdf814352cf388bd9cb319b9938d33737d59d
--- /dev/null
+++ b/ptuning/main.py
@@ -0,0 +1,431 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for sequence to sequence.
+"""
+# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
+
+import logging
+import os
+import sys
+import json
+
+import numpy as np
+from datasets import load_dataset
+import jieba
+from rouge_chinese import Rouge
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+import torch
+
+import transformers
+from transformers import (
+ AutoConfig,
+ AutoModel,
+ AutoTokenizer,
+ DataCollatorForSeq2Seq,
+ HfArgumentParser,
+ Seq2SeqTrainingArguments,
+ set_seed,
+)
+from trainer_seq2seq import Seq2SeqTrainer
+
+from arguments import ModelArguments, DataTrainingArguments
+
+logger = logging.getLogger(__name__)
+
+def main():
+
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+ # If we pass only one argument to the script and it's the path to a json file,
+ # let's parse it to get our arguments.
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+ else:
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+ # Setup logging
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ handlers=[logging.StreamHandler(sys.stdout)],
+ )
+
+ if training_args.should_log:
+ # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+ transformers.utils.logging.set_verbosity_info()
+
+ log_level = training_args.get_process_log_level()
+ logger.setLevel(log_level)
+ # datasets.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.enable_default_handler()
+ transformers.utils.logging.enable_explicit_format()
+
+ # Log on each process the small summary:
+ logger.warning(
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+ )
+ logger.info(f"Training/evaluation parameters {training_args}")
+
+ # Set seed before initializing model.
+ set_seed(training_args.seed)
+
+ # Load dataset
+ data_files = {}
+ if data_args.train_file is not None:
+ data_files["train"] = data_args.train_file
+ extension = data_args.train_file.split(".")[-1]
+ if data_args.validation_file is not None:
+ data_files["validation"] = data_args.validation_file
+ extension = data_args.validation_file.split(".")[-1]
+ if data_args.test_file is not None:
+ data_files["test"] = data_args.test_file
+ extension = data_args.test_file.split(".")[-1]
+
+ raw_datasets = load_dataset(
+ extension,
+ data_files=data_files,
+ cache_dir=model_args.cache_dir,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+
+ # Load pretrained model and tokenizer
+ config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
+ config.pre_seq_len = model_args.pre_seq_len
+ config.prefix_projection = model_args.prefix_projection
+
+ tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
+
+ if model_args.ptuning_checkpoint is not None:
+ # Evaluation
+ # Loading extra state dict of prefix encoder
+ model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)
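+ # The p-tuning checkpoint stores only the prefix encoder weights; strip the "transformer.prefix_encoder." key prefix and load them into the freshly loaded base model.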
+ prefix_state_dict = torch.load(os.path.join(model_args.ptuning_checkpoint, "pytorch_model.bin"))
+ new_prefix_state_dict = {}
+ for k, v in prefix_state_dict.items():
+ if k.startswith("transformer.prefix_encoder."):
+ new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
+ model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
+ else:
+ model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)
+
+ if model_args.quantization_bit is not None:
+ print(f"Quantized to {model_args.quantization_bit} bit")
+ model = model.quantize(model_args.quantization_bit)
+ if model_args.pre_seq_len is not None:
+ # P-tuning v2
+ model = model.half()
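+ # Keep the trainable prefix encoder in fp32 while the frozen backbone runs in fp16.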
+ model.transformer.prefix_encoder.float()
+ else:
+ # Finetune
+ model = model.float()
+
+ prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
+
+ # Preprocessing the datasets.
+ # We need to tokenize inputs and targets.
+ if training_args.do_train:
+ column_names = raw_datasets["train"].column_names
+ elif training_args.do_eval:
+ column_names = raw_datasets["validation"].column_names
+ elif training_args.do_predict:
+ column_names = raw_datasets["test"].column_names
+ else:
+ logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
+ return
+
+ # Get the column names for input/target.
+ prompt_column = data_args.prompt_column
+ response_column = data_args.response_column
+ history_column = data_args.history_column
+
+ # Temporarily set max_target_length for training.
+ max_target_length = data_args.max_target_length
+
+ def preprocess_function_eval(examples):
+ inputs, targets = [], []
+ for i in range(len(examples[prompt_column])):
+ if examples[prompt_column][i] and examples[response_column][i]:
+ query = examples[prompt_column][i]
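+ # Multi-turn examples are flattened into ChatGLM's "[Round i]" chat prompt format below.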
+ if history_column is None or len(examples[history_column][i]) == 0:
+ prompt = query
+ else:
+ prompt = ""
+ history = examples[history_column][i]
+ for turn_idx, (old_query, response) in enumerate(history):
+ prompt += "[Round {}]\n问:{}\n答:{}\n".format(turn_idx, old_query, response)
+ prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+ inputs.append(prompt)
+ targets.append(examples[response_column][i])
+
+ inputs = [prefix + inp for inp in inputs]
+ model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, truncation=True, padding=True)
+ labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
+
+ if data_args.ignore_pad_token_for_loss:
+ labels["input_ids"] = [
+ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
+ ]
+ model_inputs["labels"] = labels["input_ids"]
+
+ return model_inputs
+
+ def preprocess_function_train(examples):
+ max_seq_length = data_args.max_source_length + data_args.max_target_length
+
+ model_inputs = {
+ "input_ids": [],
+ "labels": [],
+ }
+ for i in range(len(examples[prompt_column])):
+ if examples[prompt_column][i] and examples[response_column][i]:
+ query, answer = examples[prompt_column][i], examples[response_column][i]
+
+ if history_column is None:
+ prompt = query
+ else:
+ prompt = ""
+ history = examples[history_column][i]
+ for turn_idx, (old_query, response) in enumerate(history):
+ prompt += "[Round {}]\n问:{}\n答:{}\n".format(turn_idx, old_query, response)
+ prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+
+ prompt = prefix + prompt
+ a_ids = tokenizer.encode(text=prompt, add_special_tokens=False)
+ b_ids = tokenizer.encode(text=answer, add_special_tokens=False)
+
+ if len(a_ids) > data_args.max_source_length - 1:
+ a_ids = a_ids[: data_args.max_source_length - 1]
+
+ if len(b_ids) > data_args.max_target_length - 2:
+ b_ids = b_ids[: data_args.max_target_length - 2]
+
+ input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids)
+
+ context_length = input_ids.index(tokenizer.bos_token_id)
+ mask_position = context_length - 1
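+ # Mask the prompt tokens with -100 so the loss is computed only on the response (everything from the bos token onward).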
+ labels = [-100] * context_length + input_ids[mask_position+1:]
+
+ pad_len = max_seq_length - len(input_ids)
+ input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
+ labels = labels + [tokenizer.pad_token_id] * pad_len
+ if data_args.ignore_pad_token_for_loss:
+ labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]
+
+ model_inputs["input_ids"].append(input_ids)
+ model_inputs["labels"].append(labels)
+
+ return model_inputs
+
+ def print_dataset_example(example):
+ print("input_ids",example["input_ids"])
+ print("inputs", tokenizer.decode(example["input_ids"]))
+ print("label_ids", example["labels"])
+ print("labels", tokenizer.decode(example["labels"]))
+
+ if training_args.do_train:
+ if "train" not in raw_datasets:
+ raise ValueError("--do_train requires a train dataset")
+ train_dataset = raw_datasets["train"]
+ if data_args.max_train_samples is not None:
+ max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+ train_dataset = train_dataset.select(range(max_train_samples))
+ with training_args.main_process_first(desc="train dataset map pre-processing"):
+ train_dataset = train_dataset.map(
+ preprocess_function_train,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on train dataset",
+ )
+ print_dataset_example(train_dataset[0])
+
+ if training_args.do_eval:
+ max_target_length = data_args.val_max_target_length
+ if "validation" not in raw_datasets:
+ raise ValueError("--do_eval requires a validation dataset")
+ eval_dataset = raw_datasets["validation"]
+ if data_args.max_eval_samples is not None:
+ max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+ eval_dataset = eval_dataset.select(range(max_eval_samples))
+ with training_args.main_process_first(desc="validation dataset map pre-processing"):
+ eval_dataset = eval_dataset.map(
+ preprocess_function_eval,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on validation dataset",
+ )
+ print_dataset_example(eval_dataset[0])
+
+ if training_args.do_predict:
+ max_target_length = data_args.val_max_target_length
+ if "test" not in raw_datasets:
+ raise ValueError("--do_predict requires a test dataset")
+ predict_dataset = raw_datasets["test"]
+ if data_args.max_predict_samples is not None:
+ max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+ predict_dataset = predict_dataset.select(range(max_predict_samples))
+ with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+ predict_dataset = predict_dataset.map(
+ preprocess_function_eval,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on prediction dataset",
+ )
+ print_dataset_example(predict_dataset[0])
+
+ # Data collator
+ label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
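+ # padding=False: features are already padded during preprocessing (to max_seq_length for training, to the longest sequence for evaluation).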
+ data_collator = DataCollatorForSeq2Seq(
+ tokenizer,
+ model=model,
+ label_pad_token_id=label_pad_token_id,
+ pad_to_multiple_of=None,
+ padding=False
+ )
+
+ # Metric
+ def compute_metrics(eval_preds):
+ preds, labels = eval_preds
+ if isinstance(preds, tuple):
+ preds = preds[0]
+ decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+ if data_args.ignore_pad_token_for_loss:
+ # Replace -100 in the labels as we can't decode them.
+ labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+ decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+ score_dict = {
+ "rouge-1": [],
+ "rouge-2": [],
+ "rouge-l": [],
+ "bleu-4": []
+ }
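+ # ROUGE is computed over jieba word segments; BLEU-4 is computed over raw characters.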
+ for pred, label in zip(decoded_preds, decoded_labels):
+ hypothesis = list(jieba.cut(pred))
+ reference = list(jieba.cut(label))
+ rouge = Rouge()
+ scores = rouge.get_scores(' '.join(hypothesis), ' '.join(reference))
+ result = scores[0]
+
+ for k, v in result.items():
+ score_dict[k].append(round(v["f"] * 100, 4))
+ bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
+ score_dict["bleu-4"].append(round(bleu_score * 100, 4))
+
+ for k, v in score_dict.items():
+ score_dict[k] = float(np.mean(v))
+ return score_dict
+
+ # Override the decoding parameters of Seq2SeqTrainer
+ training_args.generation_max_length = (
+ training_args.generation_max_length
+ if training_args.generation_max_length is not None
+ else data_args.val_max_target_length
+ )
+ training_args.generation_num_beams = (
+ data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams
+ )
+ # Initialize our Trainer
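+ # save_prefixencoder tells the customized Seq2SeqTrainer to checkpoint only the prefix encoder weights when P-tuning v2 is used.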
+ trainer = Seq2SeqTrainer(
+ model=model,
+ args=training_args,
+ train_dataset=train_dataset if training_args.do_train else None,
+ eval_dataset=eval_dataset if training_args.do_eval else None,
+ tokenizer=tokenizer,
+ data_collator=data_collator,
+ compute_metrics=compute_metrics if training_args.predict_with_generate else None,
+ save_prefixencoder=model_args.pre_seq_len is not None
+ )
+
+ # Training
+ if training_args.do_train:
+ checkpoint = None
+ if training_args.resume_from_checkpoint is not None:
+ checkpoint = training_args.resume_from_checkpoint
+ # elif last_checkpoint is not None:
+ # checkpoint = last_checkpoint
+ model.gradient_checkpointing_enable()
+ model.enable_input_require_grads()
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
+ # trainer.save_model() # Saves the tokenizer too for easy upload
+
+ metrics = train_result.metrics
+ max_train_samples = (
+ data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+ )
+ metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+ trainer.log_metrics("train", metrics)
+ trainer.save_metrics("train", metrics)
+ trainer.save_state()
+
+ # Evaluation
+ results = {}
+ if training_args.do_eval:
+ logger.info("*** Evaluate ***")
+ metrics = trainer.evaluate(metric_key_prefix="eval", do_sample=True, top_p=0.7, max_length=512, temperature=0.95)
+ max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+ metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+ trainer.log_metrics("eval", metrics)
+ trainer.save_metrics("eval", metrics)
+
+ if training_args.do_predict:
+ logger.info("*** Predict ***")
+
+ predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict", max_length=512, do_sample=True, top_p=0.7, temperature=0.95)
+ metrics = predict_results.metrics
+ max_predict_samples = (
+ data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
+ )
+ metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
+
+ trainer.log_metrics("predict", metrics)
+ trainer.save_metrics("predict", metrics)
+
+ if trainer.is_world_process_zero():
+ if training_args.predict_with_generate:
+ predictions = tokenizer.batch_decode(
+ predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
+ )
+ predictions = [pred.strip() for pred in predictions]
+ labels = tokenizer.batch_decode(
+ predict_results.label_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
+ )
+ labels = [label.strip() for label in labels]
+ output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
+ with open(output_prediction_file, "w", encoding="utf-8") as writer:
+ for p, l in zip(predictions, labels):
+ res = json.dumps({"labels": l, "predict": p}, ensure_ascii=False)
+ writer.write(f"{res}\n")
+ return results
+
+
+def _mp_fn(index):
+ # For xla_spawn (TPUs)
+ main()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/ptuning/train.json b/ptuning/train.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf24b9a3fc2fe7a93ee944d4e2a81c0495b8cd28
--- /dev/null
+++ b/ptuning/train.json
@@ -0,0 +1,100 @@
+{"content": "西宁市城中区人民检察院指控:2016年9月初,被告人马某、黄4某、何某、韦某、庄某(五人均被另案起诉)、黄3某(另案处理)、黄1某等人明知电信诈骗人员(具体身份不详)实施诈骗犯罪,为获取报酬,事前为电信诈骗人员(具体身份不详)提供银行账户,在钱款到账后帮助电信诈骗人员(具体身份不详)支取诈骗钱款。参与犯罪事实如下:\n2016年9月8日11时,青海河海水利水电设计有限公司出纳陈某通过自用手机登陆后加入名称为水利水电设计高层的QQ群(群号为309184009),群中一名名称显示为公司总经理隆某的人加其为好友,后陈某被该名称为隆某的人告知因出差不方便打电话需要在QQ上安排工作事宜,其让陈某电话联系了一个叫张某的人,以需要支付合同保证金为由,让陈某将公司账户中的89万元分九次转账至张某的农业银行账户(账号×××)。当日14时许,QQ名为隆某的人再次让陈某电话联系了一个名为王某的人,以借款为由,让陈某将公司账户中的50万元分五次转账至给王某账户(账号×××)。经查,陈某转账至张某账户中的89万元于当日又被转至被告人韦某持有的名为李2某的账户(账号×××),89万元被被告人韦某、何某、黄4某、马某(四人均被另案起诉)、黄3某(另案处理)等人控制并支取60万元。转至王某账户内的50万元于当日又被转至名为庄某的银行账户(账号×××),当日,庄某(已另案起诉)在被告人黄1某的安排下,将50万元钱款支取后交给了黄1某。\n针对上述指控,公诉机关当庭出示了报案材料及被害人的陈述、证人证言、被告人供述、扣押物品清单、扣押笔录、银行交易记录、通话记录、破案经过、辨认笔录、情况说明、户籍证明等证据,以证实其指控的犯罪事实。\n该院认为,被告人黄1某伙同庄某(另案起诉)等人,明知电信诈骗人员实施诈骗,事前商议取款报酬并提前为他人提供银行账号,后帮助将诈骗钱款转移并支取,涉案金额达50万元,数额特别巨大,其行为已触犯《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实、充分,应当以××追究其刑事责任。提请本院依法判处。\n", "summary": "根据刑法第266条,判处黄1某诈骗罪"}
+{"content": "四川省宜宾市翠屏区人民检察院指控:2000年12月,被告人黄某从四川省长宁县建委调到长宁竹某管理局工作,先后任竹某管理局规划建设处副处长、处长。2008年6月,被告人黄某调到宜宾市临港经济技术开发区工作,先后担任宜宾市开发投资公司董事长、宜宾市临港经济技术开发区发展策划投资服务局副局长、宜宾临港建设有限公司总经理等职。\n被告人黄某在担任上述职务期间,为他人谋取利益,非法收受他人贿赂共计22万某,具体犯罪事实如下:\n一、2002年-2005年期间,黄某以长宁卧龙建司、古典建司名义承建了长宁竹某墨溪路面、翡翠长廊、停车场、游山道、游客中心等工程。期间,黄某为了感谢被告人黄某在施工、结算工作中的关照,先后三次共送给被告人黄某现金7万某。被告人黄某非法收受后用于日常开支。\n二、2006年初,邹某以长宁兴宁某名义承建了长宁竹某风景区房屋立面改造工程。工程完工后,邹某为了感谢被告人黄某在施工过程中的关照,送给黄某现金3万某。被告人黄某非法收受后用于日常开支。\n三、2007年-2010年期间,蒋某以四川华蓥建工集团等公司名义承建了宜宾市白某工业园龙顺花园小区、宜宾临港经济技术开发区第二办公区装修改造工程。业主方系宜宾市开发投资公司。2008年6月,被告人黄某任宜宾市开发投资公司董事长后,蒋某多次找到被告人黄某帮忙,希望在施工、结算上给与关照,先后四次共送给被告人黄某现金10万某。被告人黄某非法收受后用于日常开支。\n四、2015年8月,被告人黄某兼任宜宾临港经济技术开发区项目运营管理中心项目策划部总经理。2015年底,项目运营管理中心以宜宾临港建设有限公司名义为项目运营中心策划部招录一名工作人员。李某1报考了该岗位并进入面试,李某1的父亲李某2为了让李某1顺利被录取遂找到刘某帮忙,刘某找到被告人黄某帮忙,通过被告人黄某的帮助,李某1顺利被录取上班。2016年初,李某2为了感谢黄某帮助,通过刘某送给被告人黄某现金1万某。被告人黄某非法收受后用于日常开支。2016年1月底,被告人黄某任临港建设有限公司总经理,4月,李某1调到临港建设有限公司设计部工作。李某1的父亲李某2希望被告人黄某在工作上多关照李某1,于2016年8月再次通过刘某送给被告人黄某现金1万某。被告人黄某非法收受后用于日常开支。\n2017年6月1日23时许,被告人黄某接受本院反贪局电话通知后主动到案,并如实供述了本院反贪局尚未掌握的上述犯罪事实。\n案发后,被告人家属代其退缴了上述犯罪所得。\n", "summary": "根据刑法第385条第383条第386条,判处黄某受贿罪"}
+{"content": "经二审审理查明,2016年9月10日上诉人李某以承包工程需要用车为由,在彭阳县城与宁夏德洋汽车租赁有限公司签订了《车辆租赁合同》,约定每辆车每天支付租金260元,租赁了???号?北京现代?越野车1辆。2016年12月中旬的一天,上诉人李某向苟某某的表弟借款,并私自将租赁的???号?北京现代?越野车质押给了苟某某的表弟。\n2016年10月2日上诉人李某又在宁夏德洋汽车租赁有限公司租赁???号?丰某?轿车1辆,2016年11月29日,上诉人李某在固原市原州区向申某某借款,并私自将租赁的???号?丰某?轿车质押给了申某某。\n2016年12月2日上诉人李某又在宁夏德洋汽车租赁有限公司租赁???号?本田雅阁?轿车1辆,当日在固原市原州区向李某甲借款,并私自将租赁的???号?本田雅阁?轿车质押了给李某甲。\n租赁期间,上诉人李某给宁夏德洋汽车租赁有限公司支付3辆车的租金共计2.85万元。租赁期限届满后,上诉人李某未按约定归还租赁车辆,并为躲避催收,于2017年3月14日将其预留的?158XXXXXXXX?手机号码办理了停机业务。宁夏德洋汽车租赁有限公司多次联系上诉人李某要求归还租赁车辆未果后,遂报警。案发后,涉案车辆已全部被追回,并发还宁夏德洋汽车租赁有限公司。经鉴定,???号?北京现代?越野车价值5.6万元、???号?丰某?轿车价值9万元、???号?本田雅阁?轿车价值5.8万元。\n另查明,二审法院审理期间,上诉人李某家属代为赔偿宁夏德洋汽车租赁有限公司经济损失,宁夏德洋汽车租赁有限公司对上诉人李某的犯罪行为予以谅解。\n上述事实有经一二审举证、并经质证的下列证据予以证实:\n1.上诉人李某身份信息,证明上诉人李某身份信息,犯罪时达到刑事责任年龄的事实。\n2.受案登记表、立案决定书,证明张某某报案称其将三辆车出租给李某,车辆下落不明,也联系不到李某,2017年1月31日公安机关决定受理初查,2017年2月23日决定以××立案侦查的事实。\n3.归案情况说明、临时羁押证明,证明2017年3月29日,兰州铁路公安局银川公安处刑警支队在宁夏银川市??区星期八烧烤店内将在逃人员李某抓获。2017年3月30日至2017年3月31日临时羁押于银川市看守所的事实。\n4.车辆租赁合同3份、上诉人的驾驶证、身份证复印件,证明2016年9月10日宁夏德洋汽车租赁有限公司出租给李某???现代越野车1辆,租赁期限至2016年11月30日,每天租赁费260元。2016年10月2日宁夏德洋汽车租赁有限公司出租给李某???丰某轿车,租赁期限至2016年11月30日,每天租赁费260元。2016年12月2日宁夏德洋汽车租赁有限公司出租给李某???本田雅阁轿车1辆,租赁期限至2016年12月4日,每天租赁费260元的事实。\n5.机动车登记证书复印件3份,证明???北京现代越野车出厂日期为2010年2月27日,???本田雅阁轿车出厂日期为2006年2月15日,???丰某轿车出厂日期为2009年12月15日,三辆机动车所有人是张某某的事实。\n6.借条2份、收条、二手车交易协议书,证明2016年11月29日李某向申某某借款25000元,借款期限至2016年12月8日。2016年11月29日李某与申某某签订???丰某轿车买卖协议的事实。\n7.彭阳县移动公司西门营业厅证明,证明2017年3月14日客户李某移动手机号码158XXXXXXXX在移动公司办理停机业务的事实。\n8.宁夏德洋汽车租赁有限公司营业执照、授权委托书、杨某某、张某某身份证复印件及该公司出具的证明,证明宁夏德洋汽车租赁有限公司2014年4月28日成立,法定代表人杨某某,公司类型有限责任公司,住所地彭阳县城兴彭路职中右侧,宁夏德洋汽车租赁有限公司委托张某某办理李某租车未归还的事务。同时证明9月14日至今李某给付宁夏德洋汽车租赁有限公司租车费28500元的事实。\n9.扣押决定书、扣押笔录、扣押清单、发还清单及照片,证明公安机关从李某甲处扣押???本田雅阁轿车1辆,从申某某处扣押???丰某轿车1辆,从李某乙处扣押???现代越野车1辆,3辆车均发还给张某某的事实。\n10.彭阳县价格认证中心-彭价认定(2017)08号、14号价格认定结论书、鉴定意见通知书,证明经彭阳县价格认证中心鉴定???现代越野车价格为56000元、???本田雅阁轿车价格为58000元、???丰某轿车价格为90000元,该鉴定意见送达上诉人李某及张某某的事实。\n11.刑事和解谅解书、收条,证明上诉人李某家属代为赔偿宁夏德洋汽车租赁有限公司经济损失2万元,宁夏德洋汽车租赁有限公司对上诉人李某的犯罪行为予以谅解的事实。\n12.委托调查函、拟适用社区矫正前社会调查评估表,证明2018年3月16日经固原市原州区社区矫正工作领导小组评估,上诉人李某适合社区矫正的事实。\n13.证人杨某某证言,证明其系宁夏德洋汽车租赁公司的法定代表人,李某2016年6月26日开始在其公司租赁车辆,先后租赁过六七辆,共计给其公司交纳过80500元租赁费。李某后来租赁的???现代越野车、???本田雅阁轿车、???丰某轿车每辆车每天租赁费为260元,2016年9月份以后付过六七次租赁费,也就是28000元左右的事实。\n14.证人张某某证言,证明2016年9月10日李某以工程队用车为由租赁了张某某的???现代越野车,双方签订了合同,期限至2016年11月30日。2016年10月2日,李某又以工程队用车为由,租赁了张某某的???本田雅阁轿车,约定11月30日归还。2016年12月2日,张某某又给李某出租了???丰某轿车,李某承诺用两天就归还。直到2016年12月4日,张某某向李某催要三辆车和租金,李某称其在银川等着结工程款,但经张某某多次催要后李某手机停机了。2017年1月26日,张某某在固原发现了其出租给李某的???丰某轿车,才知道李某以自己名义将该车抵押了。租赁车时均签订了车辆租赁合同,???价值8万元,???价值7万元,???价值11万元。三辆车均登记在张某某名下,张某某将这三辆车放在宁夏德洋汽车租赁有限公司挣取租车费,张某某与宁夏德洋汽车租赁有限公司口头协商,每辆车租出去每天最少收取200元费用的事实。\n15.证人李某乙证言,证明其系李某的父亲,苟某某说李某借了他20000元,2017年4月26日李某乙给苟某某给了10000元,赎回了李某抵押出去的???黑色北京现代越野车一辆,并将该车交给公安机关的事实。\n16.证人贾某某证言,证明其在固原市会议服务中心工作,该中心2015年11月份建成,2016年1月份投入使用。广东开平市建筑工程有限公司承包了固原市会议服务中心一标段内装修,李某曾分包了固原市会议服务中心第一层至第三层的电照装修,从2014年后半年干到2015年后半年,广东开平市建筑工程有限公司大概欠李某工程款20万元左右的事实。\n17.证人申某某证言,证明2016年11月29日17时许,李某给申某某打电话,以李某父亲要去西安医院做手术为由向申某某借款,承诺等工程款下来后就能归还,并将李某的车作为抵押,申某某便答应了。李某在固原市妇幼保健院将一辆???黑色丰田轿车交给申某某,申某某将1500现金交给李某,申某某通过手机银行向李某提供的尾号为1495的农村信用社账号转入23500元,共借给李某25000元,李某出具借条一张,同时承诺如果还不上钱,就将该车过户给申某某的事实。\n18.证人李某甲的证言,证明2016年12月2日,李某找到李某甲口头协议借款40000元,并提供???黑色本田雅阁轿车作为抵押,李某承诺一周归还,李某甲通过现金的方式给了李某40000元。到了第五天,李某甲给李某打电话索要欠款,但电话一直没有打通的事实。\n19.上诉人李某的供述与辩解,证明2016年9月10日其在彭阳县的宁夏德洋汽车租赁公司租了一辆黑色???现代越野车,2016年10月2日在宁夏德洋汽车租赁公司租了一辆黑色???丰某轿车,2016年12月2日在宁夏德洋汽车租赁公司租了一辆黑色???广州本田雅阁轿车,均签订了车辆租赁合同,每辆车每天租赁费260元,租赁期限没有约定。\n2016年11月29日,其在固原市妇幼保健院内将???丰某轿车以25000元抵押给了申某某,并出具了借条,与申某某签订了二手车交易协议,承诺2016年12月8日还款并取回轿车,同时约定如果按期还不上钱,申某某可以将抵押的车辆卖掉。申某某通过手机银行转账方式将22500元转到李某农村信用社银行卡上,扣了2500元利息。\n
2016年12月2日在固原市南河滩附近将???黑色广本雅阁轿车以20000元抵押给了李某甲,给李某甲出具了40000元借条,李某承诺2016年12月12日给李某甲还20000元并取回轿车,李某甲用微信转账的方式转了18500元,收取了1500元利息。\n2016年12月中旬的一天,其在固原市??区一酒吧内将???黑色现代越野车以16000元抵押给苟某某的表弟,并出具借条16000元,其承诺2016年12月底归还16000元并取回越野车,苟某某的表弟给了李某16000元现金。\n刚开始租赁的???现代越野车确实是工程上用,后因资金紧张,就将该车抵押了。后面租的???和???轿车的目的就是为了将车租来给别人抵押,向别人借钱,用于付租车费、给工人发放工资、工程周转、清利息和自己平时日常消费。等工程款结了,李某就可以给所有人还钱,并可以把车赎回来。因为工程上用不了这么多车,所以李某没有能力履行车辆租赁合同。李某给宁夏德洋汽车租赁公司支付租赁费约20000元,2016年12月5日以后就再没有付过租赁费。为了让宁夏德洋汽车租赁公司的人和李某欠款的人联系不上李某,李某就去移动公司对其158XXXXXXXX办理了半年报停。因为没有钱去赎车,所以不能按时归还车辆的事实。\n以上证据来源合法、内容客观真实、与本案具有关联性,证据间能够相互印证,证据确实、充分,能够客观证明本案犯罪事实。二审中,上诉人李某的辩护人向法庭提交了刑事和解谅解书、收条,经出庭履行职务的检察人员质证,并经庭外调查核实,该证据符合证据的?三性?,可以作为定案的依据,本院对一二审认定的证据予以确认。\n", "summary": "根据刑法第224条,判处)李某合同诈骗罪"}
+{"content": "山东省临沭县人民检察院指控:\n2017年12月10日,被告人骆某荣因家庭琐事与被害人张某亲属产生矛盾,被告人骆某荣为报复泄愤,遂纠集骆某(另案处理)等人来到临沭县石门镇X村,持木棍将被害人张某、于某红、张某军、张某彩打伤。经鉴定,被害人张某的损伤构成轻伤二级,被害人于某红、张某军、张某彩的损伤构成轻微伤。\n公诉机关认为,被告人骆某荣故意伤害他人身体,致一人轻伤二级、三人轻微伤,其行为触犯《中华人民共和国刑法》××××之规定,应当以××追究其刑事责任。\n刑事附带民事原告人张某提出如下要求:1、依法追究被告人××的刑事责任,从重处罚。2、判令被告人赔偿原告各项损失共计5万元。\n刑事附带民事诉讼原告人张某军提出如下要求:1、依法追究被告人××的刑事责任,从重处罚。2、判令被告人赔偿原告各项损失共计2万元。\n刑事附带民事诉讼原告人张某彩提出如下要求:1、依法追究被告人××的刑事责任,从重处罚。2、判令被告人赔偿原告各项损失共计2万元。\n刑事附带民事诉讼原告人于某红提出如下要求:1、依法追究被告人××的刑事责任,从重处罚。2、判令被告人赔偿原告各项损失共计4万元。\n", "summary": "根据刑法第234条,判处骆某荣故意伤害罪"}
+{"content": "公诉机关指控,2016年3月22日19时许,临沭县青云镇白旄西街张某到其大伯哥刘某乙兴家查看婆婆去世的买菜账本,与张某的侄子刘某甲发生争执,后被被告人刘某甲殴打致伤,同年4月8日,被害人张某报案至临沭县公安局白旄派出所,同年4月22日经法医鉴定:张某之损伤构成轻伤二级。\n公诉机关认为,被告人刘某甲故意伤害他人身体,致人轻伤;其行为触犯了《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实充分,应当以××追究其刑事责任。\n", "summary": "根据刑法第234条,判处刘某甲故意伤害罪"}
+{"content": "公诉机关指控,2015年至2017年,山东尚信达家具制造有限公司先后拖欠张某等56名职工工资1262956元。2017年8月17日,临沭县人力资源和社会保障局向山东尚信达家具制造有限公司送达《劳动保障监察责令改正指令书》,责令其支付上述56名职工2015年至2017年的工资1262956元,但该单位逾期拒不支付。经调查,2016年被告人刘某甲在拖欠工人工资的情况下,伙同其妻张6某(在逃)在临沭印象城为其子刘某乙购买住房一套,已付首付款92724元;2017年7月份,被告人刘某甲将其瓯龙现代城一套房子以34万元的价格出售,所得款未全用于支付工人工资;2017年7月25日,被告人刘某甲在未支付工人工资的情况下擅自将山东尚信达家具公司的机器、设备、原料等以700万元的价格转让给云某。2018年1月18日,藏匿于上海普陀区的被告人刘某甲被公安机关通过上网追逃的方式抓获。\n公诉机关认为,被告人刘某甲拒不支付工人工资,其行为触犯了《中华人民共和国刑法》第××××之规定,应当以××追究其刑事责任。\n被告人刘某甲对公诉机关指控事实及罪名无异议,自愿认罪。\n", "summary": "根据刑法第276条,判处刘某甲拒不支付劳动报酬罪"}
+{"content": "临沭县人民检察院指控:2013年9月至2017年5月,临沭县锦某种植专业合作社、临沂信丰农业科技有限公司、山东昌利投资有限公司法人代表兼总经理王3某(已判刑)伙同被告人孙某、孙3某(另案处理)、马2某(另案处理)等人通过给予高于银行的存款利息等方式在临沭县临沭街道、大兴镇、店头镇、玉山镇、白旄镇、石门镇、曹庄镇等地设立分社、分公司非法向社会公众吸收存款,被告人李某甲在临沭县锦某种植专业合作社石门分社任职期间,吸收的尚未兑付存款共计3903930元,被告人孙某2014年11月从临沭县锦某种植专业合作社辞职,孙某任职期间,该合作社共吸收存款29247655.64元,其中未兑付8000元,孙某非法获利217000元。\n为证明上述犯罪事实,公诉机关当庭分别讯问了各被告人,并宣读和出示了下列证据:受案登记表、股东会决议、合作社资金互助单、定期股金统计分析报表、证人陈某甲、陈1某等人的证言、同案人供述、各被告人的供述与辩解等。\n公诉机关认为,被告人孙某、李某甲违反国家金融管理法律规定,向社会公众吸收资金,数额巨大,其行为均触犯了《中华人民共和国刑法》××××之规定,应当以××追究其刑事责任。\n", "summary": "根据刑法第176条,判处李某甲孙某非法吸收公众存款罪"}
+{"content": "山东省临沭县人民检察院指控,2012年5月29日,临沭县人民法院对申请执行人刘某甲与被执行人临沭县蛟龙镇后利城村村民委员会建设工程合同纠纷一案立案执行,同年6月6日,查封了被执行人所有的位于其村西的围岭渠一处,查封期限为二年,同日将执行裁定书送达村委成员龙某。\n2015年2、3月份,刘某甲的合伙人尹6某持后利城村村民委员会出具的欠款单据找到被告人谢某、胡某,称围岭渠被法院查封到期了,要求重新发包,以偿还村欠其工程欠款。谢某、胡某未核实查封是否到期,也未通知临沭县人民法院,擅自将被查封的围岭渠发包,共收取承包费12.2万元,同年3月3日向尹6某支付工程欠款8万元。\n公诉机关认为,被告人谢某、胡某变卖已被司法机关查封的财产,情节严重,其行为触犯了《中华人民共和国刑法》××之规定,应当以非法处置查封的财产罪追究其刑事责任。\n", "summary": "根据刑法第314条,判处谢某胡某非法处置查封、扣押、冻结的财产罪"}
+{"content": "寿县人民检察院指控:一、××\n1、2016年10月8日夜,被告人李1某、孙某驾车至寿县瓦埠镇上奠村瓦房组,将稻田内的一台变压器拆卸后盗走铜芯。之后,李1某、孙某又驾车至寿县瓦埠镇供电所,将供电所门口放置的变压器拆卸后盗走铜芯。\n2、2016年11月18日夜,被告人李1某、孙某驾车至寿县瓦埠镇金源食品厂附近,将金源食品厂对面一个水泥台上的变压器拽下拆卸后盗走铜芯。\n3、2016年11月16日夜,被告人李1某、孙某驾车至寿县小2某镇老三岗村部附近,将村部旁变电房顶上的变压器拆卸后盗走铜芯。\n4、2016年12月3日夜,被告人李1某、孙某驾车至寿县刘2某曙光路,将曙光路旁的一台变压器拆卸后盗走铜芯。\n5、2016年12月某夜,被告人李1某、孙某驾车至寿县炎刘镇格义新能源公司附近,将格义新能源公司旁的一台变压器拆卸后盗走铜芯。\n6、2017年2月9日夜,被告人李1某、孙某驾车至寿县炎刘镇新桥大道耐力特厂附近,将该厂南侧一台变压器拆卸后盗走铜芯。\n7、2017年3月29日夜,被告人李1某、孙某驾车至寿县双庙集镇周岗村,将该村李4某超市对面的一台变压器拆卸后盗走铜芯。\n8、2017年4月6日夜,被告人李1某、孙某驾车至安徽省肥东县长临河镇洪葛村村委会附近,将该村委会钱的一台变压器拆卸后,将变压器铜芯及链接变压器的部分铜制电缆线盗走。\n以上李1某、孙某采用破坏性手段盗窃,盗窃犯罪金额总计151993元。\n二、故意毁坏财物罪\n2016年11月20日的某夜,被告人李1某、孙某驾车至寿县瓦埠镇铁佛村排灌站,将排灌站水泥台上的变压器拽下拆卸,意欲盗窃,因变压器芯为铝制,李1某、孙某未予窃取。该变压器因被拆卸遭到破坏,完全失去使用价值。经鉴定:该变压器价值11400元。\n三、掩饰、隐瞒犯罪所得罪\n2016年10月8日至2017年4月7日,被告人李1某孙某在每次盗窃后,均驾车至被告人李某某在合肥市瑶海区和平路的废品收购点,将盗窃的赃物全部向李某某销赃。李某某每次明知其收购的铜芯和电缆为李1某、孙某犯罪所得,仍然以低价全部收购,共计收购铜芯等赃物重量在1700公斤以上,付给李1某孙某赃款共计在24000元以上。\n针对指控的上述事实,公诉机关提供了相关证据。公诉机关认为,被告人李1某、孙某以非法占有为目的,多次流窜作案,采取破坏性手段盗窃,数额巨大,其行为触犯了《中华人民共和国刑法》××,犯罪事实清楚,证据确实、充分,应当以××追究二被告人刑事责任;被告人李1某、孙某故意毁坏公私财物,数额较大,其行为触犯了《中华人民共和国刑法》××,应当以故意毁坏财物罪追究二被告人刑事责任;被告人李某某明知是犯罪所得而予以收购,其行为触犯了《中华人民共和国刑法》××××,应当以掩饰、隐瞒犯罪所得罪追究其刑事责任;被告人李1某系累犯,应当从重处罚。\n被告人李1某对公诉机关指控的犯罪事实、罪名和证据无异议;被告人孙某对公诉机关指控的犯罪事实、罪名和证据无异议,其辩护人辩护意见:对孙某犯××不持异议,犯故意毁坏财物罪不能成立,应认定为盗窃未遂,孙某具有立功情节,当庭认罪,系初犯,建议对其从轻处罚;被告人李某某对公诉机关指控的犯罪事实、罪名和证据无异议,其辩护人辩护意见:对定性无异议,被告人李某某积极退赃,具有悔罪表现,有坦白情节,建议对其从轻处罚并适用××。\n", "summary": "根据刑法第312条第264条,判处李某某李1某孙某盗窃罪"}
+{"content": "公诉机关指控,一、2014年11月28日12时46分许,山东省临沂市兰山区人民法院执行局工作人员张某、刘某等人驾驶鲁Q?????警车到位于临沭县城北外环魏某乙的金山会所,对拒不履行生效的(2014)临兰民初字第2193号民事判决书的被告人魏某甲实施司法拘留。为逃避执行,魏某甲从会所院内驾驶鲁Q?????黑色上海大众途锐轿车,采取撞击的方式将停在院门口的警车强行逼挤至院外,致警车的保险杠损坏,后魏某甲驾车逃窜。执法人员令其父亲魏某乙上车,到金山化肥厂寻找魏某甲。车辆行驶至该公司门口时遇到魏某甲,执法人员遂驾车追赶,并令魏某乙电话联系魏某甲。魏某甲在得知其父在执法车辆上,遂电话联系了被告人周某,被告人王2某、李某也闻讯赶来。在327国道上,魏某甲驾驶鲁Q?????大众途锐轿车,周某驾驶鲁Q?????帕某轿车,王2某驾驶鲁Q?????丰某越野车,李某驾驶黑色别克轿车追逐、围困正在行驶中的警车,并多次采取冲撞、轧车的方式拦截执法警车,后将警车逼停。执法人员令魏某乙下车,魏某乙上魏某甲驾驶的车辆后,魏某甲将车头调转,欲对撞警车,因被魏某乙拨动方向盘未果。后魏某甲、周某、王2某、李某驾车离开现场。经鉴定,东风风行菱智(鲁Q?????)损失共计410元。\n二、2014年10月1日至2014年11月10日,被告人魏某甲在未取得采砂许可手续的情况下,雇佣他人在临沭县沭河河道朱村段非法开采黄某。经测量,被告人魏某甲雇佣他人非法开采的黄某达42000余方,价值1260000元。\n公诉机关认为,被告人魏某甲以暴力、威胁方式阻碍国家工作人员依法执行职务,其行为触犯了《中华人民共和国刑法》××的规定,应当以××追究其刑事责任。被告人魏某甲违反矿产资源法的规定,未取得采矿许可证擅自采矿,情节特别严重,其行为触犯了《中华人民共和国刑法》××××的规定,应当以××追究其刑事责任。\n", "summary": "根据刑法第277条第343条,判处魏某甲妨害公务罪"}
+{"content": "经审理查明,2017年10月10日18时左右,被告人张某酒后驾驶牌号为川Q?????二轮摩托车从宜宾市翠屏区牟坪镇牟坪村5组35号家中出发,前往宜宾市翠屏区牟坪镇派出所办事,被办案民警发现被告人张某饮酒驾驶机动车,即对张某进行了呼气式酒精检测,检出酒精含量268mg/100mL,后依法对张某进行了血样提取并送检,血液中检出乙醇成分浓度为173.30mg/100mL。\n上述事实,被告人张某在开庭审理中亦无异议,且有受案登记表、立案决定书、受理道路交通事故案件登记表、查获经过,机动车行驶证、驾驶人信息查询结果单,人员指纹卡,酒精含量测试报告,血样提取登记表,(川)公(宜)鉴(法化)字【2017】1231号法化检验意见书,道路交通事故现场图、事故照片,被告人张某供述和辩解及户籍资料等证据材料予以证实,足以认定。\n", "summary": "根据刑法第133条,判处张某危险驾驶罪"}
+{"content": "经审理查明,2018年1月25日8时30分左右,被告人刘1某在翠屏区自强里中段,见被害人杜某将手机放在外衣包内,便趁其不备,用随身携带的镊子将手机盗走,并在逃跑过程中将手机丢弃。经依法鉴定,被盗香槟色?海信?牌手机价值1161元。\n2018年1月25日13时左右,被告人刘1某到公安机关投案自首。\n上述事实,被告人刘1某在开庭审理中亦无异议,且有接处警登记表、受案登记表、立案决定书、到案经过,被害人杜某的陈述,现场监控录像,指认笔录,价格鉴定意见书,四川省泸州市江阳区人民法院(2016)川0502刑初147号刑事判决书、刑满释放证明,被告人刘1某的供述和辩解及其人口信息资料等证据材料证实,足以认定。\n", "summary": "根据刑法第264条,判处刘1某盗窃罪"}
+{"content": "经审理查明,2018年1月2日19时30分左右,被告人唐1某驾驶不符合其驾驶证载明准驾驶车型类型的川Q?????小型轿车,从宜宾市长宁县往宜宾市翠屏区牟坪镇方向行驶,在新宜长路李端镇板栗村二组小地名?马儿扁?处,与行人罗某相撞,造成罗某当场死亡。事故发生后唐1某驾驶川Q?????小型轿车逃离现场。公安机关通过监控录像锁定肇事车辆QA581E小型轿车,并通过车主锁定被告人唐1某后,电话通知其到公安机关,唐1某接到电话后到公安机关如实供述其作案事实。\n经交警部门认定:唐1某在此事故中承担全部责任,罗某在此事故中无责任。\n另查明,被害人家属已经就本案民事赔偿部分另案提起民事诉讼。\n上述事实,被告人唐1某在开庭审理中亦无异议,且有受案登记表、立案决定书、受理交通事故案件登记表、抓获经过,道路交通事故现场勘查笔录、现场图及照片,驾驶人信息查询结果、机动车驾驶证,肇事车辆照片8张、唐1某肇事所穿衣服近照3张,尸体火化证明书,宜宾市公安局物证鉴定所法医学尸体检验报告,宜宾市公安局物证鉴定所鉴定书鉴定意见书,宜宾市公安局物证鉴定所法化检验报告检验意见书,四川中山机动车司法鉴定所司法鉴定意见书,道路交通事故认定书,证人陈某、王某、左某、徐某的证言,被告人唐1某供述和辩解及其户籍资料等证据材料予以证实,足以认定。\n", "summary": "根据刑法第133条,判处唐1某交通肇事罪"}
+{"content": "经审理查明,2017年7月31日3时20分左右,被告人黄某酒后驾驶川T?????小型轿车,从宜宾市翠屏区南岸往屏山县高场镇方向行驶,行驶至南连线凉水井路口时与罗某驾驶川Q085997的电动二轮摩托车相撞,致使车辆受损、罗某受伤。案发后罗某亲属报警,被告人黄某在现场等候交警处理。经检验,被告人黄某血液中检出乙醇成分,浓度为127.34mg/100ml。\n黄某到案后如实供述了自己的主要作案事实,并赔偿了罗某经济损失取得谅解。\n上述事实,被告人黄某在开庭审理中亦无异议,且有受理道路交通事故案件登记表、受案登记表、受案回执、立案决定书及抓获经过,道路交通事故现场勘查笔录及照片,血样提取登记表、宜宾市公安局物证鉴定所(川)公(宜)鉴(法化)字[2017]889号法化检验意见书,机动车驾驶人信息查询结果单,被害人罗某陈述,证人陈某证言,宜宾市公安局交通警察支队交管一大队公交认字[2017]第00416号道路交通事故认定书,交通事故民事赔偿协议、谅解书,被告人黄某供述和辩解及户籍资料等证据材料予以证实,足以认定。\n", "summary": "根据刑法第133条,判处黄某危险驾驶罪"}
+{"content": "经审理查明,(一)2017年7月6日凌晨,被告人代某来到宜宾市翠屏区鑫领寓小区,头戴黑色女式腿袜、手戴白色线手套,利用千斤顶撑开小区围栏,脱掉皮鞋后进入小区,在该小区10幢1单元楼下,顺着外墙上的水管爬到2楼被害罗某1全窗外,再次用千斤顶撑开其厨房防护栏进入客厅,将其家中的5000余元现金盗走。之后,代某采用同样的方法将5楼被害李某萍家中的1600余元现金盗走。\n案发后,被告人代某的家属代其退赔了被害人的损失,获得被害人谅解。\n(二)2017年8月4日凌晨,被告人代某来到宜宾市翠屏区碧峰园小区,通过该小区9幢2单元1楼的防护栏爬入2楼3号被害郭某丹家中,将其家中1000余元现金盗走,后又以同样方式进入该幢楼3单元2楼3号被害连某伟家中盗走其现金5000余元。\n案发后,被告人代某的家属代其退赔了被害人的损失,获得被害人谅解。\n被害人报案后,公安机关根据现场监控,锁定盗窃鑫领寓的被告人代某,并于2017年9月14日将其抓获,归案后被告人代某如实供述其盗窃鑫领寓的作案事实后,又主动向公安机关交代其盗窃碧峰园的作案事实。\n上述事实,被告人代某在开庭审理过程中亦无异议,且有受理报警登记表、受案登记表、受案回执、立案决定书及抓获经过,被害罗某1罗某2李某连某郭某的陈述,现场勘验检查笔录,现场及作案工具指认笔录和照片,搜查记录、扣押决定书、扣押清单、涉案财物移交清单及图片说明,本院(2016)川1502刑初483号刑事判决书、释放证明书,被告人代某的供述和辩解及户籍信息等证据证实,足以认定。\n", "summary": "根据刑法第264条,判处代某盗窃罪"}
+{"content": "四川省宜宾市翠屏区人民检察院起诉指控,2016年11月期间,被告人宋2某翠屏区民主路套路酒吧内,听闻被害人程某多次未通过驾驶证考试,便向其谎称自己能免试办理驾驶证,并先后收取程某交付的现金和微信转账用于办证费用共计10000元。后被告人宋2某为程某办理了一本假驾驶证。\n", "summary": "根据刑法第266条,判处宋2某诈骗罪"}
+{"content": "经审理查明,2017年10月14日12时50分左右,被告人韩某在车牌号为川Q3?的13路公交车上,趁被害李某琼不备之机扒窃了其放在背包内的现金400余元。\n案发后,公安机关通过监控锁定了被告人韩某,2017年11月14日,被告人韩某接到公安机关电话通知后主动投案自首,并如实供述其罪行。\n上述事实,被告人韩某在开庭审理过程中亦无异议,且有接处警登记表、受案登记表、受案回执、立案决定书、到案经过,被害人李某的陈述,调取证据通知书、监控视频、退款书,收条,(2014)翠屏刑初字第335号刑事判决书及刑满释放证明,被告人韩某的供述和辩解及户籍信息等证据证实,足以认定。\n", "summary": "根据刑法第264条,判处韩某盗窃罪"}
+{"content": "经审理查明,2017年7月25日20时左右,被告人肖某酒后无证驾驶无牌摩托车从宜宾市翠屏区老城区往宜宾市翠屏区上江北方向行驶。21时许,行驶至宜宾市翠屏区岷江桥时,被设卡检查的民警当场查获后,将其带往宜宾骨科医院抽取血样鉴定。经检验,送检肖某血液中检出乙醇成分,浓度为144.72mg/100ml。案发后,被告人肖某如实供述其主要作案事实。\n上述事实,被告人肖某在开庭审理中亦无异议,且有接处警登记表、受案登记表、立案决定书及抓获经过,取保候审决定书,血样提取登记表、宜宾市公安局物证鉴定所(川)公(宜)鉴(法化)字[2017]876号法化检验意见书,驾驶人信息查询结果单及情况说明,扣押物品清单,本院(2002)翠屏刑初字第109号刑事判决书、四川省梓潼县人民法院(2013)梓刑初字第34号刑事判决书,被告人肖某供述和辩解及户籍资料等证据材料予以证实,足以认定。\n", "summary": "根据刑法第133条,判处肖某危险驾驶罪"}
+{"content": "经审理查明,2017年9月下旬的一天晚上,被告人周某翻窗进入四川省宜宾市翠屏区长江苑小区10幢1单元1楼1号被害曹某鸿家中实施盗窃,盗走铂金钻戒2枚和电脑一台。经鉴定,被盗钻戒价值5559元,电脑价值300元。案发后被害人报案,公安机关立案侦查,并于2017年10月24日将被告人周某抓获。到案后周某如实供述了自己的犯罪事实。\n另查明,公安机关追回被盗铂金钻戒2枚和电脑一台发还被害曹某鸿。\n上述事实,被告人周某在开庭审理过程中亦无异议,且有接处警登记表、受案登记表、受案回执、立案决定书、抓获经过,情况说明,被害曹某鸿的陈述,搜查证、搜查笔录、提取笔录、扣押清单,宜宾市翠屏区价格认定中心区价认定【2017】288号?涉案钻石戒指、电脑的价格认定意见书?,物证痕迹鉴定意见书,领条,(2013)翠屏刑初字第519号刑事判决书及刑满释放证明,被告人周某的供述和辩解及户籍信息等证据证实,足以认定。\n", "summary": "根据刑法第264条,判处周某盗窃罪"}
+{"content": "经审理查明,2013年10月11日21时左右,被告人张某罗某红范某亮胡某浩徐某辉(均已判)及松某娃?(身份不详)等人在宜宾市翠屏区金沙广场?满洲红?KTV唱歌时,张某及松某娃?看见了与其有矛盾的被害王某1松在临近的?爱尚?歌厅唱歌。张某遂召集上述人员共谋伤王某1松的具体事宜。受张某指使,被告胡某浩和松某娃?找到被害王某1松所在位置。随后松某娃?电话邀约其师兄弟三人携带刀具到达KTV,众人分发刀具后一同前往被害王某1松所在歌厅。被告人张某等持刀人员冲进歌厅便王某1松砍去王某1松被砍后迅速往歌厅外逃跑罗某红见状用水壶扔王某1松徐某辉用拳头殴王某1松。王某1松跑到歌厅对面时摔倒在地后,被追赶上的张某、松某娃?及其师兄弟等人持刀乱砍,随后张某、松某娃?胡某浩徐某辉罗某红范某亮逃离现场王某1松被送往医院救治。\n经鉴定,王某1松左尺神经、左桡神经、左腓总神经损伤致左腕关节、左踝关节功能丧失50%以上均属重伤二级;王某1松左肩胛骨、左胫骨骨折均属轻伤二级;王某1松左肘部、左小腿及右中指疤痕累计长达45.5cm属轻伤一级;王某1松肩背部及腰部疤痕累计长达40.5cm属轻伤一级。\n2017年10月24日,绍兴市公安局柯桥区分局将在逃的被告人张某抓获。\n本案在审理过程中,经本院主持调解,被告人张某的家属代其赔偿被害王某1松各项经济损失共计十五万元,被告王某1松对其表示谅解。\n上述事实,被告人张某在开庭审理中亦无异议,且有立案决定书、受案登记表及公安机关抓获经过说明,被害王某1松陈述及辨认笔录,证龚某芬王某2莉杨某芬刘某湲王某3桥曾某叶的证言,同胡某浩罗某红徐某辉范某亮的供述和辩解及辨认笔录,宜宾市公安局翠屏区分局物证鉴定室公(宜翠)伤鉴(法)字[2014]061号法医学人体损伤程度鉴定书,四川省宜宾市中级人民法院(2002)宜中刑一终字110号刑事附带民事判决书及刑满释放证明书,本院(2014)翠屏刑初字第834号刑事附带民事判决书,被告人张某的户籍证明及其供述等证据证实,足以认定。\n", "summary": "根据刑法第234条,判处张某故意伤害罪"}
+{"content": "四川省宜宾市翠屏区人民检察院指控:2017年12月14日15时许,被告人李1某在翠屏区象鼻镇四华村白河组自家坝子里面与被害彭某甫发生纠纷后双方发生互殴,在互殴过程中被告人李1某用镰刀彭某甫头部砍伤彭某甫将被告人李1某面部、头部打伤。经法医鉴定彭某甫因外伤致头皮瘢痕属轻伤二级;李1某面部及头部损失均属轻微伤。\n2018年1月11日,被告人李1某主动到公安机关如实供述自己的犯罪事实。\n", "summary": "根据刑法第234条,判处李1某故意伤害罪"}
+{"content": "四川省宜宾市翠屏区人民检察院起诉指控:被告人沈2某从他人处购得海洛因用于吸食和贩卖。2017年5月以来,吸毒人员杨某通过拨打被告人沈2某的电话联系购买海洛因后,先后2次在宜宾市翠屏区以每次100元1小包向被告人沈2某购买海洛因。2017年9月30日,杨某以一部步步高手机抵押240元向被告人沈2某购买了1小包海洛因。\n2017年11月以来,吸毒人员刘某2通过电话联系被告人沈2某后,先后三次在宜宾市翠屏区以每次120元1小包向沈2某购买海洛因用于吸食。\n2017年12月9日,民警在被告人沈2某租住的女学街15号1栋1单元4楼3号查获海洛因疑似物2小包净重27.19克及吸毒工具、电子秤等物。经鉴定:从沈2某处查获的海洛因疑似物中均检出海洛因成分。\n", "summary": "根据刑法第347条,判处沈2某走私、贩卖、运输、制造毒品罪"}
+{"content": "四川省宜某市翠屏区人民检察院指控:被告人江1某已办理烟草专卖零售许可证,被告人朱1某、何1某未办理烟草专卖零售许可证。2017年以来,被告人朱1某从他人处购进假冒的云烟、玉溪、牡丹、中华、白塔山、南京、利群、中华、芙某、阿某等卷烟和少量正品红塔山恭贺新禧卷烟及少量走私的爱喜卷烟,并伙同其妻子被告人何1某一同将购买的烟加价转卖给被告人江1某,由被告人朱1某通过成都到宜某的野的司机曹某红或快递将烟送到宜某市翠屏区交给被告人江1某,被告人何1某负责记录销售情况,被告人朱1某提供了一张户名为黄某的建设银行卡给被告人何1某,用于专门向被告人江1某收取卷烟销售款。2017年7月至9月期间,被告人江1某通过银行存款的方式向被告人何1某持有的银行卡内转账共197850元用于支付购烟款,另在宜某市翠屏区人民公园后门支付被告人朱1某购烟款30000元,上述款项中有2500元是用于支付正品红塔山恭贺新禧卷烟款,有4125元是用于支付走私爱喜卷烟款。被告人江1某除向被告人朱1某、何1某购买假烟外,还向他人购买假烟,并一同放在其经营的位于宜某市翠屏区建设路134号的门市上进行零售或向杨某1、杨某2等人批发销售。被告人江1某已将从被告人朱1某、何3某购买的174010元的假烟予以销售,销售金额20万余元。\n2017年9月30日,公安机关在建设路134号门市搜出被告人江1某尚未销售的假冒硬大前门35条、软大前门50条、软经典双喜4条、软牡丹67条、硬阿某15条、硬经典红塔山13条、软珍品云烟95条、软如意云烟40条、紫云烟35条、硬芙某84条、炫赫门南京10条、新版利群14条、软中华8条、硬中华17条、细支中华11条、硬玉溪82条、软玉溪24条、细支玉8条,以上假烟购买价格共计46945元,另查获正品红塔山恭贺新禧10条,走私爱喜卷烟9条。\n审查起诉期间,暂扣被告人江1某18万元,暂扣被告人何1某25万元。\n", "summary": "根据刑法第214条,判处朱1某江1某何1某销售假冒注册商标的商品罪"}
+{"content": "西吉县人民检察院指控,2014年1月,被告人徐某从杨某甲处承包经营西吉县国升建材销售有限公司(原火石寨乡大庄砖厂)。2016年4月至8月,被告人徐某拖欠蔡某某、聂某等四十余名劳动者工资463400元。2016年8月29日,被告人徐某以出门要账为由逃匿回到其河南荥阳老家。2016年9月6日,西吉县人力资源和社会保障局以西人社劳监令字[2016]第119号劳动保障监察限期整改指令书责令被告人徐某支付拖欠的劳动者工资。被告人徐某仍拒不支付劳动者报酬。案发后,杨某甲为被告人徐某代为支付劳动者报酬107000元。被告人徐某于2017年7月2日在河南省郑州市××区交叉口西北角建设银行内被抓获归案。公诉机关就起诉的上述事实向法庭提供了书证、证人证言、被告人供述和辩解等证据。公诉机关认为,被告人徐某以逃匿的方法逃避支付劳动者报酬463400元,数额较大,经政府有关部门责令支付仍不支付,其行为已触犯《中华人民共和国刑法》××××××之规定,犯罪事实清楚,证据确实、充分,应当以××追究其刑事责任。被告人徐某到案后如实供述自己的犯罪行为,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。建议判处被告人徐某一年以上三年以下××,并处罚金,可以适用××。\n", "summary": "根据刑法第276条,判处徐某拒不支付劳动报酬罪"}
+{"content": "经二审审理查明,上诉人齐某先后向被害人王某甲等30余人以借款为名骗取现金134.465万某。上诉人齐某前妻王某庚向杨某甲归还?借款?3000元,实际骗取现金1.7万某;上诉人齐某及其前妻王某庚向杨某丁归还?借款?2.5万某,支付利息2800元,实际骗取现金5200元。原审判决认定原审被告人齐某犯××的其他事实与二审查明的一致。\n上述事实有经一审庭审中公诉机关当庭举证、并经质证确认的下列证据证实:\n1.隆德县人民检察院转办函、移送材料(证据)清单及相关控告材料,证明隆德县人民检察院于2017年3月10将上诉人齐某涉嫌诈骗的控告材料转交隆德县公安局办理的事实;\n2.隆德县公安局隆公(刑)刑受案字[2017]第79号受案登记表、受案回执、立案决定书、立案告知书,证明隆德县公安局接隆德县人民检察院转办函,该函反映王某甲、刘某甲、何某某、杨某甲、马某甲等5人被上诉人齐某诈骗40余万某,隆德县公安局受理并立为刑事案件侦查的事实;\n3.隆德县公安局隆公(刑)刑受案字[2017]第213号、第215号等受案登记表,证明公安机关在办理上诉人齐某涉嫌诈骗一案中发现有何某某、张某甲、马某甲、张某乙等人被骗取数额不等的钱款,并受案初查的事实;\n4.隆德县公安局隆公(刑)立字[2017]66号、183号等立案决定书、立案告知书,证明隆德县公安局对何某某、张某甲、马某甲、张某乙等人被上诉人齐某诈骗案件,立为刑事案件进行侦查的事实;\n5.抓获经过,证明2017年9月8日,固原市原州区南关派出所民警郭某、黄圆满接110指令,在原州区政府巷源洋招待所102房间将上诉人齐某抓获的事实;\n6.接受证据材料清单及证据材料,证据材料包括上诉人齐某书写并用手机拍照,办案民警打印的创业经历及欠债经过、致社会公开信;\n7.上诉人齐某的户籍证明、违法犯罪记录,证明上诉人齐某已达到法定刑事责任年龄,且未发现有违法犯罪记录的事实;\n8.上诉人齐某的微信朋友圈部分内容截屏和在?晚安隆德?公众号广告内容截屏,证明上诉人齐某多次在朋友圈和?晚安隆德?公众号宣传德某教育、发布招聘辅导老师的消息,并把自己描述为快速记忆法训练师、高级培训师、高级指导师的事实;\n9.被害人王某甲的陈述及借条9张,证明2015年5月至2016年9月,上诉人齐某以办?德某教育?需资金周转为由,以给付王某甲高额利息为条件,先后分9次向王某甲?借款?共计26万某一直未还,王某甲得到利息约5万余元的事实;\n10.被害人何某某的陈述及借条1张,证明2016年3月8日,上诉人齐某以扩展办学(德某教育)需资金周转,以给付何某某高额利息为条件向何某某?借款?2万某一直未还,何某某共得到利息3000元的事实;\n11.被害人张某甲的陈述及借条复印件一份;证明2016年1月28日、2月23日,上诉人齐某以给辅导班教师发工资、在固原市办分校需要资金为由,以给付张某甲高额利息和为张某甲儿子提供工作为条件,先后分两次向张某甲?借款?4万某,后在张某甲的追要下,上诉人齐某归还本金2万某,剩余2万一直未还,张某甲从两次?借款?中共得到利息1400元的事实;\n12.被害人马某甲的陈述及中国农业银行宁夏分行的转账凭证3张,证明2016年8月至11月初,上诉人齐某以扩展办学需要资金周转为由,以给付高额利息为条件,先后分三次向马某甲?借款?4万某一直未还,马某甲共得到利息3900元的事实;\n13.被害人张某乙的陈述及借条复印件两份,证明2016年6月至10月,上诉人齐某以扩展办学需资金周转,还贷款、给老师发工资等为由,以给付高额利息为条件,先后三次向张某乙?借款?共计10万某一直未还,上诉人齐某向张某乙支付利息8500元的事实;\n14.被害人曹某某的陈述及借条复印件,证明2016年8月,上诉人齐某以?德某教育?办学需资金周转为由,以给付高额利息为条件向曹某某?借款?2万某,?借款?到期后,上诉人齐某准时还本付息,在取得曹某某信任后,于同年9月27日以同样手段向曹某某?借款?3万某一直未还,曹某某从两次?借款?中共得到利息2000元的事实;\n15.被害人杨某甲的陈述及借条1张,借款证明2016年9月3日,上诉人齐某以其开办多家?德某教育?办学点,需要给聘请的老师发工资,自己为创业需置换车辆为由,向杨某甲?借款?2万某,2017年5月份左右,上诉人齐某前妻向被害人杨某甲归还本金3000元,剩余本金17000元,杨某甲从?借款?中未得到利息和好处的事实;\n16.被害人王某乙的陈述及借条复印件三张,证明2016年7月至10月,上诉人齐某以在隆德县增开培训班、到北京争取教育项目等为由,以给付高额利息为条件向王某乙?借款?,在第一次?借款?到期后,上诉人齐某连本带息准时归还取得王某乙信任后,又以同样手段于2016年7月23日、8月14日、10月20日分三次向王某乙?借款?共计5.5万某一直未还,王某乙从多次?借款?中共得到利息950元的事实;\n17.被害人刘某甲的陈述及借条3张,证明2016年8月至10月,上诉人齐某以其办?德某教育?资金紧张,需要周转为由,以给付高额利息为条件向刘某甲?借款?,在两次?借款?到期后,上诉人齐某准时还本付息取得刘某甲信任后,又以同样的手段于2016年8月27日、9月15日、10月14日分三次向刘某甲?借款?共计6万某一直未还,刘某甲从多次?借款?中共得到利息4900元的事实;\n18.被害人李某甲的陈述及借条、收据、收入证明、担保义务书、身份证复印件,上诉人齐某及王某庚的身份证复印件,证明2016年8、9月份,上诉人齐某以其?草场?需要运转,?德某教育?公司需搞宣传、演出、夏令营、给老师发工资等需要周转为由,在骗取李某甲信任的同时,先后分多次向李某甲?借款?共计6.16万某一直未还,李某甲为上诉人齐某担保向白某某、刘某乙借款一分都没有还,2016年12月份,白某某、刘某乙追着李某甲要账,李某甲就找了上诉人齐某的妻子王某庚,王某庚一直拖着没有还的事实;\n19.被害人杨某乙的陈述及借款合同、借条复印件,证明2016年9、10月,上诉人齐某以办?德某教育?需资金周转,偿还信用卡上支取的现金等为由,以给付高额利息为条件向杨某乙?借款?,上诉人齐某连本带息准时归还取得杨某乙信任后,又以同样手段于2016年10月4日、10月18日分两次向杨某乙?借款?共计7万某一直未还,杨某乙从三次?借款?中共得到利息1500元的事实;\n20.被害人柳某某的陈述及借条复印件两份,证明2015年后半年,上诉人齐某与柳某某认识成为朋友后,以做饲草生意多次向柳某某借钱,柳某某从中得到分红约4万余元,在二人中止?合作?后,上诉人齐某于2016年5月至8月间两次向柳某某?借款?并准时归还,后上诉人齐某以扩大办学点要交房租费、请教育专家需花费资金等为由,以继续给付柳某某高额利息为条件,于2016年9月13日、10月17日再次向柳某某?借款?共计6万某后一直未还,以办学为名的?借款?柳某某并未得到利息的事实;\n21.被害人张某丙的陈述及借条复印件一份,证明2016年4、5月份,上诉人齐某以?德某教育?办学需要周转资金为由,以给付高额利息为条件,向张某丙?借款?,并准时还本付息,取得张某丙信任后,又以同样的手段于2016年10月7日向张某丙?借款?10万某一直未还,张某丙从两次?借款?中共得到利息500元的事实;\n22.被害人岳某某的陈述及借条复印件,证明2015年7、8月份,上诉人齐某以做饲草生意多次向岳某某借钱,并按时归还,在取得岳某某信任后,又以其办?德某教育?资金紧张,需要周转,给聘请的老师发工资为由,以给付岳某某高额利息为条件,先后于2016年1月1日、同年8月分两次(第二次1万未开具借条)向岳某某?借款?共计4万某一直未还,岳某某从多次?借款?中并未得到利息的事实;\n23.被害人卜某某的陈述及借条复印件两份、宁夏德某信息咨询有限公司营业执照复印件各一份,证明2016年2月25日、5月4日,上诉人齐某以其办?德某教育?资金紧张、需要周转为由,以给付高额利息为条件,先后两次向卜某某?借款?3万某一直未还,卜某某从两次?借款?中得到利息6200元的事实;\n24.被害人王某丙的陈述,证明2015年10月份左右,上诉人齐某以?贩草?需要资金周转为由,以给付高额利息为条件,向王某丙借款,按期还本付息后,于2016年4月又以贩草需资金周转、自已出车祸等为由向王某丙?借款?2.05万某一直未还,王某丙从?借款?中共得到利息7000元的事实;\n25.被
害人石某某的陈述及隆德县人民法院民事判决书一份,证明2016年9、10月份,上诉人齐某以办?培训班?需资金周转、自已把树林点着法院要给其判刑、?倒账?等为由,以给付高额利息为条件,分三次向石某某?借款?共计8万某,后向石某某归还5万某,剩余3万某一直未还,石某某从三次?借款?中共得到利息4000元的事实;\n26.被害人马某乙的陈述及借条复印件一份,证明2016年9月25日,上诉人齐某以其办?德某教育?资金紧张,需要周转,给老师发奖金,其在观庄乡有办学点,还需在联财等乡镇扩大办学等为由,向马某乙?借款?6万某,后在马某乙的追要下归还3.82万某,剩余2.18万某一直未还,马某乙从?借款?中并未得到任何好处的事实;\n27.被害人白某某的陈述及借款借据、担保声明、个人借款合同复印件各一份,证明2015年12月份,上诉人齐某以其办学资金紧张,需要周转为由,以给付高额利息为条件,向白某某借款,在第一次借款还本付息后,于2016年10月22日以同样的手段向白某某?借款?3万某后一直未还,白某某从两次?借款?中得到利息2000元的事实;\n28.被害人张某丁的陈述及借条复印件四份,证明2016年8月份以来,上诉人齐某以其资金紧需?倒手?为由,以给付高额利息为条件向张某丁借款两次,在按期还本付息取得张某丁信任后,2016年9月21日、10月3日、10月5日、10月11日,以办远程教育、买车、举办?德某教育?资助大学生文艺晚会等为由,以给付高额利息为条件,先后向张某丁?借款?共计12万某一直未还,张某丁从多次?借款?中共得到利息9600元的事实;\n29.被害人米某某的陈述及借条、宁夏德某信息咨询有限公司组织机构代码证、李某甲身份证、李某甲收入证明复印件各一份,证明2016年10月5日,上诉人齐某以其办?德某教育?学校,资金周转困难为由,以给付高额利息为条件,向米某某?借款?3万某一直未还,米某某从?借款?中得到利息2100元的事实;\n30.被害人海某某的陈述及借条复印件两份,证明2016年9月26日、10月21日,上诉人齐某以其扩大办学、需要资金周转为由,以给付高额利息为条件,先后分两次向海某某?借款?共计2.5万某一直未还,海某某从两次?借款?中未得到利息的事实;\n31.被害人王某丁的陈述及隆德县人民法院民事判决书一份,证明2016年8月25日,上诉人齐某以其扩大办学,做饲草生意资金紧张,需要周转为由,以给付高额利息为条件,向王某丁?借款?4万某一直未还,王某丁从?借款?中未得到利息的事实;\n32.被害人刘某乙的陈述及借条、李某甲担保书、李某甲收入证明复印件各一份,证明2016年9月13日,上诉人齐某以给培训班教师发工资,扩大办学规模为由,以给付高额利息为条件,向刘某乙?借款?3万某一直未还,刘某乙从?借款?中得到利息2000元的事实;\n33.被害人张某戊的陈述及借条两份、宁夏德某信息咨询有限公司组织机构代码证、王某庚身份证、李某甲担保书、李某甲收入证明复印件各一份,证明2016年9月18日、10月2日,上诉人齐某以扩展办学点,在大庄乡办学交房租费、购买办学设备缺少资金等为由,以给付高额利息、承诺?到期感谢?为条件,分两次向张某戊?借款?共计8万某一直未还,张某戊从两次?借款?中并未得到利息的事实;\n34.被害人杨某丙的陈述及借条复印件两份,证明2015年上诉人齐某认识杨某丙并成为朋友后,多次向杨某丙借钱,并按期还本付息,在取得杨某丙信任后,又于2016年10月7日、10月12日,以其办学资金紧为由,以给付高额利息为条件,两次向杨某丙?借款?共计2万某一直未还,杨某丙从多次?借款?中共得到利息约1万余元的事实;\n35.被害人王某戊的陈述及借条复印件一份,证明2016年6月,上诉人齐某以其办学资金紧张,以给付高额利息为条件,向王某戊?借款?,在第一次?借款?还本付息得到王某戊的信任后,又于2016年8月1日,以同样的手段向王某戊?借款?2万某一直未还,王某戊从两次?借款?中得到利息1200元的事实;\n36.被害人王某己的陈述及借条复印件一份,证明2016年9月8日,上诉人齐某以其买车,需要借钱?倒手?为由,以给付高额利息为条件,向王某己夫妇(丈夫任某)?借款?2万某一直未还,王某己夫妇从?借款?中未得到利息的事实;\n37.被害人杨某丁的陈述及借条复印件两份,证明2015年11月25日至2016年1月11日,上诉人齐某以其?贩草?,扩大办学等需要资金周转为由,以给付高额利息为条件,向杨某丁分三次?借款?3.3万某,后在杨某丁的催要下,上诉人齐某及其前妻向杨某丁归还2.5万某,剩余8000元一直未还,杨某丁从三次?借款?中得到利息2800元的事实;\n38.被害人宣某某的陈述及借条复印件一份,证明2016年8月29日,上诉人齐某以其资金紧张,需借钱?倒手?(周转)为由,以不收宣某某两个孩子补课费(约700元)为条件,向宣某某?借款?1万某一直未还的事实;\n39.证人王某庚证言,证明其为上诉人齐某的前妻,两人于2016年11月24日办理离婚手续。德某教育有四个办学点,龙城世家的办学点自2015年6月开办一直保持经营,2016年开办的三贤居办学点、金某办学点、体育馆办学点只经营到2016年暑假期间,每个学生收取300到600元不等的学费,最多时同时有200名左右的学生接受辅导。其于2015年9月开始接触德某教育龙城世家办学点的管理事务,公司并没有账务。2016年11月经人讨债开始其才知晓上诉人齐某大量举债的情况,其在上诉人齐某向部分被害人?借款?时还签字担保,其给杨某归还了7000元,给杨某丁归还了2000元的事实;\n40.证人李某乙证言,证明李某乙曾为上诉人齐某开办的德某教育的代课老师,其代课2个月得到4000元报酬,在其代课期间,也承担招生、收取学生学费等工作,德某教育没有规范的收支台账,并且其中一半的代课老师都是上诉人齐某亲戚朋友家的没有经验的高三毕业学生的事实;\n41.证人齐某某证言,证明上诉人齐某系其儿子,其知晓齐某与儿媳王某庚在隆德县城开办德某教育补习班并向多人?借款?的情况,但不清楚上诉人齐某把钱花在了什么地方,听王某庚说德某教育每年纯盈利有近10万某的事实;\n42.证人李某丙证言,证明其认识被告人齐某,并承办过德某教育宣传活动的摄像业务,2016年8月,上诉人齐某在其处购买了价值8000元的演唱会门票,其中4000元未支付的事实;\n43.证人马某丙证言,证明其为上诉人齐某开办德某教育龙城世家办学点的房东,2015年6月起的第一年的房租为1.1万某,第二年的房租为1.5万某,第三年的房租为2.3万某,上诉人齐某向其借过1万某已还清,未收取利息的事实;\n44.上诉人齐某的供述和辩解,证明其于2015年6月在??县店创办了教育咨询机构,名字叫?宁夏德某教育咨询有限公司?,共花了6万某,主要业务是给小学、初中生做暑寒假学习培训。其在隆德共办了包括龙城世家A区在内的四个办学点,总投资约10万某。刚开始每月纯利润5、6千元,持续到第三个月的时候,由于借他人高利贷太多,公司开始负债经营,没有办法,其只能再借高利贷,借款都是以办?德某教育?、办学、资金周转不开的理由借的,并承诺高额利息,债主共30余人,总负债大约150万某,没有详细的账务记录,对于巨额资金的去向不能说明,2016年11月,因为欠债太多,出去到外面躲债去了,原来手机号码就没有再用。带辅导班主要讲文化课,另外讲快速记忆法,其的快速记忆法是通过自己看资料、自己摸索学,在北京跟一个老师培训过几天的事实;\n45.现场勘验检查工作记录,证明?宁夏德某信息咨询有限公司?位于??县店铺,共上下两层,一层有简单陈设和电脑显示器,二层有桌椅数张和小黑板,未见有其他贵重设备的事实;\n46.上诉人齐某及其近亲属的银行账户信息,证明上诉人齐某在隆德县农村商业银行有开户信息,账户无大额收支记录,截止2017年3月14日,账户余额为0元;上诉人齐某在??县银行无开户信息;上诉人齐某在隆德县建设银行有开户信息,账户无大额收支记录,截止2017年3月14日,账户余额为1.5元;上诉人齐某在隆德县农业银行有开户信息,账号分别为???、???无大额收支记录,截止2017年3月10日,???的账户余额为0.55元;???的账户余额为0.62元;王某庚和齐某某在上述部分银行有开户信息,但均无大额收支记录,账户仅有少量余额的事实;\n47.上诉人齐某所有的不动产情况证明,证明上诉人齐某和王某庚在隆德县不动产登记事务中心无不动产登记信息。\n
以上证据来源合法、内容客观真实、与本案具有关联性,证据间能够相互印证,证据确实、充分,能够客观证明本案犯罪事实。二审中,上诉人齐某无新的证据向本院提交,本院对以上证据予以确认。\n", "summary": "根据刑法第266条,判处)齐某诈骗罪"}
+{"content": "公诉机关指控,2017年1月13日15时20分许,被告人李某甲驾驶鲁Q?????普通二轮摩托车沿沭赣线由北向南行驶,行至临沭县沭赣线玉山镇姚官庄南时,与前方顺行向东转弯李某乙驾驶的鲁Q?????小型汽车相撞,致李某乙、李某甲受伤,车辆部分受损。2017年2月10日,经临沂市公安局检验鉴定李某甲血液中乙醇含量为155.7mg/100ml。属醉酒驾驶机动车。\n", "summary": "根据刑法第133条,判处李某甲危险驾驶罪"}
+{"content": "公诉机关指控:2017年4月12日,被告人刘某甲到与其合租的被害人胡某房间找东西时,在胡某的抽屉发现一条黄金手链,后刘某甲将手链盗窃后出卖,被盗黄金手链重17.1克,价值人民币5000元。案发后,刘某甲亲属赔偿胡某损失人民币7000元,胡某出具谅解书,对刘某甲的行为表示谅解。\n", "summary": "根据刑法第264条,判处刘某甲盗窃罪"}
+{"content": "临沭县人民检察院指控:2018年4月4日22时20分许,被告人高某驾驶鲁Q?????号?猎豹?牌小型越野客车沿临沭县城夏庄西街由东向西行驶,行至夏庄西街与光明路口东处时,将行人王某蕊撞致颅脑损伤死亡。被告人高某行车未按照操作规范安全行驶,负事故的主要责任。\n公诉机关为证明上述事实提供了证人证言、书证、鉴定意见、抓获经过、户籍信息等书证,被告人的供述等。\n公诉机关认为,被告人高某违反交通运输管理法规,致一人死亡,其行为触犯了《中华人民共和国刑法》××之规定,应当以××追究其刑事责任。\n被告人高某对公诉机关指控的犯罪事实自愿认罪,未作辩解。\n", "summary": "根据刑法第133条,判处高某交通肇事罪"}
+{"content": "经审理查明,2015年11月至2016年3月,被告人王2某驾驶一黑色弯梁二轮摩托车先后来到临沭县石门镇、曹庄镇、南古街盗窃作案三起,涉案价值共计41000余元。分述如下:\n1、2015年11月29日,被告人王2某来到石门镇X村陈某秀经营的X窗帘服饰店二楼卧室,盗窃现金4000元及黄金项链一条、吊坠一个,涉案价值9713元。\n2、2015年12月3日,被告人王2某来到曹庄镇X家电车辆商场,盗窃现金1000余元。\n3、2016年3月10日,被告人王2某来到郑山街道南古街X家电商场二楼卧室,盗窃现金26000余元。\n上述事实,被告人王2某在开庭审理过程中亦无异议,并有被害人陈某秀、沈某、王某光的陈述,受案登记表、指认现场照片、公安行政处罚决定书、劳动教养决定书、刑事判决书、到案经过、户籍信息等证据证实,足以认定。\n", "summary": "根据刑法第264条,判处王2某盗窃罪"}
+{"content": "山东省临沭县人民检察院指控,2018年4月8日15时41分许,被告人陈某驾驶鲁Q?????号?解放?牌轻型普通货车行驶至临沭县青云镇李埠子村中心街时,因涉嫌酒后驾驶机动车被临沭县交警大队民警查获。2018年4月23日,经临沂市公安局刑事技术检验陈某血液中的乙醇含量为97.0mg/100ml。属醉酒驾驶机动车。公诉机关认为,被告人陈某在道路上醉酒驾驶机动车,其行为触犯了《中华人民共和国刑法》××××之规定,应当以××追究其刑事责任。\n", "summary": "根据刑法第133条,判处陈某危险驾驶罪"}
+{"content": "公诉机关指控,2018年5月17日23时25分许,被告人张某驾驶鲁Q?????号?东风?牌小型面包车行驶至临沭县沭赣路南古派出所处时与他人发生争执,临沭县公安局南古派出所接报警后到场处置,发现其涉嫌酒后驾驶机动车,遂通知临沭县交警大队进行处理。2018年5月23日,经临沂市公安局刑事技术检验张某血液中的乙醇含量为180.1mg/100ml,属醉酒驾驶机动车。\n", "summary": "根据刑法第133条,判处张某危险驾驶罪"}
+{"content": "公诉机关指控:2017年10月18日,被告人杜某甲以8000元的价格购买了高某甲位于临沭县临沭镇后高湖村村西河东岸的杨树64棵,19日至20日,杜某甲未经林业部门核准并办理林木采伐许可证,雇佣工人将购买的杨树全部采伐。经测量,被采伐的树木总蓄积20.2636立方米。\n", "summary": "根据刑法第345条,判处杜某甲滥伐林木罪"}
+{"content": "山东省临沭县人民检察院指控:\n1、2014年12月份的一天,被告人徐某以1000元价格收购张2某、陈猛(均已判刑)盗窃被害人胡某甲的三轮电动车一辆,该车价值1700余元。\n2、2014年12月份的一天,被告人徐某以1600元价格收购张2某、陈猛盗窃被害人崇乾涛的雅马哈摩托车一辆,该车价值4000元。\n3、2014年11月份的一天,被告人徐某以3300元价格收购张2某、陈猛盗窃被害人胡某乙的三轮电动车一辆,该车价值5000余元。\n", "summary": "根据刑法第312条,判处徐某掩饰、隐瞒犯罪所得、犯罪所得收益罪"}
+{"content": "临沭县人民检察院指控,2018年4月20日20时50分许,被告人朱某伟驾驶?大众?牌小型轿车行驶至临沭县城沭河大街与冠山路交汇处时,因涉嫌酒后驾驶机动车被临沭县交警大队民警查获。2018年5月7日,经临沂市公安局刑事技术检验朱某伟血液中乙醇含量为125.9mg/100ml。属醉酒驾驶机动车。\n公诉机关为证明上述事实提供了鉴定意见;书证;被告人朱某伟的供述与辩解等。\n公诉机关认为,被告人朱某伟在道路上醉酒驾驶机动车,其行为触犯了《中华人民共和国刑法》××三十三条××之规定,犯罪事实清楚,证据确实充分,应当以××追究其刑事责任。\n", "summary": "根据刑法第133条,判处朱某伟危险驾驶罪"}
+{"content": "临沭县人民检察院指控:2017年9月份以来,被告人王某军在山东X肥业有限公司任销售员期间,利用职务上的便利,违反公司规定,让公司客户将购买肥料的货款打到自己的私人账户,侵占公司财产共计341500元。分述如下:\n1、2017年9月18日,被告人王某军让公司客户张某全把货款15万元转到自己的个人银行账户。后该款被王某军挥霍。\n2、2017年12月3日,被告人王某军让其公司客户董某文把货款126500元转到其个人银行账户。后王某军将该款挥霍。\n3、2018年1月21日,被告人王某军让其公司客户肖某把货款65000元转到其个人银行账户。后王某军将该款挥霍。\n为证明上述指控,公诉人当庭出示并宣读了书证,证人张某全、董某文、肖某等人的证言,接受刑事案件登记表、银行账户交易信息等书证;被告人的供述与辩解等证据。\n公诉机关认为,被告人王某军利用职务上的便利,将本单位财务非法占为己有,数额较大,其行为侵犯了公私财产权,触犯了《中华人民共和国刑法》××××之规定,应当以××追究其刑事责任。\n", "summary": "根据刑法第271条,判处王某军职务侵占罪"}
+{"content": "临沭县人民检察院指控:2018年4月12日12时55分许,被告人杨某乐驾驶鲁Q?????号?雪佛兰牌?小型轿车沿临沭县青石路由南向北行驶,行至白旄西街路口时,车辆撞至路边张某全、高某希,致张某全死亡,高某希受伤,车辆及周围部分物品受损。经认定杨某乐无驾驶资格驾驶机动车、未确保安全车速,负事故全部责任。\n公诉机关为证明上述事实提供了证人证言、书证、鉴定意见、抓获经过、户籍信息等书证,被告人的供述等。\n公诉机关认为,被告人杨某乐违反交通运输管理法规,致一人死亡,其行为触犯了《中华人民共和国刑法》××之规定,应当以××追究其刑事责任。\n被告人杨某乐对公诉机关指控的犯罪事实自愿认罪,未作辩解。\n", "summary": "根据刑法第133条,判处杨某乐交通肇事罪"}
+{"content": "临沭县人民检察院指控,2017年10月15日20时许,被告人陈某波无证驾驶?开瑞?牌小型面包车行驶至临沭县城沭新大街与冠山路交汇处时,因涉嫌酒后驾驶机动车被临沭县交警大队民警查获。2017年11月9日,经临沂市公安局刑事技术检验陈某波血液中乙醇含量为159.7mg/100ml。属醉酒驾驶机动车。\n公诉机关为证明上述事实提供了鉴定意见;书证;被告人陈某波的供述与辩解等。\n公诉机关认为,被告人陈某波在道路上醉酒驾驶机动车,其行为触犯了《中华人民共和国刑法》××三十三条××之规定,犯罪事实清楚,证据确实充分,应当以××追究其刑事责任。\n", "summary": "根据刑法第133条,判处陈某波危险驾驶罪"}
+{"content": "山东省临沭县人民检察院指控,2015年7月,被告人王某用开始从金沂蒙集团收购生产废料高碳醇,同年8月份以来,王某用许诺给被告人李某恩(原金沂蒙集团蒸馏工段长)好处费,李某恩采取将应当留存的物料私自混入和故意缩短蒸馏时间的方式,使王某用收购的高碳醇中含有较多高价值的异戊醇、正丙醇、异丁醇。后王某用将收购的高碳醇高价卖给王某献和王某伍,王某用非法获利50000余元,李某恩收受王某用好处费16000元。被告人王某用、李某恩以上述方式侵占金沂蒙集团财物价值83803.22元。\n公诉机关认为,被告人王某用、李某恩利用职务上的便利,将本单位财物非法占为己有,数额较大,其行为触犯了《中华人民共和国刑法》××××之规定,犯罪事实清楚,证据确实充分,应当以××追究其刑事责任,根据《中华人民共和国刑事诉讼法》××的规定,提起公诉,请依法判处。\n被告人王某用对公诉机关指控的犯罪事实及罪名均无异议,且自愿认罪。\n被告人李某恩对公诉机关指控的犯罪事实及罪名均无异议,且自愿认罪。\n", "summary": "根据刑法第271条,判处王某用李某恩职务侵占罪"}
+{"content": "公诉机关指控,2018年2月3日,被告人梁某及陈2某(另案处理)夫妻二人组织庞某、李某、刘某、官某及陈1某(另案处理)等人在临沭县郑山街道郑山村梁某的家中加工病死猪肉,办案民警在现场共扣押病死猪肉2200公斤。经临沭县畜牧局检验,现场猪肉均为病死猪产品。\n公诉机关提供了鉴定意见;书证;被告人梁某、官某、刘某、庞某、李某的供述与辩解等证据。\n公诉机关认为,被告人梁某、官某、刘某、庞某、李某涉嫌生产不符合安全标准的食品,足以造成严重食品中毒事故或者其他严重食源性疾病,其行为均已触犯了《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实充分,应当以生产不符合安全标准的食品罪追究五被告人的刑事责任。被告人梁某系累犯,应当依照《中华人民共和国刑法》××的规定处罚。\n", "summary": "根据刑法第143条,判处官某梁某李某庞某刘某生产、销售不符合安全标准的食品罪"}
+{"content": "临沭县人民检察院指控,2018年5月24日14时45分许,被告人吴某东驾驶?昊锐?牌小型轿车行驶至临沭县城沭新大街与冠山路交汇处时,因涉嫌酒后驾驶机动车被临沭县交警大队民警查获。2018年6月4日,经临沂市公安局刑事技术检验吴某东血液中乙醇含量为184.4mg/100ml。属醉酒驾驶机动车。\n公诉机关为证明上述事实提供了鉴定意见;书证;被告人吴某东的供述与辩解等。\n公诉机关认为,被告人吴某东在道路上醉酒驾驶机动车,其行为触犯了《中华人民共和国刑法》××三十三条××之规定,犯罪事实清楚,证据确实充分,应当以××追究其刑事责任。\n", "summary": "根据刑法第133条,判处吴某东危险驾驶罪"}
+{"content": "临沭县人民检察院指控,2016年11月20日21时许,被告人吴某涛驾驶二轮摩托车,沿临沭县中山路道路东侧由北向南行驶,行至事故地点时,与对向行驶的张某龙驾驶的小型轿车相撞,致吴某涛受伤,车辆部分受损。2016年12月5日,经临沂市公安局刑事技术检验吴某涛血液中乙醇含量为132.4mg/100ml。属醉酒驾驶机动车。\n公诉机关为证明上述事实提供了现场勘查笔录;鉴定意见;书证;证人张某龙、杨某的证言;被告人吴某涛的供述与辩解等。\n公诉机关认为,被告人吴某涛在道路上醉酒驾驶机动车,其行为触犯了《中华人民共和国刑法》××三十三条××之规定,犯罪事实清楚,证据确实充分,应当以××追究其刑事责任。\n", "summary": "根据刑法第133条,判处吴某涛危险驾驶罪"}
+{"content": "公诉机关指控,2017年6月13日被告人焦某从贵州窜至临沭县城戴着口罩、手套在街上转悠,至6月14日凌晨本人步行来到临沭县阳光丽景19号楼3单元,自己用其事先自制的文件夹塑料卡片,在该单元开了五、六家的入户门没开开,来到902室门口将该住户张某家入户门开开后,进入其家中,在其家中未找到值钱的东西,离开时将储物架上的车钥匙(宝某车510,车牌号鲁Q?????)偷走,后到楼下用其盗窃的车钥匙将张某停放在楼下的汽车门打开,将张某放在车内副驾驶座上的塑料包内的零钱及放在工具箱里面的面值100元的现金盗走,将其车钥匙放在副驾驶座上后离开,共盗走现金17000元。\n公诉机关提供了被告人供述、被害人陈述、现场勘验笔录、书证、现场指认照片等证据。\n公诉机关认为,被告人焦某入户盗窃他人财物,数额较大,其行为侵犯了公民财产权,触犯《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实充分,应当以××追究其刑事责任。被告人焦某系累犯,应当依照《中华人民共和国刑法》××的规定处罚。\n", "summary": "根据刑法第264条,判处焦某盗窃罪"}
+{"content": "山东省临沭县人民检察院指控:2017年10月17日22时许,被告人陈某楠驾驶黑色尼桑轿车行至临沭县X批发市场附近时,与步行经过的被害人官某国发生口角并互相厮打,后当晚与陈某楠一同吃饭的张某得(另案处理)等人赶至现场,官某国遂逃跑至临沭县X公寓,被告人陈某楠和张某得进小区找到官某国后,张某得对官某国实施殴打,陈某楠、张某得等人强行将官某国带上其黑色尼桑轿车,后陈某楠驾车将官某国带至临沭县X水库附近和临沭县X投资公司,在临沭县X投资公司,陈某楠、张某得向官某国索要55000元赔偿金。因官某国未借到钱,张某得强迫官某国在两份借款合同和一份欠条(金额均为10万元)上面签字,后陈某楠、张某得驾车将官某国带至临沭县X商务宾馆,于10月18日8时许让其离开。\n针对上述事实,公诉人当庭宣读并出示了鉴定意见;书证;证人证言;被害人陈述;被告人陈某楠的供述与辩解等。\n公诉机关认为,被告人陈某楠以非法占有为目的,敲诈勒索他人财物,数额较大,其行为触犯了《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实、充分,应当以××追究其刑事责任。被告人陈某楠已经着手实行犯罪,由于其意志以外的原因而未得逞,系犯罪未遂,应当依照《中华人民共和国刑法》××之规定处罚。\n", "summary": "根据刑法第274条,判处陈某楠敲诈勒索罪"}
+{"content": "公诉机关指控,2017年12月9日20时30分许,被告人张某驾驶鲁Q?????小型普通客车沿临沭县225线省道由南向北行驶,行至奥德燃气公司路口时,撞至前方顺行的临沭县郑山街道郑山村村民庞2某驾驶的电动三轮车,致庞2某受伤,后经医院抢救无效死亡。经认定被告人张某负事故全部责任。\n公诉机关提供了勘查笔录;鉴定意见;书证;证人证言;被告人张某的供述与辩解等证据。\n公诉机关认为,被告人张某违反交通运输管理法规,因而发生重大事故,致一人死亡,其行为危害了公共安全,触犯了《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实充分,应当以××追究其刑事责任。\n", "summary": "根据刑法第133条,判处张某交通肇事罪"}
+{"content": "公诉机关指控:2017年6月份以来,被告人李某林、郭某在临沭县惠民街水果批发市场西陆续租用十余间沿街店铺,将租用的沿街店铺作为按摩房,购置卖淫设施、提供卖淫工具,招募方某懿、肖某荣、丁某英、孙某等人在按摩房内从事卖淫,由郭某负责日常管理,并规定了每次卖淫的时间和价格,购买了对讲机,编制暗号,郭某、李某林使用对讲机为卖淫女站岗、望风,并抽取嫖资的百分之三十分成。同年12月14日被公安机关查获。\n", "summary": "根据刑法第358条,判处李某林郭某组织、强迫、引诱、容留、介绍卖淫罪"}
+{"content": "靖安县人民检察院指控:\n2017年11月15日凌晨3时左右,吸毒人员黄某1通过微信联系被告人江某想购买毒品,之后被告人江某在其租住的位于靖安县种子公司大院住所出售了约0.3克冰毒和一粒麻果给黄某1,约定200元毒资之后再给,黄某1当场在被告人江某家中将所买毒品吸食完后离开。当天凌晨6时许,吸毒人员黄某1再次来到被告人江某租住的位于靖安县种子公司大院住所要求买毒品,被告人江某当场出售了约0.3克冰毒和一粒麻果给黄某1,约定200元毒资之后再给。当天13时27分,黄某1通过微信将400元毒资转给了被告人江某。2017年11月21日侦查机关在办案过程中在被告人江某租住房间内查获其0.6克冰毒。\n针对上述事实,公诉机关当庭出示、宣读了如下证据:1、查获扣押的毒品等物证;2、被告人的户籍信息、前科等书证;3、辨认笔录、检查笔录、毒品称量笔录;4、(宜)公(司)鉴(化)字[2017]246号检验报告等鉴定意见;5、证人黄某1、舒某1、黄某2的证言;6、被告人江某的供述和辩解;7、微信转账记录等电子证据;8、现场检测报告书等证据材料。\n公诉机关认为,被告人江某以贩养吸,两次向吸毒人员黄某1出售毒品甲基苯丙胺和甲基苯丙胺片剂,其行为触犯了《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实、充分,应当以贩卖毒品罪追究其刑事责任。被告人江某在刑罚执行完毕五年之内再犯应当判处××以上刑罚之罪,系累犯,应当从重处罚,同时适用《中华人民共和国刑法》××××之规定。\n被告人江某对公诉机关指控的犯罪事实无异议。\n", "summary": "根据刑法第347条,判处江某走私、贩卖、运输、制造毒品罪"}
+{"content": "内黄县人民检察院指控,2010年5月,被告人何2某、王某某为使公司顺利运转预谋向社会融资,经他人介绍宣传该公司后以公司缺少资金为由、以月息3分和向储户返点为诱饵多次在安阳市区非法吸收公众存款167户208笔980.43万元,2011年11月,二被告人分两次非法吸收内黄县城关镇刘五桥存款40万元,造成损失594.39万元。\n", "summary": "根据刑法第176条,判处王某某非法吸收公众存款罪"}
+{"content": "澧县人民检察院起诉指控:2017年5月16日20时许,被告人汪某饮酒后驾驶湘J?????号小型汽车由西往东行驶至圃园路湘北钢材路段时,被澧县公安局交通警察大队执勤民警查获。经抽血检测:被告人汪某血液中乙醇含量为94.7mg/100ml,系醉酒驾驶机动车。\n", "summary": "根据刑法第133条,判处汪某危险驾驶罪"}
+{"content": "白银市平川区人民检察院指控,2017年10月14日0时许,被告人魏某酒后驾驶???号小轿车沿白银市平川区仿古街由北向南行驶至仿古街南口时,被执勤民警查获。现场对其进行呼气式酒精测试后,民警将魏某带至平川区人民医院抽取血样。经白银市公安司法鉴定中心鉴定,魏某血样中乙醇含量为81.93mg/100ml。\n公诉机关认为,被告人魏某的行为已触犯了刑法,构成××,依法应予惩处,建议对其判处一个月至二个月××,并处罚金4000元至5000元。\n", "summary": "根据刑法第133条,判处魏某危险驾驶罪"}
+{"content": "白银市平川区人民检察院指控,2017年9月1日11时许,吸毒人员何某通过微信联系被告人张某欲购买毒品。何某通过微信向张某转账1300元,后二人一起到兰州购买毒品。到兰州后张某又联系了马某购买毒品,并通过微信向马某转账1300元,购买2克毒品。张某与何某根据马某的指示在兰州小西湖一居民楼楼道取上外用卫生纸包裹,内用塑料袋包裹的白色粉末状毒品1小包。拿到毒品后二人吸食了一部分,何某又从中分出一部分毒品给张某作为好处。二人返回平川后张某打电话叫来王某,三人在名仁酒店8710房间吸食毒品时,被公安人员当场抓获。从张某身上查获毒品疑似物1包,从8710号房间床下查获毒品疑似物1包。经鉴定,从张某处查获毒品疑似物净重0.28克,从床下查获的毒品疑似物净重1.17克,从中均检出毒品海洛因。\n公诉机关认为,被告人张某非法向他人出售毒品海洛因,其行为已构成贩卖毒品罪,依法应予惩处。因其具有自愿认罪的情节,建议对其判处一年以下××,并处罚金。\n", "summary": "根据刑法第347条,判处张某走私、贩卖、运输、制造毒品罪"}
+{"content": "白银市平川区人民检察院指控,2015年6月30日,被告人刘某驾驶车号为???银灰色长城轿车在白银市银光十字拉载被害人石某送往平川区共和镇小水村过程中,将石某新购买尚未使用的苹果派iphone6plus手机秘密窃取,案发第二天刘某插卡使用手机。经平川区价格认证中心鉴定,手机价值5194元。\n2017年4月8日刘某在庆阳市西峰区肖金镇被抓获。\n针对上述指控,公诉机关提供了物证、书证、被害人陈述、被告人供述、证人证言、辨认笔录及照片、鉴定意见等证据。公诉机关认为,被告人刘2某秘密窃取他人财物,数额较大,其行为已触犯了刑法,构成××,依法应当惩处。因其具有累犯的从重处罚情节,建议对刘2某判处十个月以下××,并处罚金。\n", "summary": "根据刑法第264条,判处刘某盗窃罪"}
+{"content": "白银市平川区人民检察院指控,2017年4月10日凌晨1时30分许,被告人王某1、李某在平川区乐雅路赵某公馆慢摇吧卫生间内,将桂某借故殴打后,带至桂某所在包厢,与包厢内其他人员无故发生争吵,周某上前劝阻时被王某1推倒在地,陈某用啤酒瓶殴打黄某头部,致其头破流血,在场的善1某海见状上前时被陈某用啤酒瓶击打头部倒地后,继而被陈某、李某、王某1等人踢打致其受伤住院治疗。期间,被告人李某还无故将冯某鼻部击打流血。善1某海伤经医院诊断为:1、头部外伤、头皮裂伤、头皮血肿;2、鼻骨骨折;3、左肩部软组织损伤。经鉴定,善1某海鼻骨骨折构成轻伤二级,左眼钝挫伤、头部损伤均构成轻微伤。黄某头部损伤构成轻微伤。\n2017年5月31日,被告人陈某主动投案,并如实供述了犯罪事实。\n案发后,被告人王某1、李某、陈某各赔偿被害人善1某海损失1万元并取得其谅解;陈某赔偿被害人黄某损失6千元并取得其谅解。\n针对上述指控,公诉机关提供书证、被害人陈述、证人证言、被告人供述等证据。\n公诉机关认为,被告人王某1、李某、陈某随意殴打他人,情节恶劣,其行为已触犯了刑法,构成××。被告人王某1具有如实供述、赔偿损失并取得谅解、前科劣迹的量刑情节,建议判处其一年六个月以上三年以下××;被告人李某具有如实供述、赔偿损失并取得谅解的量刑情节,建议判处其一年六个月以上三年以下××;被告人陈某具有自首、赔偿损失并取得谅解的量刑情节,建议判处其一年六个月以上三年以下××。\n", "summary": "根据刑法第293条,判处李某王某1陈某寻衅滋事罪"}
+{"content": "公诉机关指控并经审理查明,被告人聂某与李某于2008年2月28日在修文县久长镇登记结婚,并与李某生育两个孩子。2015年1月,被告人聂某在贵阳市一酒吧与贵阳市白云区牛场乡落刀村女村民孙某相识,并与孙某交往。后被告人聂某在未与李某解除婚姻关系的情况下,在贵阳市白云区牛场乡落刀村和孙某以夫妻关系共同生活,致孙某怀孕于2016年1月17日在贵阳市和谐阳光医院产下一女。孙某在贵阳市和谐阳光医院生产期间,被告人聂某以丈夫名义在剖宫产手术知情同意书等手续上签字。\n2016年8月25日,被告人聂某到修文县公安局投案自首。\n", "summary": "根据刑法第258条,判处聂某重婚罪"}
+{"content": "青海省海西州西部矿区人民检察院指控,2017年3月21日2时43分被告人赵某酒后驾驶小型普通客车行驶至国道315线1186公里处时,与前方同向行驶的重型半挂牵引车发生追尾碰撞。经茫崖行委公安局交警大队对其进行呼气式酒精检测(检测结果为45mg/100ml),并将被告人带至茫崖行委人民医院进行抽取血样后将血样送青海省公安厅刑事科学技术研究管理中心进行鉴定,经鉴定,被告人赵某血液中检测出乙醇成分,血醇浓度为157.9mg/100ml。\n针对上述指控的犯罪事实,公诉机关当庭出示了书证、鉴定意见,现场勘验、检查、辨认笔录及被告人供述等证据,据此认为被告人赵某醉酒后驾驶机动车,其行为已触犯《中华人民共和国刑法》××××之规定,构成××,特提起公诉,要求依法予以惩处。同时发表公诉意见称,被告人赵某具有自首情节,可从轻处罚,建议对其判处一至三个月××,并处罚金,可适用××。\n", "summary": "根据刑法第133条,判处赵某危险驾驶罪"}
+{"content": "青海省海西州西部矿区人民检察院指控,2016年10月20日18时左右,被告人王1某驾驶蓝色切诺基机动车从大都会往修理一条街行驶,当行驶至物资装备公司门前公路时与赵某驾驶的比某小型轿车发生侧面碰撞,并将物资装备公司门前灯箱及门柱撞坏。事发后茫崖行委公安局对其进行抽取血样后送青海省公安厅刑事科学技术研究管理中心进行鉴定。经鉴定,被告人王1某血液中检出乙醇成分,血醇浓度为281.1mg/100ml。\n针对上述指控的事实,公诉机关当庭出示了书证、证人证言、血样采集情况说明、理化检验鉴定意见书、现场勘验检查工作记录及被告人供述等证据,据此认为被告人王1某醉酒后驾驶机动车辆,其行为已触犯《中华人民共和国刑法》××××××规定,构成××,特提起公诉,要求依法惩处。同时发表公诉意见称,鉴于被告人王1某如实供述犯罪事实,可从轻处罚。但其血液血醇浓度为281.1mg/100ml,情节严重;且无证驾驶,应从重处罚,建议对被告人王1某判处四个月以下××并处罚金。\n", "summary": "根据刑法第133条,判处王1某危险驾驶罪"}
+{"content": "公诉机关指控:2014年6月30日20时30分许,糜某3在湖口县金砂湾九钢三期宿舍15栋下楼时,在四楼楼梯口遇见被告人段某某,段某某以糜某3不该看他为由,两人发生口角,糜某3便找来其堂哥糜某1及哥哥糜某2找段某某理论,糜某2与段某某又发生口角。因段某某喝了酒,糜某2便离开并来到三期宿舍对面麻将馆。后段某某又打电话给糜某2问明其所在位置后,于当晚21时许携带空啤酒瓶,带领袁某(已判决)以及携带三、四十厘米长刀具的易某(已判决)等人来到三期宿舍对面的麻将馆找糜某2。糜某2接到段某某电话后,从麻将馆的厨房找来一把菜刀放在口袋里以备段某某来打架。段某某、袁某、易某等人找到糜某2后,段某某又与糜某2发生言语冲突,继而双方发生打架。糜某2被段某某持啤酒瓶、易某持刀致伤后逃脱,后被糜某1等人送至湖口县中医院进行治疗。易某和段某某被糜某2所持的菜刀划伤且追赶糜某2未果后,段某某、易某、袁某三人便往主干道上走,随后碰到了同事刘某,就乘坐刘某的车去湖口县中医院治疗。到湖口县中医院后,段某某一直在中医院四楼进行包扎、手术,直至第二天才得知袁某用刀将糜某1捅伤的事实。经湖口县石钟司法鉴定中心鉴定,糜某2的颈部、左前臂尺侧、右前臂尺侧、背部均见不同长度划痕,其损伤程度评定为轻微伤;易某双上肢切割伤并右尺神经断裂,其损伤程度评定为轻伤二级;糜某1的左中腹开放性刀刺伤,空肠穿孔并系膜血肿,右上臂刀刺伤并肌腱断裂,左胸壁刀刺伤,其损伤程度评定为重伤二级。\n为证实上述指控,公诉机关向法庭出示了如下证据:1、书证:抓获经过、临时羁押证明、刑事判决书、常住人口信息;2、证人易某、袁某、糜某3、糜某1、刘某、邓某1、郭某、邓某2的证言;3、被告人段某某的供述与辩解;4、检查、辨认笔录;5、鉴定意见。\n公诉机关认为,被告人段某某为逞强耍横无事生非,并借此日常生活中的偶发矛盾而伙同他人持凶器随意殴打他人,情节恶劣,其行为触犯了《中华人民共和国刑法》××××第(一)项之规定,犯罪事实清楚,证据确实、充分,应当以××追究其刑事责任,请依法判处。\n被告人段某某对公诉机关指控无异议并自愿认罪。\n", "summary": "根据刑法第293条,判处段某某寻衅滋事罪"}
+{"content": "公诉机关指控:2014年,被告人周2某作为湖口县武山镇武联村党支部书记,暂管武联村下属小组村民的部分征地款。同年12月19日,因其表亲曹某在都昌县开发的房地产项目资金周转困难,周2某从其暂管的农商银行征地款账户,转50万元至曹某的合伙人用于资金周转。2015年1月6日,曹某通过其儿媳妇吴某的账户归还周2某505000元人民币,其中5000元为感谢费。\n公诉机关认为,被告人周2某利用其职务上的便利,挪用公款借给他人进行营利活动,数额较大,其行为触犯了《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实、充分,应以××追究其刑事责任。提请依法判处。\n", "summary": "根据刑法第384条,判处周2某挪用公款罪"}
+{"content": "公诉机关指控:2017年6月份至案发当晚(2017年7月21日),被告人王2某伙同被告人王1某及王某3(在逃)、史某(又名?现辉?,在逃)、王某4(外号?胖墩?,在逃)等人多次在湖某流泗镇神华电厂内的江西水电运转房、山东电建除尘工地等地利用傍晚下班之机,用剪线钳将电缆线剪成小段后藏在衣服里带出神华电厂大门,并将盗窃的电缆线立即全部卖给武某(另案处理)在湖某流泗镇杨山村经营的废品回收站。其中,被告人王2某伙同他人多次盗窃,所盗电缆线变卖后共获利9000余元;被告人王1某单独或伙同他人盗窃五次,所盗电缆线变卖后共获利1582元;2017年7月21日盗窃的电缆线价值2289元。\n公诉机关认为,被告人王2某和被告人王1某多次伙同他人盗窃电缆线,数额较大,其行为均构成××,应追究其刑事责任。被告人王1某案发后主动投案,并如实供述自己的罪行,属自首,可以从轻或减轻处罚。提请依法判处。\n", "summary": "根据刑法第264条,判处王1某王2某盗窃罪"}
+{"content": "绥江县人民检察院指控,2012年至2016年,被告人黄某某在担任绥江县南岸镇经济发展办公室主任期间,利用职务之便,在绥江县南岸镇互助村?石互公路?(石梁子至互助村委会)修建过程中,为他人谋取利益,以收取工作经费的名义多次收受他人贿赂,共计人民币53000元。1、2012年,绥江县南岸镇互助村村委会主任张某和副主任蒋某,为争取更多的扶贫经费修建?石互路?,在绥江县南岸镇政府办公楼黄某某的办公室,将事先准备好的13000元人民币送给黄某某。2、2013年4月左右,蒋某为了感谢黄某某对?石互路?修建的关心,在南岸镇政府办公楼黄某某的办公室,将事先准备好的5000元人民币送给黄某某。3、2013年底,张某为了感谢黄某某对?石互路?修建的关心,以及希望黄某某能为互助村争取更多的扶贫经费,在南岸镇政府办公楼黄某某的办公室,将事先准备好的5000元人民币送给黄某某。4、2015年年初,?互助路?项目资金划拨完毕后,张某为感谢黄某某的关心,通过以绥江县农业银行转账的方式,将20000元好处费转账至黄某某的农业银行账户。5、2016年年初,张某为了感谢黄某某对南岸镇互助村的关心与照顾,以绥江县农业银行转账的方式,将10000好处费转账至黄某某的农业银行账户。针对上述指控事实,公诉机关当庭出示了到案经过、证人证言、查封扣押财物清单、户口证明、被告人供述及相关书证予以证明。公诉机关认为,被告人黄某某的行为已构成××,鉴于其主动投案,并如实供述犯罪事实,成立自首,部分赃款用于购置办公用品、积极退赃,建议从轻处罚。\n被告人黄某某对公诉机关指控的事实及证据无异议,请求免予刑事处罚。\n", "summary": "根据刑法第385条第383条第386条,判处黄某某受贿罪"}
+{"content": "绥江县人民检察院指控,2017年6月,被告人郑某因知晓熊某1家有古董,便与被告人肖某商议到熊某1家盗窃。后被告人肖某邀约被告人杨1某,被告人杨1某又邀约被告人胡1某共同实施盗窃。2017年6月21日,被告人郑某、肖某、杨1某、胡2某邹某驾驶的川Q?????号长安牌轿车到达绥江县城,并由郑某指路到绍廷村熊某1家踩点。当晚,被告人郑某、肖某在外望风等待,被告人杨1某、胡1某翻窗入室进入熊某1家翻找钱物,不慎将熟睡的熊某1惊醒,二人用言语威胁其不得叫喊,并用绳索将熊某1手脚捆绑。后被告人杨1某将门打开,让被告人郑某、肖某进入屋内,二人在胡1某带领下翻找钱物。四人将熊某1家现金19800元、怀表一块、腰带两根、玉牌三块、美女机绣图四幅、龙凤呈祥字样木板一块等物盗走。针对上述指控事实,公诉机关当庭出示了受案登记表、抓获经过、证人证言、被告人供述及相关书证予以证明,认为被告人杨1某、胡1某在盗窃过程中当场使用暴力,其行为已构成××,被告人郑某、肖某秘密窃取他人数额较大的财物,已构成××。被告人杨1某、胡4某犯,依法应从重处罚;被告人郑某系前科人员,酌情从重处罚,有立功表现,可从轻或减轻处罚;四被告人如实供述犯罪事实,依法可从轻处罚;被告人杨1某、肖某积极退赃,可酌情从轻处罚。建议对被告人杨1某在××至十三年幅度内处罚,对被告人胡1某在××至十四年幅度内处罚,对被告人郑某在××至二年零六个月幅度内处罚,对被告人肖某在××至二年幅度内处罚。\n被告人杨1某对公诉机关指控的事实及证据无异议。其辩护人李某认为,被告人杨1某如实供述犯罪事实、积极退赃、认罪、悔罪态度较好,建议判处××。\n被告人胡1某对公诉机关指控的事实及证据无异议,辩解其系从犯、如实供述犯罪事实,请求从轻处罚。\n被告人郑某对公诉机关指控的事实及证据无异议。\n被告人肖某对公诉机关指控的事实及证据无异议。其辩护人胡3某认为,被告人肖某系从犯、初犯、如实供述犯罪事实、积极退赃,建议在一年××幅度内判处刑罚并宣告××。\n", "summary": "根据刑法第264条第269条,判处杨1某胡1某肖某郑某抢劫罪"}
+{"content": "梁河县人民检察院指控:\n1、2014年7月21日10时20分,梁河县公安局民警到梁河县芒东镇翁冷村委会丙那二组被告人王2某家中将王2某抓获,当场从王2某家中查获海洛因5.1克、甲基苯丙胺17.5克。经查,被告人王2某除自己吸食毒品外,还向方某3甲、谷某某、金某某、郭某某贩卖过毒品海洛因。\n2、2015年6月18日10时左右,梁河县公安局民警到梁河县芒东镇翁冷村委会丙那后山被告人王2某的窝铺中将王2某抓获,当场从王3某铺中查获海洛因0.2克,甲基苯丙胺7克。经查,被告人王2某除自己吸食毒品外,还向马某、曹某、张某、方某3乙贩卖过毒品海洛因,向钱某某贩卖过毒品海洛因和甲基苯丙胺。\n3、2015年1月2日晚上曹某、马某、张某在梁河县翁冷村委会丙那后山王2某的窝铺里与王2某商量去偷羊和王2某换毒品吸食的事情,王2某同意后,2015年1月3日2时左右,曹某、马某、张某三人就驾驶摩托车到梁河县另电站旁被害人杨某1的羊圈。张某进入被害人杨某1的窝铺内用刀挟持杨某1,在杨某1不敢反抗的情况下,马某和曹某用摩托车拉走了杨某1的5头山羊。经鉴定,被盗山羊的价格为人民币5600元。\n被告人王2某对公诉机关指控的事实及罪名无异议,请求法庭从轻判处。\n", "summary": "根据刑法第347条第263条第264条第348条,判处曹某马某王2某抢劫罪"}
+{"content": "绥江县人民检察院指控,1、2017年7月6日,被告人王某某来到绥江县中城镇新世纪商贸广场?宝威男装?将被害人张某放在服装店收银台的一部玫瑰金苹果7手机盗走,经鉴定价值人民币3999元。\n2、2017年9月5日,被告人王某某来到绥江县中城镇A区农贸市场二楼?前金时尚家电生活馆?门口将被害人钟某1放在腰包内的一部OPPOR9S手机盗走,经鉴定价值人民币1819元。\n3、2017年9月11日,被告人王某某来到绥江县中城镇龙行大道中段25号?意外金喜?服装店,将被害人胡某放在收银台的一部玫瑰金苹果6S手机盗走,经鉴定价值为人民币2819元。\n被告人王某某盗窃的三部手机共计价值人民币8637元。\n针对上述指控事实,公诉机关当庭出示了受案登记表、到案经过、证人证言、被害人陈述、价格鉴定结论书等证据予以证实。公诉机关认为,被告人王某某以非法占有为目的,多次秘密窃取他人财物,数额较大,其行为已触犯《中华人民共和国刑法》××之规定,应当以××追究其刑事责任。被告人王某某主动到公安机关投案,如实供述犯罪事实,属自首,且认罪悔罪表现较好,可从轻处罚;系累犯,应从重处罚;多次盗窃,且为吸食毒品而盗窃,可酌情从重处罚。建议判处××至二年,并处罚金。提请判处。\n被告人王某某对公诉机关指控的犯罪事实及证据无异议。\n", "summary": "根据刑法第264条,判处王某某盗窃罪"}
+{"content": "绥江县人民检察院指控,2017年6月2日11时30分许,彭某与陈某在绥江县中城镇A区农贸市场二楼?杨2某杀鸡店?门口,两人因生意纠纷发生口角,继而发生抓扯,导致陈某和彭某不同程度受伤,随后杨某1与杨某2、罗某某等人在绥江县中城镇A区农贸市场二楼?杨1某冷冻库?处持钢管、板凳对路过的陈某和杨某1实施殴打,杨某1的父亲杨3某在劝阻打架的过程中,也被杨某1、杨某2、罗某某等人殴打,导致杨3某、陈某、杨某1等人不同程度受伤。经鉴定,杨3某所受损伤为轻伤二级,陈某、杨某1所受损伤为轻微伤。针对上述指控事实,公诉机关当庭出示了受案登记表、户口证明、到案经过、住院病历、人体损伤程度鉴定书、证人证言等证据予以证实。\n公诉机关认为,被告人杨某1、杨某2、罗某某故意伤害他人身体,致一人轻伤二级、二人轻微伤,其行为已触犯了《中华人民共和国刑法》××××,应当以××追究其刑事责任。三被告人主动到公安机关投案,并如实供述犯罪事实,具有自首情节,可以减轻或从轻处罚,积极赔偿被害人损失,取得被害人谅解,可酌情从轻处罚。建议判处三被告人××至一年零六个月,并宣告××。\n被告人杨某1、杨某2、罗某某对公诉机关指控的事实和出示的证据均无异议。\n", "summary": "根据刑法第234条,判处罗某某杨某2杨某1故意伤害罪"}
+{"content": "乌海市乌达区人民检察院指控被告人孙某以非法占有为目的,采用虚假手段骗取他人现金89234.14元,数额巨大,其行为触犯了《中华人民共和国刑法》××之规定,应当以××追究其刑事责任。被告人孙某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人孙某在××至四年六个月幅度内科以刑罚,并处罚金。\n被告人孙某对公诉机关的指控无异议。其辩护人刘某的辩护意见是:被告人孙某系初犯、偶犯,具有坦白情节,认罪悔罪,希望法庭本着教育、挽救的方针,对其判处三年六个月××。\n", "summary": "根据刑法第266条,判处孙某诈骗罪"}
+{"content": "乌海市乌达区人民检察院指控被告人张某在道路上醉酒驾驶机动车,其行为已触犯《中华人民共和国刑法》××××××第(二)项的规定,应当以××追究被告人张某的刑事责任。被告人张某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人张某在××至五个月幅度内科以刑罚,并处罚金。\n被告人张某对公诉机关的指控无异议。\n", "summary": "根据刑法第133条,判处张某危险驾驶罪"}
+{"content": "公诉机关指控被告人刘1某、苏某、黄某共同故意以暴力、威胁方法阻碍公安人员依法执行职务,造成一名民警受伤,其行为已触犯《中华人民共和国刑法》××××、××,××××之规定,应当以××追究三被告人的刑事责任。被告人刘1某在共同犯罪中起主要作用,根据《中华人民共和国刑法》××××之规定,系主犯。被告人苏某、黄某在共同犯罪中起次要作用,根据《中华人民共和国刑法》××之规定,系从犯,应当对被告人苏某、黄某从轻处罚。被告人刘1某、苏某、黄某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人刘1某在××至二年幅度内科以刑罚。对被告人苏某在××至六个月幅度内科以刑罚。对被告人黄某在××至六个月幅度内科以刑罚。\n被告人刘1某对公诉机关的指控无异议。其辩护人李某的辩护意见是:被告人刘1某犯罪情节轻微,主观恶性较小,自愿认罪,悔罪态度好,主动赔偿被害人的经济损失。建议对被告人刘1某适用××。\n被告人苏某对公诉机关的指控无异议。\n被告人黄某对公诉机关的指控无异议。\n", "summary": "根据刑法第277条,判处苏某刘1某黄某妨害公务罪"}
+{"content": "乌海市乌达区人民检察院指控被告人刘某在道路上醉酒驾驶机动车,其行为已触犯《中华人民共和国刑法》××××××第(二)项的规定,应当以××追究被告人刘某的刑事责任。被告人刘某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人刘某在××至二个月幅度内科以刑罚,并处罚金。\n被告人刘某对公诉机关的指控无异议。\n", "summary": "根据刑法第133条,判处刘某危险驾驶罪"}
+{"content": "乌海市乌达区人民检察院指控被告人宋某多次容留他人注射毒品海洛因,其行为触犯了《中华人民共和国刑法》××之规定,应当以××追究其刑事责任。被告人宋某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人宋某在××至一年六个月幅度内科以刑罚,并处罚金。\n被告人宋某对公诉机关的指控无异议。其辩护人查某的辩护意见是:被告人宋某具有坦白情节,危害后果小,主观恶性小。建议对被告人适用××。\n", "summary": "根据刑法第354条,判处宋某容留他人吸毒罪"}
+{"content": "公诉机关指控被告人王某2018年1月13日15时许驾驶牌照为???的小型轿车沿乌海市乌达区文体路由北向南行驶至与新达街交叉路口处时,与沿新达街由西向东骑行电动二轮车的被害人方某发生碰撞,造成方某受伤,经医院抢救无效死亡的重大交通事故。被告人王某负此次事故的主要责任。案发后被告人王某与被害人近亲属自行达成经济赔偿协议,已取得被害人近亲属的谅解。建议本院对被告人王某在六个月至一年六个月××幅度内科以刑罚。\n本案在审理过程中,依法委托乌海市乌达区司法局就被告人的有关情况进行社会调查,结论为同意对被告人适用社区矫正。\n", "summary": "根据刑法第133条,判处王某交通肇事罪"}
+{"content": "乌海市乌达区人民检察院指控被告人李某在道路上醉酒驾驶机动车,其行为已触犯《中华人民共和国刑法》××××××第(二)项的规定,应当以××追究被告人李某的刑事责任。被告人李某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人李某在××至二个月幅度内科以刑罚,并处罚金。\n被告人李某对公诉机关的指控无异议。\n", "summary": "根据刑法第133条,判处李某危险驾驶罪"}
+{"content": "乌海市乌达区人民检察院指控被告人陶某在道路上醉酒驾驶机动车,其行为已触犯《中华人民共和国刑法》××××××第(二)项的规定,应当以××追究被告人陶某的刑事责任。被告人陶某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人陶某在××至二个月幅度内科以刑罚,并处罚金。\n被告人陶某对公诉机关的指控无异议。\n", "summary": "根据刑法第133条,判处陶某危险驾驶罪"}
+{"content": "乌海市乌达区人民检察院指控被告人薛1某向他人贩卖毒品海洛因,计0.05克;非法持有毒品海洛因与甲基苯丙胺(冰毒)共计87.464克,其行为触犯了《中华人民共和国刑法》××××、××、××之规定,应当以贩卖毒品罪、××追究其刑事责任。被告人薛1某在判决宣告前一人犯数罪,根据《中华人民共和国刑法》××之规定,应对其数罪并罚。被告人薛1某在刑罚执行完毕后五年内又犯应当判处××以上刑罚之罪,系累犯,根据《中华人民共和国刑法》××××之规定,应当从重处罚。被告人薛1某曾犯贩卖毒品罪被判刑,现又犯××、贩卖毒品罪,系毒品再犯,根据《中华人民共和国刑法》××之规定,应当从重处罚。被告人薛1某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××之规定,可以从轻处罚。被告人薛1某协助司法机关抓捕贩毒人员吴某,有重大立功表现,根据《中华人民共和国刑法》××之规定,可以减轻处罚。建议本院在××六年至八年幅度范围内,对被告人薛2某以刑罚,并处罚金。\n被告人薛1某对公诉机关指控无异议。其辩护人的辩护意见是:1、对公诉机关的指控无异议。2、被告人薛1某因吸食毒品而非法持有毒品,主观恶性小。3、被告人薛1某贩卖毒品数量很少,主观上较被动。4、被告人薛1某有重大立功表现,认罪态度较好,可以从轻或减轻处罚。5、被告人薛1某家境困难,尚有一岁的儿子需其抚养。综上,被告人有法定从轻、减轻处罚的情节,建议对被告人在××五年以下量刑。\n", "summary": "根据刑法第356条第348条第347条,判处薛1某走私、贩卖、运输、制造毒品罪"}
+{"content": "乌海市乌达区人民检察院指控被告人李某以营利为目的,伙同他人组织多人赌博,赌资数额共计58900元,其行为触犯了《中华人民共和国刑法》××××、××××之规定,应当以××追究其刑事责任。被告人李某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人李某在××至一年六个月幅度内科以刑罚,并处罚金。\n被告人李某对公诉机关的指控无异议。其辩护人杨某的辩护意见是:对起诉书指控被告人李某构成××罪名及犯罪事实没有异议。被告人李某在赌博犯罪中起协助作用,系初犯、偶犯,具有坦白情节,可以从轻处罚。从邬某处扣缴的被告人李某的20000元不属于赌资,应该给予返还。\n", "summary": "根据刑法第303条,判处李某赌博罪"}
+{"content": "乌海市乌达区人民检察院指控被告人佟某在道路上醉酒驾驶机动车,其行为已触犯《中华人民共和国刑法》××××××第(二)项的规定,应当以××追究被告人佟某的刑事责任。被告人佟某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人佟某在××日至二个月十五日幅度内科以刑罚,并处罚金。\n被告人佟某对公诉机关的指控无异议。\n", "summary": "根据刑法第133条,判处佟某危险驾驶罪"}
+{"content": "乌海市乌达区人民检察院指控被告人杨某在道路上醉酒驾驶机动车,其行为已触犯《中华人民共和国刑法》××××××第(二)项的规定,应当以××追究被告人杨某的刑事责任。被告人杨某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人杨某在××至二个月幅度内科以刑罚,并处罚金。\n被告人杨某对公诉机关的指控无异议。\n", "summary": "根据刑法第133条,判处杨某危险驾驶罪"}
+{"content": "乌海市乌达区人民检察院指控被告人陈某在道路上醉酒驾驶机动车,其行为已触犯《中华人民共和国刑法》××××××第(二)项的规定,应当以××追究被告人陈某的刑事责任。被告人陈某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人陈某在××日至二个月十五日幅度内科以刑罚,并处罚金。\n被告人陈某对公诉机关的指控无异议。\n", "summary": "根据刑法第133条,判处陈某危险驾驶罪"}
+{"content": "公诉机关指控被告人刘1某、宋某以非法占有为目的,共同盗窃他人财物4290元,数额较大,其行为均已触犯《中华人民共和国刑法》××、××××之规定,应当以××追究二被告人的刑事责任。被告人刘1某、宋某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人刘1某在××至一年幅度内科以刑罚;对被告人宋某在××至六个月幅度内科以刑罚。对二被告人并处罚金。\n被告人刘1某、宋某对公诉机关的指控无异议。被告人宋某的辩护人王2某的辩护意见是:被告人宋某系初犯,主观恶性小,社会危害性较小。被告人家属主动退赔被害人全部损失并取得谅解,建议人民法院对被告人宋某免予刑事处罚。\n", "summary": "根据刑法第264条,判处刘1某宋某盗窃罪"}
+{"content": "乌海市乌达区人民检察院指控被告人王1某在道路上醉酒驾驶机动车,其行为已触犯《中华人民共和国刑法》××××××第(二)项的规定,应当以××追究被告人王1某的刑事责任。被告人王1某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人王1某在××日至二个月十五日幅度内科以刑罚,并处罚金。\n被告人王1某对公诉机关的指控无异议。\n", "summary": "根据刑法第133条,判处王1某危险驾驶罪"}
+{"content": "乌海市乌达区人民检察院指控被告人王1某在道路上醉酒驾驶机动车,其行为已触犯《中华人民共和国刑法》××××××第(二)项的规定,应当以××追究被告人王1某的刑事责任。被告人王1某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人王1某在××日至二个月十五日幅度内科以刑罚,并处罚金。\n被告人王1某对公诉机关的指控无异议。\n", "summary": "根据刑法第133条,判处王1某危险驾驶罪"}
+{"content": "公诉机关指控被告人闫某、朱某结伙入室盗窃他人财物共计6900元,数额较大,其行为触犯了《中华人民共和国刑法》××、××××之规定,应当以××追究二被告人刑事责任。被告人闫某在刑罚执行完毕以后,五年以内再犯应当判处××以上刑罚之罪,系累犯,根据《中华人民共和国刑法》××××之规定,应当从重处罚。被告人闫某、朱某在归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××之规定,可以从轻处罚。并建议本院对被告人闫某在××一年六个月至二年六个月幅度内科以刑罚;对被告人朱某在××一年至二年幅度内科以刑罚。对二被告人并处罚金。\n被告人闫某对公诉机关的指控无异议。其辩护人查某的辩护意见是:对于起诉书指控被告人闫某犯××无异议。被告人闫某具有坦白情节,可以从轻处罚;其主观恶性较轻、认罪、悔罪态度好,可以酌情从轻处罚。建议对被告人闫某判处一年六个月××,并处少量罚金。\n被告人朱某对公诉机关的指控无异议。其辩护人王2某的辩护意见是:对公诉机关指控被告人朱某犯××无异议。被告人朱某具有坦白情节,可以从轻处罚;其在本案中处于从犯地位,应当减轻处罚。其自愿认罪,认罪态度良好,具有悔罪表现,建议对其适用××。\n", "summary": "根据刑法第264条,判处闫某朱某盗窃罪"}
+{"content": "乌海市乌达区人民检察院指控被告人何某在道路上醉酒驾驶机动车,其行为已触犯《中华人民共和国刑法》××××××第(二)项的规定,应当以××追究被告人何某的刑事责任。被告人何某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人何某在××日至二个月十五日幅度内科以刑罚,并处罚金。\n被告人何某对公诉机关的指控无异议。\n", "summary": "根据刑法第133条,判处何某危险驾驶罪"}
+{"content": "乌海市乌达区人民检察院指控被告人吕1某向他人贩卖毒品安钠咖140.85克,被告人牛1某向他人贩卖毒品安钠咖140.85克,二被告人的行为均已触犯《中华人民共和国刑法》××××、××之规定,应当以贩卖毒品罪追究二被告人的刑事责任。被告人吕1某系毒品再犯,根据《中华人民共和国刑法》××之规定,应当从重处罚。被告人吕1某系盲人,根据《中华人民共和国刑法》××的规定,可以从轻处罚。被告人吕1某、牛1某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。建议本院在××至五个月幅度范围内,对被告人吕2某以刑罚,并处罚金;在××至二年幅度范围内,对被告人牛2某以刑罚,并处罚金。\n被告人吕1某对公诉机关指控无异议,并自愿认罪。其辩护人的意见是:1、对公诉机关的指控无异议。2、被告人吕1某如实供述自己的罪行,系坦白,并当庭认罪,可以从轻处罚。3、被告人吕1某系盲人,可以从轻处罚。4、被告人吕1某贩卖少量毒品,犯罪情节轻微,社会危害性小。且其因生活所迫,用毒品抵所欠债务,酌情可以从轻处罚。综上,建议对被告人吕1某判处××六个月。\n被告人牛1某对公诉机关指控无异议,并自愿认罪。其辩护人的意见是:1、对公诉机关的指控无异议。2、被告人牛1某如实供述自己的罪行,系坦白,可以从轻处罚。3、被告人牛1某系初犯,且其并不明知买卖安钠咖的行为是犯罪,其主观恶性较小,酌情可以从轻处罚。建议对被告人牛1某在××至一年的幅度内量刑,免除罚金处罚。\n", "summary": "根据刑法第356条第347条,判处牛1某吕1某走私、贩卖、运输、制造毒品罪"}
+{"content": "公诉机关指控被告人董某、郝某、王2某、边某共同盗窃原煤共17360千克,价值8332.8元,数额较大,其行为均已触犯《中华人民共和国刑法》××、××××之规定,应当以××追究四被告人的刑事责任。被告人王2某、郝某曾被判处××,在刑满释放后五年内又犯应当判处××以上刑罚之罪,根据《中华人民共和国刑法》××××之规定,系累犯,应当从重处罚。被告人董某、郝某、王2某、边某归案后如实供述自己的罪行,系坦白,根据《中华人民共和国刑法》××××的规定,可以从轻处罚。并建议本院对被告人王2某在××九个月至一年九个月幅度内科以刑罚;对被告人郝某在××九个月至一年九个月幅度内科以刑罚;对被告人董某在××六个月至一年六个月幅度内科以刑罚;对被告人边某在××六个月至一年六个月幅度内科以刑罚。对四被告人并处罚金。\n被告人王2某、郝某、董某、边某对公诉机关的指控无异议。\n", "summary": "根据刑法第264条,判处边某郝某董某盗窃罪"}
+{"content": "公诉机关指控:被告人张某于2018年6月5日1时20分许,在北京市朝阳区红坊路与牌坊村路交叉口处,酒后驾驶车牌号为×××号白色“福迪”牌小型普通客车闯红灯由南向西左转时,与一辆由西向东正常行驶的轻型厢式货车发生交通事故,造成两车损坏。经交通事故责任书认定,张某醉酒后驾驶机动车闯红灯左转弯时影响相关车道内的机动车正常行驶,与本起道路交通事故的发生有因果关系,是事故发生的全部原因;张某为全部责任,白某为无责任。经《酒精检验报告》认定其血液中酒精含量为173.3mg/100ml。被告人张某于2018年6月5日被传唤到案。已签署《认罪认罚具结书》。公诉机关认为,被告人张某的行为触犯了《中华人民共和国刑法》××××,犯罪事实清楚,证据确实、充分,应当以××追究其刑事责任。鉴于被告人自愿认罪认罚,建议判处被告人张某××至三个月,并处罚金。\n", "summary": "根据刑法第133条,判处张某危险驾驶罪"}
+{"content": "公诉机关指控:被告人刘某于2018年6月5日0时45分许,饮酒后驾驶比亚迪牌小型普通客车(车牌号:×××)在北京市朝阳区十八里店南桥下行驶时,与他人驾驶的机动车发生事故。经刑事科学技术鉴定,被告人刘某血液中酒精含量为118.2mg/100ml。被告人刘某案发后在现场等候公安机关到场处理。被告人已签署《认罪认罚具结书》。被告人刘某醉酒驾驶机动车,其行为触犯了《中华人民共和国刑法》××××××,犯罪事实清楚,证据确实、充分,应当以××追究其刑事责任。鉴于被告人自愿认罪认罚,建议判处被告人刘某一个月以上三个月以下××,并处罚金。\n", "summary": "根据刑法第133条,判处刘某危险驾驶罪"}
+{"content": "公诉机关指控:2017年6月25日16时许,被告人胡某在位于北京市西城区白广路的维也纳酒店内,因琐事与被害人潘某发生口角,手持玻璃杯砸伤潘某头部,造成潘某额部多处皮肤裂伤。经法医鉴定,被害人潘某身体所受损伤程度为轻伤二级。\n被告人胡某于2018年1月4日被民警抓获归案。现被告人胡某亲属已代其对被害人潘某进行了经济赔偿,并获得谅解。\n公诉机关根据被告人胡某所具有的如实供述自己的罪行、认罪认罚、当事人和解等从轻处罚量刑情节,建议判处××至五个月。\n", "summary": "根据刑法第234条,判处胡某故意伤害罪"}
+{"content": "北京市丰台区人民检察院指控:\n2011年至2013年间,被告人金1某在担任丰台区长辛店村党总支书记、村委会主任、村宅基地腾退补偿认定工作组组长期间,利用主管长辛店村腾退补偿安置全部工作的职务之便,采用弄虚作假,重复计算腾退拆迁安置人口的方法,对被告人金1某本人、其丈夫韩某1和其儿子韩某2三人重复安置,骗取国家拆迁安置补偿款117.64万某(已被依法冻结)及两套回迁安置房。\n针对指控的犯罪事实,公诉机关提供了相应的证据。公诉机关认为,被告人金1某的行为已构成××,要求依照《中华人民共和国刑法》××、××之规定,予以惩处。\n被告人金1某对公诉机关的指控予以否认,并辩解称其没有利用职权享受重复安置,所有的安置都是经过正规程序,且北关外74号的拆迁安置是其婆婆袁某办理的。\n", "summary": "根据刑法第383条第382条,判处金1某贪污罪"}
+{"content": "吉林省公主岭市人民检察院指控:2018年5月3日下午15时许,被告人边某醉酒后驾驶???号银色捷达车从黑林子镇四季香饭店行驶至黑林子镇市场附近,与???号大客车相撞,后被公安机关查获。经公主岭市公安司法鉴定中心检验:从送检的边某静脉血中检出乙醇,含量为213.08mg/100ml。案发后,被告人边某被公安机关依法传唤到案。\n", "summary": "根据刑法第133条,判处边某危险驾驶罪"}
+{"content": "吉林省公主岭市人民检察院指控:2018年3月7日晚,被告人乔某、李某以非法占有为目的,在公主岭市朝阳坡镇东兴村六组乔某家,使用威胁的办法,让被害人王某1写下一张3万元的欠条,2018年3月10日,被害人王某1被迫交付乔某人民币5000元。\n", "summary": "根据刑法第274条,判处乔某李某敲诈勒索罪"}
+{"content": "吉林省公主岭市人民检察院指控:2018年3月30日11时许,被告人付某驾驶???号轻型普通货车在范家屯镇马家洼子村四组自家门向院内倒车过程中将后方行人刘某碾压,致刘某当场死亡。经公主岭市公安司法鉴定中心鉴定:死者刘某系因重度颅脑损伤而死亡;经公主岭市公安局交通管理大队事故中队认定:付某承担此事故的全部责任。案发后,被告人付某到公安机关投案。\n吉林省公主岭市人民检察院认为:被告人付某违反交通运输管理法规,因而发生重大事故,致一人死亡,其行为已触犯《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实、充分,应当以××追究其刑事责任。\n被告人付某在庭审过程中,对公诉机关指控的犯罪事实予以供认,对公诉机关宣读、出示的证据无异议。\n", "summary": "根据刑法第133条,判处付某交通肇事罪"}
+{"content": "吉林省公主岭市人民检察院指控:2001年6月7日凌晨,被告人白某伙同杜2某、李某、朱某(三人已另案处理)预谋后,租车到公主岭市怀德镇(原双榆树乡)三道岗村三屯李某21家,找曾与杜2某打仗并已给予赔偿的李某21的儿子李某22要钱。因李某22不在家,杜2某便将被害人李某21殴打并要钱,在屋内翻钱未果,后被告人白某、杜2某、李某、朱某又乘车到怀德镇三道岗乡二屯潘某家,找曾与杜2某打仗并给予赔偿的潘某要钱。杜2某、李某、白某进入屋内,朱某手持木棒在门口守着,杜2某持刀向潘某要钱。潘某的女儿从柜内拿出兜子刚要给钱,白某伸手将兜内的二千余钱拿走,案发后被告人白某被公安机关抓获归案。\n吉林省公主岭市人民检察院指控上述犯罪事实所列举的证据有:被告人白某的供述;同案犯杜2某等人的供述与辩解;被害人李某21等人的陈述,判决书,违法犯罪记录查询等。\n吉林省公主岭市人民检察院认为:被告人白某以暴力、胁迫方法抢劫他人财物,且系入户抢劫,其行为已触犯了《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实充分,应当以××追究其刑事责任。\n被告人白某供认公诉机关指控的抢劫的犯罪事实,对公诉机关当庭出示、宣读的证据无异议,但辩解钱不是其拿走了。被告人白某的辩护人的辩护意见是:被告人白某当庭认罪,如实供述自己的罪行,在共同犯罪中起次要作用,应当认定为从犯,其行为取得了被害人的谅解等情节,希望法院从轻处罚。\n", "summary": "根据刑法第263条,判处白某抢劫罪"}
+{"content": "白山市浑江区人民检察院指控:被告人于1某以非法占有为目的,于2014年5月10日以办理手机存话费赠电话业务名义,骗取被害人张某人民币5000.00元,被其全部挥霍。案发后,赃款被全部追缴,返还被害人。\n", "summary": "根据刑法第266条,判处于1某诈骗罪"}
+{"content": "白2某浑江区人民检察院指控:被告人向阳于2018年3月13日14时55分许,酒后驾驶车牌号为???号正三轮载客摩托车,行驶至白2某浑江区达山路矿务局汽运处门前时,被白2某公安局交通警察支队浑江区大队民警拦截盘查,后公安人员将其带至白2某中心医院对向阳进行抽血采样。经白2某公安司法鉴定中心理化检验鉴定(白1某公鉴(理化)字【2018】66号鉴定意见):从送检的向阳血液中检出乙醇,含量为97.32mg/100ml。\n", "summary": "根据刑法第133条,判处向阳危险驾驶罪"}
+{"content": "吉林省公主岭市人民检察院指控:宝艺木业公司于2014年7月10日与吉林公主岭市农村商业银行股份有限公司签订《人民币额度借款合同》,并以宝艺木业公司所有的房屋及土地使用权设立抵押,贷款共计人民币1800万某。被告人林某未按贷款约定用途购买原材料,而是归还了吉林省宝艺木业有限公司欠小额贷款公司的借款,吉林省公主岭市农村商业银行股份有限公司贷款至今未还。案发后,被告人林某被公安机关抓获归案。\n吉林省公主岭市人民检察院指控上述犯罪事实所列举的证据有:被告人林某的供述与辩解,证人王某12、张某等的证言,到案经过,违法犯罪记录查询,四平市中级人民法院民事调解书等。\n吉林省公主岭市人民检察院认为,被告单位吉林省宝艺木业有限公司骗取贷款偿还债务;被告人林某作为直接责任人员,其行为触犯了《中华人民共和国刑法》××××之规定,犯罪事实清楚,证据确实、充分,应以骗取贷款罪追究被告人林某的刑事责任。\n被告人林某否认公诉机关指控的事实,对公诉机关当庭出示、宣读的证据部分有异议。被告人林某辩称,骗取贷款不成立,因为抵押物是真实的,评估也是真实的,贷款没有用于购买原材料,只能说是违约,贷款1800万某是事实,用于购买原材料了,从2014年7月25日左右放款的,其用于还贷了,还款后又贷款1800万某,该笔钱应属于倒贷。二辩护人均认为被告人林某不构成骗取贷款罪。辩护人毕某辩护认为:一、被告人林某在本案贷款过程中不存在欺骗行为。二、银行并非因受欺骗陷入错误认识,而向被告人发放贷款。三、本案没有造成银行贷款重大损失和情节严重的结果。四、贷款的用途上,刑罚××××,并未将不按用途使用贷款作为骗取贷款罪的构成要件。因此,公诉机关以被告人涉案贷款未按用途使用为由,认定被告人涉嫌构成骗取贷款罪,是不能成立的。\n", "summary": "根据刑法第175条,判处林某骗取贷款、票据承兑、金融票证罪"}
+{"content": "吉林省通榆县人民检察院指控:2017年6月15日21时40分许,被告人张1某驾驶轿车沿通榆县瞻榆镇一完小学南大墙东西道由西向东行驶被执勤交警查获,经白城市公安司法鉴定中心对张1某静脉血中乙醇含量进行鉴定,结果检出乙醇含量为92.5mg/100ml。\n被告人张1某对公诉机关指控其犯××的事实及罪名均无异议并当庭认罪。\n", "summary": "根据刑法第133条,判处张1某危险驾驶罪"}
+{"content": "公诉机关指控,被告人胡某于2017年9月3日10时许,到敦化市黄泥河林业局腰场的琵琶顶山上,自家放牛的地方,发现自家散放的一头黄毛公牛已经死亡,便用刀把牛分解成肉,将牛头和牛内脏扔到山上,将牛肉用车拉回家中。被告人胡某在明知牛死因不明并未经检疫部门检疫的情况下,为了减少损失,让被告人陈某在敦化市青沟子乡老屯村青松组的村道上将牛肉卖给村民,得款1300余元。\n针对上述事实,公诉机关提供了相关的证据材料,并认为被告人胡某、陈某生产、销售不符合安全标准的食品,足以造成严重食物中毒事故或其他严重食源性疾病,其行为均触犯了《中华人民共和国刑法》××之规定,犯罪事实清楚,证据确实充分,应以××追究其刑事责任。二被告人系共同犯罪,到案后均能如实供述自己的犯罪行为,系坦白,故对二被告人应依照《中华人民共和国刑法》××、××××、××××之规定予以处罚。根据《中华人民共和国刑事诉讼法》××之规定,提请本院依法判处。\n被告人胡某对公诉机关指控的犯罪事实和罪名均无异议,无辩解和辩护意见。\n胡某的辩护人对公诉机关指控胡某的犯罪事实和罪名均无异议,提出如下辩护意见:1.被告人胡某具有坦白情节,自愿认罪,且认罪态度好,有悔罪表现,应从轻处罚;2.胡某系初犯、偶犯,没有前科劣迹,一贯表现良好,因法律意识淡薄造成此次犯罪,请求对其从轻处罚;3.胡某的违法行为没有给顾客造成身体上的危害后果,且被告人同意积极支付赔偿款13000元。综上,请求对被告人胡某从轻处罚。\n被告人陈某对公诉机关指控的犯罪事实和罪名均无异议,无辩解和辩护意见。\n陈某的辩护人对公诉机关指控陈某的犯罪事实和罪名均无异议,提出如下辩护意见:1.被告人陈某主观恶性较小,犯罪性质和情节都明显较轻,客观上未造成严重损害结果;2.陈某到案后如实交代自己的罪行,归案后有深刻的悔罪表现和积极的赎罪心态;3.陈某系初犯、偶犯,一贯表现良好,请求对其从轻处罚;4.陈某已经认识并改正自己的错误,没有再犯的可能性,同意支付刑事附带民事公益诉讼赔偿款并缴纳罚金,可以适用××。综上,请求对被告人陈某从轻处罚并宣告××。\n", "summary": "根据刑法第143条,判处胡某陈某生产、销售不符合安全标准的食品罪"}
+{"content": "吉林省公主岭市人民检察院指控:被告人王1某(另案处理)伙同丛1某(另案处理)、左1某、丛2某某等人在2013年8月1日期间以个人、公主岭市泰某有限公司、公主岭市颖某手机电脑家电商场、公主岭市颖某手机数码科技城、公主岭市海尔家电体验馆的名义,虚构其购买、装修尚某酒店(包括附属地)、修建山庄及企业经营资金周转不开等急需资金的事实,隐瞒其有300万某贷款、颖某家电经营亏损的真相,以月利1.5分至5分不等的高利息为诱饵,先后从王2某等人95人手中借款达人民币2203.26万某【修建山庄借款43人,数额1348.88万某;以企业经营资金周转不开为由借款52人,数额854.38万某】。其中被告人左1某参与借款26万某,被告人丛2某某以保证人的身份参与借款20万某。\n2016年2月29日,被告人刘1某(另案处理)作为借款人,被告人丛2某某作为借款人的财产共有人在公主岭市华兴村镇银行贷款100万某。贷款方式:保证担保,期限一年,支付方式。贷款理由,公主岭市国家农业科技园区尚某酒店装修,刘1某找人冒充祖某,签订了虚假的关于尚某精致酒店装修施工合同。所得贷款被刘1某用于偿还外债及个人消费。被告人王1某、左1某以营业收入担保,至今此100万某贷款未归还。\n2016年5月31日,被告人王1某、左1某通过中国平安普慧投资咨询有限公司四平分公司,在上海陆家嘴国际金融资产交易市场股份有限公司贷款12万某,其提供的公主岭市泰某手机卖场的营业执照及个人工商银行卡、信用社银行卡交易流水非自己真实所有,此款给王1某使用。\n2016年6月24日,被告人王1某、左1某通过中国人民财产保险股份有限公司四平分公司,在中国光大银行股份有限公司长春飞跃路支行贷款7万某,其提供的公主岭市泰某手机卖场的营业执照及个人信用社银行卡交易流水非自己真实所有,此款给王1某所用。\n吉林省公主岭市人民检察院指控上述犯罪事实所列举的证据有被告人左1某、丛2某某的供述与辩解,被害人马某、时某、丁某1、苏某的陈述,贷款手续、银行流水、借条,证人凤某、王某、勾某、白某、丁某2、李某、张某等人的证言。\n吉林省公主岭市人民检察院认为,被告人左1某、丛2某某以非法占有为目的,虚构事实,骗取他人财物,数额巨大,其行为触犯了《中华人民共和国刑法》××之规定;以欺骗手段骗取银行贷款,数额较大,其行为触犯了《中华人民共和国刑法》××××之规定,犯罪事实清楚,证据确实、充分,应以诈骗罪、骗取贷款罪追究被告人左1某、丛2某某的刑事责任。\n被告人左1某、从小帅均否认公诉机关指控的事实,对公诉机关当庭出示、宣读的证据有异议。被告人左1某辩称,其行为不构成犯罪,其有配合公安机关调查王1某的事实,由公安机关出示的司法鉴定,贷款是刘1某贷的,在这之前刘1某一直在还钱,其给王1某打工,王1某让其去取钱其就去了,借款是王1某借的,其没有使用该钱。被告人左1某的辩护人认为:一、被告人左1某不构成诈骗罪。公诉机关指控王1某向他人出具借款欠条196笔,其中左1某署名的有三笔,分别是马某一笔6万某;时某两笔,每笔10万某,总计26万某。1、关于王1某向马某借款6万某,左1某在欠条上写?左1某代王1某取?一事。从主观上看,左1某没有?非法占有的?故意。该笔借款是王1某与马某的爱人刘某商量了借款的细节后,指派左1某去温州城代取的借款。这一点从左1某、王1某、马某的笔录中可以印证。左1某既没有参与该笔借款的商榷,也不知道该借款的真实用途,所以被告人左1某没有将该笔借款据为已有或者帮助王1某占有该笔借款的主观故意。该6万某借款的实际使用人是王1某,左1某没有使用该借款,更没有得到任何好处。从客观上来讲,左1某的借款行为不属于欺诈行为。从实际情况上来说,左1某在欠条上签字仅是证明王1某已收到该笔借款,是履行工作职责,从左1某、王1某、马某的证言可以证实,是王1某与马某之间存在借贷关系。左1某与王1某没有共同犯罪的故意。从证据材料看,没有证据证明左1某有诈骗的行为。因此,左1某的行为并不符合诈骗罪的构成要件,在没有证据的情况下认定被告人左1某伙同王1某诈骗该笔借款既没有事实依据,也没有法律依据。2、关于王1某向时某借款20万某,左1某在欠条写?中间人:左1某?一事。在主观上,左1某不具有非法占有的目的,在借款之初,左1某仅是向时某表达了王1某欲向时某借款的意图,并没有参与二人之间的借款细节和借款过程,也不知道借款的用途,也不关心时某是否借款给王1某。在客观上,左1某的签字行为是因为在借款到期后王1某未按时还款,时某来找王1某更换借条时,临时要求左1某在欠条上加上?中间人:左1某?,在左1某签字时,时某与王1某的借款程序已经完成,若王1某构成诈骗,左1某是在王1某诈骗行为完成后签字的,对该行为不应当认定是共同的故意犯罪行为,左1某的行为不具有违法性。时某的出借行为是基于与王1某是好朋友,且有高额利息,并不是因为左1某的打电话行为,更不是因为左1某在欠条上签字,因此,左1某的行为与时某的损害结果之间不具备因果关系。左1某并未使用该笔钱,亦没有得到任何好处,无论是从主观上还是客观上,左1某没有伙同王1某共同实施犯罪的目的,公诉机关指控该笔借款也是没有事实和法律依据的。综上,左1某的主观目的是应领导和好朋友的要求去完成一个正常的借贷关系,而不是非法占有的目的,更不存在伙同王1某共同占有的目的,左1某的行为不应认定为诈骗罪。二、被告人左1某不构成骗取贷款罪。左1某并没有采取欺骗手段。在平安惠普12万某、人保公司7万某的两笔贷款中,左1某是以注册登记在她名下的泰某手机卖场作为借款主体申请的贷款,并按照贷款要求提供了营业执照、银行流水等相关材料。公诉机关指控?左1某提供的银行流水非自己真实所有?,作为左1某构成本罪的依据是不能成立的。在两笔贷款中,左1某分别提供了其在工商银行和公主岭农村商业银行的银行卡贷款日前6个月的流水,两张卡的开户人均是左1某,两张卡的流水是泰某手机卖场及左1某作为会计的颖某企业的流水,因在日常经营中,两个法人单位都是通过左1某的银行卡来转入或转出账款,所以当以法人身份借款时,左1某提供的银行卡流水是可以作为还款能力的依据的。从李某、张某的笔录可以知道,在贷款中是既允许提供公司银行流水又允许提供个人流水的,本案中,左1某无论提供哪种银行流水,都是金融机构允许的,并不存在?非自己真实所有或者虚假交易的情形?,以左1某名义贷款的实际用款人是王1某,银行对此也是知情的,银行并不反对以他人名义贷款的行为,因此,不能据此认定左1某构成骗取贷款罪。2、左1某的行为并没有给金融机构造成重大损失,左1某的贷款金额共计19万某,根据相关法律的规定,左1某并不满足应当立案追诉的条件,在此之前左1某一直在偿还贷款,并未给金融机构造成损失。3、对于在华兴银行为刘1某贷款100万提供担保一事,左1某并未向银行及刘1某提供虚假的贷款材料,也没有与刘1某共谋签署虚假合同,没有与刘1某形成?骗?的故意;左1某的签字行为是被动的不是主动的,且左1某并未使用该笔贷款,也没有得到任何好处,此笔贷款一直在按时还款,所以该笔贷款,左1某不构成骗取贷款罪。三、左1某积极配合侦查机关、公诉机关及审判机关的工作,如实陈述案件事实,态度真诚,且没有前科劣迹、系初犯,望减轻处罚。\n", "summary": "根据刑法第175条第192条,判处左1某骗取贷款、票据承兑、金融票证罪"}
+{"content": "吉林省白山市人民检察院起诉书指控:2017年5月,被告人王2某、郑1某为谋取利益,合谋从朝鲜走私国家禁止进口的铜矿粉与铅锌矿粉进境,郑1某联系朝鲜货源,王2某联系走私人员王5某(在逃)。王5某伙同被告人史某、孙2某、魏某,于同年7月初的一天,在长白县马鹿沟镇犁田洞村前的鸭绿江边走私进境铅锌矿80余吨并运至王2某的某某浮选厂,于同年8月14日晚,在相同地点走私进境铜矿粉194.38吨、铅锌矿粉81.68吨并运至某某浮选厂。经国土资源部长春矿产资源监督检测中心鉴定,2017年8月14日走私进境的铜矿粉的含量为:金含量为0.62克/吨,银含量为542克/吨,铜含量为16.51%;铅锌矿粉含量为:银含量为441克/吨,铅为15.32%,锌为28.23%。\n吉林省白山市人民检察院指控上述事实的主要证据有:案件来源、归案情况、检测报告等书证;现场勘查图、现场照片、指认笔录及照片、指认车辆照片;治安监控照片及监控录像;证人王某乙、邵某甲、孙某等人证言;被告人郑1某、王2某、史某、孙2某、魏某供述与辩解等。\n吉林省白山市人民检察院认为,被告人郑1某、王2某、史某、孙2某、魏某违反国家禁止性规定,走私进境国家禁止进口的铜矿粉194.38吨、铅锌矿粉160余吨,情节严重,其五人行为触犯了《中华人民共和国刑法》××××之规定,犯罪事实清楚,证据确实充分,应当以××追究刑事责任。\n", "summary": "根据刑法第151条,判处郑1某魏某孙2某史某王2某走私国家禁止进出口的货物、物品罪"}
+{"content": "靖宇县人民检察院指控,被告人田某与被害人刘某曾因边沟的问题发生过争执,2017年9月11日,田某及妻子韩某驾车途经刘某家门口时,双方再次发生争执并厮打在一起,在厮打过程中田某使用拳头殴打刘某面部五、六拳,导致刘某面部受伤,经鉴定被害人刘某因钝性伤致左侧上颌骨额突骨骨折、左侧眶内壁骨折、鼻骨骨折、三枚牙齿脱落,身体损伤程度为轻伤二级(详见(靖)公(技)鉴(临)[2014]46号人体损伤程度检验鉴定书)。\n靖宇县人民检察院认为,被告人田某故意伤害他人身体,其行为触犯了《中华人民共和国刑法》,犯罪事实清楚,证据确实、充分,应当以××追究其刑事责任。同时,靖宇县人民检察院出具量刑建议书,认为被告人田某积极赔偿了被害人的经济损失,取得被害人谅解,且本案系由邻里之间偶发矛盾引发,建议对田某判处四个月至一年五个月××、××或××。\n", "summary": "根据刑法第234条,判处田某故意伤害罪"}
+{"content": "乐山市市中区人民检察院指控,2012年至2016年6月期间,陈某(另案处理)利用职务之便安排被告人赵某在乐山市中心苗圃单位公账上虚报汽油费套取公款作为补助,陈某、赵某共同贪污38,159元,赵某分得一半(19,079.5元)。\n2012年,乐山市中心苗圃借用四川润华生态园林工程有限公司资质中标威远县林业局向义镇四方村综合体庭院绿化苗木项目,后因项目合同内容改变,致乐山市中心苗圃不能履行合同,威远县林业局与该项目接手人罗某共同赔偿乐山市中心苗圃12元,其中10万某被陈某与赵某共同侵吞,陈某分得5万某,赵某分得5万某。\n公诉机关认为,被告人赵某身为国家工作人员,利用职务之便,伙同他人共同共同侵吞公共财物138,159元,其个人非法占有69,079.5元,数额较大,其行为构成××。根据《中华人民共和国刑事诉讼法》××之规定,提起公诉,请依法判处。\n被告人赵某及其辩护人对公诉机关指控被告人赵某的行为构成××均无异议。赵某当庭表示自愿认罪。\n", "summary": "根据刑法第382条第383条,判处赵某贪污罪"}
diff --git a/ptuning/train.sh b/ptuning/train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..879bd0526f0e69279969313a2ea5a1752d782b7b
--- /dev/null
+++ b/ptuning/train.sh
@@ -0,0 +1,23 @@
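+# P-Tuning v2 fine-tuning launcher (single GPU). The paths and hyperparameters below are the
+# repository's own values; adjust them to your environment.
+# - --prompt_column / --response_column must match the field names in the training JSON
+#   (here `content` for the input text and `summary` for the target).
+# - --pre_seq_len sets the number of trainable prefix tokens for P-Tuning v2, and
+#   --quantization_bit 4 loads the base model with 4-bit quantization to save GPU memory.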
+CUDA_VISIBLE_DEVICES=0 python main.py \
+    --do_train \
+    --train_file AdvertiseGen/train.json \
+    --validation_file AdvertiseGen/dev.json \
+    --prompt_column content \
+    --response_column summary \
+    --overwrite_cache \
+    --model_name_or_path G:/CODE/Python/ChatGLM-6B-main \
+    --output_dir output/adgen-chatglm-6b-pt \
+    --overwrite_output_dir \
+    --max_source_length 64 \
+    --max_target_length 64 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 16 \
+    --predict_with_generate \
+    --max_steps 3000 \
+    --logging_steps 10 \
+    --save_steps 1000 \
+    --learning_rate 2e-2 \
+    --pre_seq_len 512 \
+    --quantization_bit 4
+
diff --git a/ptuning/train_chat.sh b/ptuning/train_chat.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b0f5cdc241ef94f039c93df483ee76cb8668ce2a
--- /dev/null
+++ b/ptuning/train_chat.sh
@@ -0,0 +1,27 @@
+PRE_SEQ_LEN=8
+LR=1e-2
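+
+# This variant fine-tunes on multi-turn chat data: --history_column points at the per-example
+# conversation history in addition to the prompt/response columns. The three environment
+# variables used below are not defined in this repository; the commented exports are only an
+# illustrative sketch of the values the script expects.
+# export CHAT_TRAIN_DATA=data/chat_train.json
+# export CHAT_VAL_DATA=data/chat_dev.json
+# export CHECKPOINT_NAME=output/chatglm-6b-pt-chat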
+
+CUDA_VISIBLE_DEVICES=0 python3 main.py \
+ --do_train \
+ --train_file $CHAT_TRAIN_DATA \
+ --validation_file $CHAT_VAL_DATA \
+ --prompt_column prompt \
+ --response_column response \
+ --history_column history \
+ --overwrite_cache \
+ --model_name_or_path THUDM/chatglm-6b \
+ --output_dir $CHECKPOINT_NAME \
+ --overwrite_output_dir \
+ --max_source_length 256 \
+ --max_target_length 256 \
+ --per_device_train_batch_size 1 \
+ --per_device_eval_batch_size 1 \
+ --gradient_accumulation_steps 16 \
+ --predict_with_generate \
+ --max_steps 3000 \
+ --logging_steps 10 \
+ --save_steps 1000 \
+ --learning_rate $LR \
+ --pre_seq_len $PRE_SEQ_LEN \
+ --quantization_bit 4
+
diff --git a/ptuning/trainer.py b/ptuning/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..63101bc9d3dfb65ff5a444c7c151b8d4d241f2c9
--- /dev/null
+++ b/ptuning/trainer.py
@@ -0,0 +1,3830 @@
+# coding=utf-8
+# Copyright 2020-present the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The Trainer class, to easily train a 🤗 Transformers model from scratch or finetune it on a new task.
+"""
+
+import contextlib
+import functools
+import glob
+import inspect
+import math
+import os
+import random
+import re
+import shutil
+import sys
+import time
+import warnings
+from collections.abc import Mapping
+from distutils.util import strtobool
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+
+from tqdm.auto import tqdm
+
+
+# Integrations must be imported before ML frameworks:
+# isort: off
+from transformers.integrations import (
+ default_hp_search_backend,
+ get_reporting_integration_callbacks,
+ hp_params,
+ is_fairscale_available,
+ is_optuna_available,
+ is_ray_tune_available,
+ is_sigopt_available,
+ is_wandb_available,
+ run_hp_search_optuna,
+ run_hp_search_ray,
+ run_hp_search_sigopt,
+ run_hp_search_wandb,
+)
+
+# isort: on
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from huggingface_hub import Repository, create_repo
+from packaging import version
+from torch import nn
+from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+
+from transformers import __version__
+from transformers.configuration_utils import PretrainedConfig
+from transformers.data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
+from transformers.debug_utils import DebugOption, DebugUnderflowOverflow
+from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled
+from transformers.dependency_versions_check import dep_version_check
+from transformers.modelcard import TrainingSummary
+from transformers.modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model
+from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES
+from transformers.optimization import Adafactor, get_scheduler
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_10, is_torch_less_than_1_11
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.trainer_callback import (
+ CallbackHandler,
+ DefaultFlowCallback,
+ PrinterCallback,
+ ProgressCallback,
+ TrainerCallback,
+ TrainerControl,
+ TrainerState,
+)
+from transformers.trainer_pt_utils import (
+ DistributedLengthGroupedSampler,
+ DistributedSamplerWithLoop,
+ DistributedTensorGatherer,
+ IterableDatasetShard,
+ LabelSmoother,
+ LengthGroupedSampler,
+ SequentialDistributedSampler,
+ ShardSampler,
+ distributed_broadcast_scalars,
+ distributed_concat,
+ find_batch_size,
+ get_module_class_from_name,
+ get_parameter_names,
+ nested_concat,
+ nested_detach,
+ nested_numpify,
+ nested_truncate,
+ nested_xla_mesh_reduce,
+ reissue_pt_warnings,
+)
+from transformers.trainer_utils import (
+ PREFIX_CHECKPOINT_DIR,
+ BestRun,
+ EvalLoopOutput,
+ EvalPrediction,
+ FSDPOption,
+ HPSearchBackend,
+ HubStrategy,
+ IntervalStrategy,
+ PredictionOutput,
+ RemoveColumnsCollator,
+ ShardedDDPOption,
+ TrainerMemoryTracker,
+ TrainOutput,
+ default_compute_objective,
+ default_hp_space,
+ denumpify_detensorize,
+ enable_full_determinism,
+ find_executable_batch_size,
+ get_last_checkpoint,
+ has_length,
+ number_of_arguments,
+ seed_worker,
+ set_seed,
+ speed_metrics,
+)
+from transformers.training_args import OptimizerNames, ParallelMode, TrainingArguments
+from transformers.utils import (
+ CONFIG_NAME,
+ WEIGHTS_INDEX_NAME,
+ WEIGHTS_NAME,
+ can_return_loss,
+ find_labels,
+ get_full_repo_name,
+ is_accelerate_available,
+ is_apex_available,
+ is_datasets_available,
+ is_in_notebook,
+ is_ipex_available,
+ is_sagemaker_dp_enabled,
+ is_sagemaker_mp_enabled,
+ is_torch_compile_available,
+ is_torch_neuroncore_available,
+ is_torch_tpu_available,
+ logging,
+)
+from transformers.utils.generic import ContextManagers
+
+
+_is_native_cpu_amp_available = is_torch_greater_or_equal_than_1_10
+
+DEFAULT_CALLBACKS = [DefaultFlowCallback]
+DEFAULT_PROGRESS_CALLBACK = ProgressCallback
+
+if is_in_notebook():
+ from transformers.utils.notebook import NotebookProgressCallback
+
+ DEFAULT_PROGRESS_CALLBACK = NotebookProgressCallback
+
+if is_apex_available():
+ from apex import amp
+
+if is_datasets_available():
+ import datasets
+
+if is_torch_tpu_available(check_device=False):
+ import torch_xla.core.xla_model as xm
+ import torch_xla.debug.metrics as met
+ import torch_xla.distributed.parallel_loader as pl
+
+if is_fairscale_available():
+ dep_version_check("fairscale")
+ import fairscale
+ from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP
+ from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP
+ from fairscale.nn.wrap import auto_wrap
+ from fairscale.optim import OSS
+ from fairscale.optim.grad_scaler import ShardedGradScaler
+
+
+if is_sagemaker_mp_enabled():
+ import smdistributed.modelparallel.torch as smp
+ from smdistributed.modelparallel import __version__ as SMP_VERSION
+
+ IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10")
+
+ from transformers.trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat
+else:
+ IS_SAGEMAKER_MP_POST_1_10 = False
+
+
+skip_first_batches = None
+if is_accelerate_available():
+ from accelerate import __version__ as accelerate_version
+
+ if version.parse(accelerate_version) >= version.parse("0.16"):
+ from accelerate import skip_first_batches
+
+
+if TYPE_CHECKING:
+ import optuna
+
+logger = logging.get_logger(__name__)
+
+
+# Name of the files used for checkpointing
+TRAINING_ARGS_NAME = "training_args.bin"
+TRAINER_STATE_NAME = "trainer_state.json"
+OPTIMIZER_NAME = "optimizer.pt"
+SCHEDULER_NAME = "scheduler.pt"
+SCALER_NAME = "scaler.pt"
+
+
+class Trainer:
+ """
+ Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.
+
+ Args:
+ model ([`PreTrainedModel`] or `torch.nn.Module`, *optional*):
+ The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed.
+
+
+
+ [`Trainer`] is optimized to work with the [`PreTrainedModel`] provided by the library. You can still use
+ your own models defined as `torch.nn.Module` as long as they work the same way as the 🤗 Transformers
+ models.
+
+
+
+ args ([`TrainingArguments`], *optional*):
+ The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the
+ `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided.
+ data_collator (`DataCollator`, *optional*):
+ The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will
+ default to [`default_data_collator`] if no `tokenizer` is provided, an instance of
+ [`DataCollatorWithPadding`] otherwise.
+ train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*):
+ The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the
+ `model.forward()` method are automatically removed.
+
+ Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
+            distributed fashion, your iterable dataset should either use an internal attribute `generator` that is a
+ `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
+ manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
+ sets the seed of the RNGs used.
+ eval_dataset (Union[`torch.utils.data.Dataset`, Dict[str, `torch.utils.data.Dataset`]), *optional*):
+ The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the
+ `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
+ dataset prepending the dictionary key to the metric name.
+ tokenizer ([`PreTrainedTokenizerBase`], *optional*):
+ The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the
+ maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
+ interrupted training or reuse the fine-tuned model.
+ model_init (`Callable[[], PreTrainedModel]`, *optional*):
+ A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
+ from a new instance of the model as given by this function.
+
+            The function may have zero arguments, or a single one containing the optuna/Ray Tune/SigOpt trial object, to
+            be able to choose different architectures according to hyperparameters (such as layer count, sizes of
+ inner layers, dropout probabilities etc).
+ compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
+ The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return
+ a dictionary string to metric values.
+ callbacks (List of [`TrainerCallback`], *optional*):
+ A list of callbacks to customize the training loop. Will add those to the list of default callbacks
+ detailed in [here](callback).
+
+ If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method.
+ optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*): A tuple
+ containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your model
+ and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
+ preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
+            A function that preprocesses the logits right before caching them at each evaluation step. Must take two
+ tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
+ by this function will be reflected in the predictions received by `compute_metrics`.
+
+ Note that the labels (second parameter) will be `None` if the dataset does not have them.
+
+ Important attributes:
+
+ - **model** -- Always points to the core model. If using a transformers model, it will be a [`PreTrainedModel`]
+ subclass.
+ - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the
+ original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`,
+ the inner model is wrapped in `DeepSpeed` and then again in `torch.nn.DistributedDataParallel`. If the inner
+ model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`.
+ - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from
+ data parallelism, this means some of the model layers are split on different GPUs).
+ - **place_model_on_device** -- Whether or not to automatically place the model on the device - it will be set
+ to `False` if model parallel or deepspeed is used, or if the default
+ `TrainingArguments.place_model_on_device` is overridden to return `False` .
+ - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while
+ in `train`)
+
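+    Example (a hedged sketch, not part of the upstream API docs; `model`, `training_args`, the
+    datasets, `tokenizer` and `data_collator` are placeholders for whatever the calling script
+    actually builds):
+
+    ```python
+    trainer = Trainer(
+        model=model,                      # e.g. a ChatGLM model with only the prefix encoder trainable
+        args=training_args,
+        train_dataset=train_dataset,      # tokenized prompt/response pairs
+        eval_dataset=eval_dataset,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        save_prefixencoder=True,          # custom flag added in this file; see __init__ below
+    )
+    trainer.train()
+    ```
+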
+ """
+
+ from transformers.trainer_pt_utils import _get_learning_rate, log_metrics, metrics_format, save_metrics, save_state
+
+ def __init__(
+ self,
+ model: Union[PreTrainedModel, nn.Module] = None,
+ args: TrainingArguments = None,
+ data_collator: Optional[DataCollator] = None,
+ train_dataset: Optional[Dataset] = None,
+ eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+ tokenizer: Optional[PreTrainedTokenizerBase] = None,
+ model_init: Optional[Callable[[], PreTrainedModel]] = None,
+ compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
+ callbacks: Optional[List[TrainerCallback]] = None,
+ optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+ preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
+ save_prefixencoder: bool = False,
+ ):
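+        # Custom flag added on top of the upstream Trainer: when True, checkpoint saving is
+        # presumably limited to the P-Tuning v2 prefix-encoder weights rather than the full
+        # model state (consumed by the saving logic further down in this file).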
+ self.save_prefixencoder = save_prefixencoder
+ if args is None:
+ output_dir = "tmp_trainer"
+ logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.")
+ args = TrainingArguments(output_dir=output_dir)
+ self.args = args
+ # Seed must be set before instantiating the model when using model
+ enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
+ self.hp_name = None
+ self.deepspeed = None
+ self.is_in_train = False
+
+ # memory metrics - must set up as early as possible
+ self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
+ self._memory_tracker.start()
+
+ # set the correct log level depending on the node
+ log_level = args.get_process_log_level()
+ logging.set_verbosity(log_level)
+
+ # force device and distributed setup init explicitly
+ args._setup_devices
+
+ if model is None:
+ if model_init is not None:
+ self.model_init = model_init
+ model = self.call_model_init()
+ else:
+ raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument")
+ else:
+ if model_init is not None:
+ warnings.warn(
+ "`Trainer` requires either a `model` or `model_init` argument, but not both. `model_init` will"
+ " overwrite your model when calling the `train` method. This will become a fatal error in the next"
+ " release.",
+ FutureWarning,
+ )
+ self.model_init = model_init
+
+ if model.__class__.__name__ in MODEL_MAPPING_NAMES:
+ raise ValueError(
+ f"The model you have picked ({model.__class__.__name__}) cannot be used as is for training: it only "
+ "computes hidden states and does not accept any labels. You should choose a model with a head "
+ "suitable for your task like any of the `AutoModelForXxx` listed at "
+ "https://huggingface.co/docs/transformers/model_doc/auto."
+ )
+
+ if hasattr(model, "is_parallelizable") and model.is_parallelizable and model.model_parallel:
+ self.is_model_parallel = True
+ else:
+ self.is_model_parallel = False
+
+ # At this stage the model is already loaded
+ if getattr(model, "is_loaded_in_8bit", False):
+ if getattr(model, "_is_int8_training_enabled", False):
+ logger.info(
+                    "The model is loaded in 8-bit precision. To train this model you need to add additional modules"
+                    " inside the model, such as adapters using the `peft` library, and freeze the model weights."
+                    " Please check the examples in https://github.com/huggingface/peft for more details."
+ )
+ else:
+ raise ValueError(
+                    "The model you want to train is loaded in 8-bit precision. If you want to fine-tune an 8-bit"
+                    " model, please make sure that you have installed `bitsandbytes>=0.37.0`."
+ )
+
+ # Setup Sharded DDP training
+ self.sharded_ddp = None
+ if len(args.sharded_ddp) > 0:
+ if args.deepspeed:
+ raise ValueError(
+ "Using --sharded_ddp xxx together with --deepspeed is not possible, deactivate one of those flags."
+ )
+ if len(args.fsdp) > 0:
+ raise ValueError(
+ "Using --sharded_ddp xxx together with --fsdp is not possible, deactivate one of those flags."
+ )
+
+ if args.local_rank == -1:
+ raise ValueError("Using sharded DDP only works in distributed training.")
+ elif not is_fairscale_available():
+ raise ImportError("Sharded DDP training requires fairscale: `pip install fairscale`.")
+ elif ShardedDDPOption.SIMPLE not in args.sharded_ddp and FullyShardedDDP is None:
+ raise ImportError(
+ "Sharded DDP in a mode other than simple training requires fairscale version >= 0.3, found "
+ f"{fairscale.__version__}. Upgrade your fairscale library: `pip install --upgrade fairscale`."
+ )
+ elif ShardedDDPOption.SIMPLE in args.sharded_ddp:
+ self.sharded_ddp = ShardedDDPOption.SIMPLE
+ elif ShardedDDPOption.ZERO_DP_2 in args.sharded_ddp:
+ self.sharded_ddp = ShardedDDPOption.ZERO_DP_2
+ elif ShardedDDPOption.ZERO_DP_3 in args.sharded_ddp:
+ self.sharded_ddp = ShardedDDPOption.ZERO_DP_3
+
+ self.fsdp = None
+ if len(args.fsdp) > 0:
+ if args.deepspeed:
+ raise ValueError(
+ "Using --fsdp xxx together with --deepspeed is not possible, deactivate one of those flags."
+ )
+ if not args.fsdp_config["xla"] and args.local_rank == -1:
+ raise ValueError("Using fsdp only works in distributed training.")
+
+ # dep_version_check("torch>=1.12.0")
+ # Would have to update setup.py with torch>=1.12.0
+            # which isn't ideal, given that it would force people not using FSDP to also use torch>=1.12.0
+ # below is the current alternative.
+ if version.parse(version.parse(torch.__version__).base_version) < version.parse("1.12.0"):
+ raise ValueError("FSDP requires PyTorch >= 1.12.0")
+
+ from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch, ShardingStrategy
+
+ if FSDPOption.FULL_SHARD in args.fsdp:
+ self.fsdp = ShardingStrategy.FULL_SHARD
+ elif FSDPOption.SHARD_GRAD_OP in args.fsdp:
+ self.fsdp = ShardingStrategy.SHARD_GRAD_OP
+ elif FSDPOption.NO_SHARD in args.fsdp:
+ self.fsdp = ShardingStrategy.NO_SHARD
+
+ self.backward_prefetch = BackwardPrefetch.BACKWARD_PRE
+            # Select BACKWARD_POST when the fsdp_config explicitly asks for it; the default above stays BACKWARD_PRE.
+            if "backward_post" in str(self.args.fsdp_config.get("backward_prefetch", "")).lower():
+                self.backward_prefetch = BackwardPrefetch.BACKWARD_POST
+
+ self.forword_prefetch = False
+ if self.args.fsdp_config.get("forword_prefect", False):
+ self.forword_prefetch = True
+
+ self.limit_all_gathers = False
+ if self.args.fsdp_config.get("limit_all_gathers", False):
+ self.limit_all_gathers = True
+
+ # one place to sort out whether to place the model on device or not
+ # postpone switching model to cuda when:
+ # 1. MP - since we are trying to fit a much bigger than 1 gpu model
+ # 2. fp16-enabled DeepSpeed loads the model in half the size and it doesn't need .to() anyway,
+ # and we only use deepspeed for training at the moment
+ # 3. full bf16 or fp16 eval - since the model needs to be cast to the right dtype first
+ # 4. Sharded DDP - same as MP
+ # 5. FSDP - same as MP
+ self.place_model_on_device = args.place_model_on_device
+ if (
+ self.is_model_parallel
+ or args.deepspeed
+ or ((args.fp16_full_eval or args.bf16_full_eval) and not args.do_train)
+ or (self.sharded_ddp in [ShardedDDPOption.ZERO_DP_2, ShardedDDPOption.ZERO_DP_3])
+ or (self.fsdp is not None)
+ ):
+ self.place_model_on_device = False
+
+ default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)
+ self.data_collator = data_collator if data_collator is not None else default_collator
+ self.train_dataset = train_dataset
+ self.eval_dataset = eval_dataset
+ self.tokenizer = tokenizer
+
+ if self.place_model_on_device and not getattr(model, "is_loaded_in_8bit", False):
+ self._move_model_to_device(model, args.device)
+
+ # Force n_gpu to 1 to avoid DataParallel as MP will manage the GPUs
+ if self.is_model_parallel:
+ self.args._n_gpu = 1
+
+ # later use `self.model is self.model_wrapped` to check if it's wrapped or not
+ self.model_wrapped = model
+ self.model = model
+
+ self.compute_metrics = compute_metrics
+ self.preprocess_logits_for_metrics = preprocess_logits_for_metrics
+ self.optimizer, self.lr_scheduler = optimizers
+ if model_init is not None and (self.optimizer is not None or self.lr_scheduler is not None):
+ raise RuntimeError(
+ "Passing a `model_init` is incompatible with providing the `optimizers` argument. "
+ "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
+ )
+ if is_torch_tpu_available() and self.optimizer is not None:
+ for param in self.model.parameters():
+ model_device = param.device
+ break
+ for param_group in self.optimizer.param_groups:
+ if len(param_group["params"]) > 0:
+ optimizer_device = param_group["params"][0].device
+ break
+ if model_device != optimizer_device:
+ raise ValueError(
+ "The model and the optimizer parameters are not on the same device, which probably means you"
+                    " created an optimizer around your model **before** putting it on the device and passing it to"
+                    " the `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and"
+                    " `model.to(xm.xla_device())` are performed before the optimizer creation in your script."
+ )
+ if ((self.sharded_ddp is not None) or args.deepspeed or (self.fsdp is not None)) and (
+ self.optimizer is not None or self.lr_scheduler is not None
+ ):
+ raise RuntimeError(
+ "Passing `optimizers` is not allowed if Fairscale, Deepspeed or PyTorch FSDP is enabled."
+ "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
+ )
+ default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
+ callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
+ self.callback_handler = CallbackHandler(
+ callbacks, self.model, self.tokenizer, self.optimizer, self.lr_scheduler
+ )
+ self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
+
+ # Will be set to True by `self._setup_loggers()` on first call to `self.log()`.
+ self._loggers_initialized = False
+
+ # Create clone of distant repo and output directory if needed
+ if self.args.push_to_hub:
+ self.init_git_repo(at_init=True)
+ # In case of pull, we need to make sure every process has the latest.
+ if is_torch_tpu_available():
+ xm.rendezvous("init git repo")
+ elif args.local_rank != -1:
+ dist.barrier()
+
+ if self.args.should_save:
+ os.makedirs(self.args.output_dir, exist_ok=True)
+
+ if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)):
+ raise ValueError("The `data_collator` should be a simple callable (function, class with `__call__`).")
+
+ if args.max_steps > 0:
+ logger.info("max_steps is given, it will override any value given in num_train_epochs")
+
+ if train_dataset is not None and not has_length(train_dataset) and args.max_steps <= 0:
+ raise ValueError("train_dataset does not implement __len__, max_steps has to be specified")
+
+ if (
+ train_dataset is not None
+ and isinstance(train_dataset, torch.utils.data.IterableDataset)
+ and args.group_by_length
+ ):
+            raise ValueError("the `--group_by_length` option is only available for `Dataset`, not `IterableDataset`")
+
+ self._signature_columns = None
+
+ # Mixed precision setup
+ self.use_apex = False
+ self.use_cuda_amp = False
+ self.use_cpu_amp = False
+
+ # Mixed precision setup for SageMaker Model Parallel
+ if is_sagemaker_mp_enabled():
+ # BF16 + model parallelism in SageMaker: currently not supported, raise an error
+ if args.bf16:
+ raise ValueError("SageMaker Model Parallelism does not support BF16 yet. Please use FP16 instead ")
+
+ if IS_SAGEMAKER_MP_POST_1_10:
+ # When there's mismatch between SMP config and trainer argument, use SMP config as truth
+ if args.fp16 != smp.state.cfg.fp16:
+ logger.warning(
+ f"FP16 provided in SM_HP_MP_PARAMETERS is {smp.state.cfg.fp16},"
+ f"but FP16 provided in trainer argument is {args.fp16},"
+ f"setting to {smp.state.cfg.fp16}"
+ )
+ args.fp16 = smp.state.cfg.fp16
+ else:
+ # smp < 1.10 does not support fp16 in trainer.
+ if hasattr(smp.state.cfg, "fp16"):
+ logger.warning(
+ f"FP16 provided in SM_HP_MP_PARAMETERS is {smp.state.cfg.fp16}, "
+ "but SageMaker Model Parallelism < 1.10 does not support FP16 in trainer."
+ )
+
+ if args.fp16 or args.bf16:
+ if args.half_precision_backend == "auto":
+ if args.device == torch.device("cpu"):
+ if args.fp16:
+ raise ValueError("Tried to use `fp16` but it is not supported on cpu")
+ elif _is_native_cpu_amp_available:
+ args.half_precision_backend = "cpu_amp"
+ else:
+ raise ValueError("Tried to use cpu amp but native cpu amp is not available")
+ else:
+ args.half_precision_backend = "cuda_amp"
+
+ logger.info(f"Using {args.half_precision_backend} half precision backend")
+
+ self.do_grad_scaling = False
+ if (args.fp16 or args.bf16) and not (args.deepspeed or is_sagemaker_mp_enabled() or is_torch_tpu_available()):
+ # deepspeed and SageMaker Model Parallel manage their own half precision
+ if args.half_precision_backend == "cuda_amp":
+ self.use_cuda_amp = True
+ self.amp_dtype = torch.float16 if args.fp16 else torch.bfloat16
+ # bf16 does not need grad scaling
+ self.do_grad_scaling = self.amp_dtype == torch.float16
+ if self.do_grad_scaling:
+ if self.sharded_ddp is not None:
+ self.scaler = ShardedGradScaler()
+ elif self.fsdp is not None:
+ from torch.distributed.fsdp.sharded_grad_scaler import (
+ ShardedGradScaler as FSDPShardedGradScaler,
+ )
+
+ self.scaler = FSDPShardedGradScaler()
+ elif is_torch_tpu_available():
+ from torch_xla.amp import GradScaler
+
+ self.scaler = GradScaler()
+ else:
+ self.scaler = torch.cuda.amp.GradScaler()
+ elif args.half_precision_backend == "cpu_amp":
+ self.use_cpu_amp = True
+ self.amp_dtype = torch.bfloat16
+ else:
+ if not is_apex_available():
+ raise ImportError(
+ "Using FP16 with APEX but APEX is not installed, please refer to"
+ " https://www.github.com/nvidia/apex."
+ )
+ self.use_apex = True
+
+ # FP16 + model parallelism in SageMaker: gradient clipping does not work for now so we raise a helpful error.
+ if (
+ is_sagemaker_mp_enabled()
+ and self.use_cuda_amp
+ and args.max_grad_norm is not None
+ and args.max_grad_norm > 0
+ ):
+ raise ValueError(
+ "SageMaker Model Parallelism in mixed precision mode does not support gradient clipping yet. Pass "
+ "along 'max_grad_norm': 0 in your hyperparameters."
+ )
+
+ # Label smoothing
+ if self.args.label_smoothing_factor != 0:
+ self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor)
+ else:
+ self.label_smoother = None
+
+ self.state = TrainerState(
+ is_local_process_zero=self.is_local_process_zero(),
+ is_world_process_zero=self.is_world_process_zero(),
+ )
+
+ self.control = TrainerControl()
+ # Internal variable to count flos in each process, will be accumulated in `self.state.total_flos` then
+ # returned to 0 every time flos need to be logged
+ self.current_flos = 0
+ self.hp_search_backend = None
+ self.use_tune_checkpoints = False
+ default_label_names = find_labels(self.model.__class__)
+ self.label_names = default_label_names if self.args.label_names is None else self.args.label_names
+ self.can_return_loss = can_return_loss(self.model.__class__)
+ self.control = self.callback_handler.on_init_end(self.args, self.state, self.control)
+
+ # Internal variables to keep track of the original batch size
+ self._train_batch_size = args.train_batch_size
+
+ # very last
+ self._memory_tracker.stop_and_update_metrics()
+
+ # torch.compile
+ if args.torch_compile and not is_torch_compile_available():
+ raise RuntimeError("Using torch.compile requires PyTorch 2.0 or higher.")
+
+ def add_callback(self, callback):
+ """
+ Add a callback to the current list of [`~transformer.TrainerCallback`].
+
+ Args:
+ callback (`type` or [`~transformer.TrainerCallback`]):
+ A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the
+ first case, will instantiate a member of that class.
+ """
+ self.callback_handler.add_callback(callback)
+
+ def pop_callback(self, callback):
+ """
+ Remove a callback from the current list of [`~transformer.TrainerCallback`] and returns it.
+
+ If the callback is not found, returns `None` (and no error is raised).
+
+ Args:
+ callback (`type` or [`~transformer.TrainerCallback`]):
+ A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the
+ first case, will pop the first member of that class found in the list of callbacks.
+
+ Returns:
+ [`~transformer.TrainerCallback`]: The callback removed, if found.
+ """
+ return self.callback_handler.pop_callback(callback)
+
+ def remove_callback(self, callback):
+ """
+ Remove a callback from the current list of [`~transformer.TrainerCallback`].
+
+ Args:
+ callback (`type` or [`~transformer.TrainerCallback`]):
+ A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the
+ first case, will remove the first member of that class found in the list of callbacks.
+ """
+ self.callback_handler.remove_callback(callback)
+
+ def _move_model_to_device(self, model, device):
+ model = model.to(device)
+ # Moving a model to an XLA device disconnects the tied weights, so we have to retie them.
+ if self.args.parallel_mode == ParallelMode.TPU and hasattr(model, "tie_weights"):
+ model.tie_weights()
+
+ def _set_signature_columns_if_needed(self):
+ if self._signature_columns is None:
+ # Inspect model forward signature to keep only the arguments it accepts.
+ signature = inspect.signature(self.model.forward)
+ self._signature_columns = list(signature.parameters.keys())
+ # Labels may be named label or label_ids, the default data collator handles that.
+ self._signature_columns += list(set(["label", "label_ids"] + self.label_names))
+
+ def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None):
+ if not self.args.remove_unused_columns:
+ return dataset
+ self._set_signature_columns_if_needed()
+ signature_columns = self._signature_columns
+
+ ignored_columns = list(set(dataset.column_names) - set(signature_columns))
+ if len(ignored_columns) > 0:
+ dset_description = "" if description is None else f"in the {description} set"
+ logger.info(
+ f"The following columns {dset_description} don't have a corresponding argument in "
+ f"`{self.model.__class__.__name__}.forward` and have been ignored: {', '.join(ignored_columns)}."
+ f" If {', '.join(ignored_columns)} are not expected by `{self.model.__class__.__name__}.forward`, "
+ " you can safely ignore this message."
+ )
+
+ columns = [k for k in signature_columns if k in dataset.column_names]
+
+ if version.parse(datasets.__version__) < version.parse("1.4.0"):
+ dataset.set_format(
+ type=dataset.format["type"], columns=columns, format_kwargs=dataset.format["format_kwargs"]
+ )
+ return dataset
+ else:
+ return dataset.remove_columns(ignored_columns)
+
+ def _get_collator_with_removed_columns(
+ self, data_collator: Callable, description: Optional[str] = None
+ ) -> Callable:
+ """Wrap the data collator in a callable removing unused columns."""
+ if not self.args.remove_unused_columns:
+ return data_collator
+ self._set_signature_columns_if_needed()
+ signature_columns = self._signature_columns
+
+ remove_columns_collator = RemoveColumnsCollator(
+ data_collator=data_collator,
+ signature_columns=signature_columns,
+ logger=logger,
+ description=description,
+ model_name=self.model.__class__.__name__,
+ )
+ return remove_columns_collator
+
+ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
+ if self.train_dataset is None or not has_length(self.train_dataset):
+ return None
+
+ generator = None
+ if self.args.world_size <= 1:
+ generator = torch.Generator()
+ # for backwards compatibility, we generate a seed here (which is sampled from a generator seeded with
+ # `args.seed`) if data_seed isn't provided.
+ # Further on in this method, we default to `args.seed` instead.
+ if self.args.data_seed is None:
+ seed = int(torch.empty((), dtype=torch.int64).random_().item())
+ else:
+ seed = self.args.data_seed
+ generator.manual_seed(seed)
+
+ seed = self.args.data_seed if self.args.data_seed is not None else self.args.seed
+
+ # Build the sampler.
+ if self.args.group_by_length:
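+ # `group_by_length` batches samples of roughly similar length together to reduce padding.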
+ if is_datasets_available() and isinstance(self.train_dataset, datasets.Dataset):
+ lengths = (
+ self.train_dataset[self.args.length_column_name]
+ if self.args.length_column_name in self.train_dataset.column_names
+ else None
+ )
+ else:
+ lengths = None
+ model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None
+ if self.args.world_size <= 1:
+ return LengthGroupedSampler(
+ self.args.train_batch_size * self.args.gradient_accumulation_steps,
+ dataset=self.train_dataset,
+ lengths=lengths,
+ model_input_name=model_input_name,
+ generator=generator,
+ )
+ else:
+ return DistributedLengthGroupedSampler(
+ self.args.train_batch_size * self.args.gradient_accumulation_steps,
+ dataset=self.train_dataset,
+ num_replicas=self.args.world_size,
+ rank=self.args.process_index,
+ lengths=lengths,
+ model_input_name=model_input_name,
+ seed=seed,
+ )
+
+ else:
+ if self.args.world_size <= 1:
+ return RandomSampler(self.train_dataset, generator=generator)
+ elif (
+ self.args.parallel_mode in [ParallelMode.TPU, ParallelMode.SAGEMAKER_MODEL_PARALLEL]
+ and not self.args.dataloader_drop_last
+ ):
+ # Use a loop for TPUs when drop_last is False so that all batches have the same size.
+ return DistributedSamplerWithLoop(
+ self.train_dataset,
+ batch_size=self.args.per_device_train_batch_size,
+ num_replicas=self.args.world_size,
+ rank=self.args.process_index,
+ seed=seed,
+ )
+ else:
+ return DistributedSampler(
+ self.train_dataset,
+ num_replicas=self.args.world_size,
+ rank=self.args.process_index,
+ seed=seed,
+ )
+
+ def get_train_dataloader(self) -> DataLoader:
+ """
+ Returns the training [`~torch.utils.data.DataLoader`].
+
+ Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
+ training if necessary) otherwise.
+
+ Subclass and override this method if you want to inject some custom behavior.
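+
+ Example (a minimal sketch of a hypothetical override; `MyTrainer` and the fixed weighting rule are
+ illustrative assumptions, not part of the library):
+
+ ```python
+ from torch.utils.data import DataLoader, WeightedRandomSampler
+
+ class MyTrainer(Trainer):
+     def get_train_dataloader(self) -> DataLoader:
+         # Oversample examples with label 1; the weights are purely an example.
+         weights = [2.0 if ex["label"] == 1 else 1.0 for ex in self.train_dataset]
+         sampler = WeightedRandomSampler(weights, num_samples=len(weights))
+         return DataLoader(
+             self.train_dataset,
+             batch_size=self._train_batch_size,
+             sampler=sampler,
+             collate_fn=self.data_collator,
+             num_workers=self.args.dataloader_num_workers,
+         )
+ ```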
+ """
+ if self.train_dataset is None:
+ raise ValueError("Trainer: training requires a train_dataset.")
+
+ train_dataset = self.train_dataset
+ data_collator = self.data_collator
+ if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
+ train_dataset = self._remove_unused_columns(train_dataset, description="training")
+ else:
+ data_collator = self._get_collator_with_removed_columns(data_collator, description="training")
+
+ if isinstance(train_dataset, torch.utils.data.IterableDataset):
+ if self.args.world_size > 1:
+ train_dataset = IterableDatasetShard(
+ train_dataset,
+ batch_size=self._train_batch_size,
+ drop_last=self.args.dataloader_drop_last,
+ num_processes=self.args.world_size,
+ process_index=self.args.process_index,
+ )
+
+ return DataLoader(
+ train_dataset,
+ batch_size=self._train_batch_size,
+ collate_fn=data_collator,
+ num_workers=self.args.dataloader_num_workers,
+ pin_memory=self.args.dataloader_pin_memory,
+ )
+
+ train_sampler = self._get_train_sampler()
+
+ return DataLoader(
+ train_dataset,
+ batch_size=self._train_batch_size,
+ sampler=train_sampler,
+ collate_fn=data_collator,
+ drop_last=self.args.dataloader_drop_last,
+ num_workers=self.args.dataloader_num_workers,
+ pin_memory=self.args.dataloader_pin_memory,
+ worker_init_fn=seed_worker,
+ )
+
+ def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.Sampler]:
+ # Deprecated code
+ if self.args.use_legacy_prediction_loop:
+ if is_torch_tpu_available():
+ return SequentialDistributedSampler(
+ eval_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()
+ )
+ elif is_sagemaker_mp_enabled():
+ return SequentialDistributedSampler(
+ eval_dataset,
+ num_replicas=smp.dp_size(),
+ rank=smp.dp_rank(),
+ batch_size=self.args.per_device_eval_batch_size,
+ )
+ elif self.args.local_rank != -1:
+ return SequentialDistributedSampler(eval_dataset)
+ else:
+ return SequentialSampler(eval_dataset)
+
+ if self.args.world_size <= 1:
+ return SequentialSampler(eval_dataset)
+ else:
+ return ShardSampler(
+ eval_dataset,
+ batch_size=self.args.per_device_eval_batch_size,
+ num_processes=self.args.world_size,
+ process_index=self.args.process_index,
+ )
+
+ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
+ """
+ Returns the evaluation [`~torch.utils.data.DataLoader`].
+
+ Subclass and override this method if you want to inject some custom behavior.
+
+ Args:
+ eval_dataset (`torch.utils.data.Dataset`, *optional*):
+ If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted
+ by the `model.forward()` method are automatically removed. It must implement `__len__`.
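+
+ Example (a minimal usage sketch; `other_eval_dataset` is a hypothetical map-style dataset):
+
+ ```python
+ eval_dataloader = trainer.get_eval_dataloader(other_eval_dataset)
+ for batch in eval_dataloader:
+     ...
+ ```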
+ """
+ if eval_dataset is None and self.eval_dataset is None:
+ raise ValueError("Trainer: evaluation requires an eval_dataset.")
+ eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+ data_collator = self.data_collator
+
+ if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
+ eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation")
+ else:
+ data_collator = self._get_collator_with_removed_columns(data_collator, description="evaluation")
+
+ if isinstance(eval_dataset, torch.utils.data.IterableDataset):
+ if self.args.world_size > 1:
+ eval_dataset = IterableDatasetShard(
+ eval_dataset,
+ batch_size=self.args.per_device_eval_batch_size,
+ drop_last=self.args.dataloader_drop_last,
+ num_processes=self.args.world_size,
+ process_index=self.args.process_index,
+ )
+ return DataLoader(
+ eval_dataset,
+ batch_size=self.args.eval_batch_size,
+ collate_fn=data_collator,
+ num_workers=self.args.dataloader_num_workers,
+ pin_memory=self.args.dataloader_pin_memory,
+ )
+
+ eval_sampler = self._get_eval_sampler(eval_dataset)
+
+ return DataLoader(
+ eval_dataset,
+ sampler=eval_sampler,
+ batch_size=self.args.eval_batch_size,
+ collate_fn=data_collator,
+ drop_last=self.args.dataloader_drop_last,
+ num_workers=self.args.dataloader_num_workers,
+ pin_memory=self.args.dataloader_pin_memory,
+ )
+
+ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
+ """
+ Returns the test [`~torch.utils.data.DataLoader`].
+
+ Subclass and override this method if you want to inject some custom behavior.
+
+ Args:
+ test_dataset (`torch.utils.data.Dataset`):
+ The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the
+ `model.forward()` method are automatically removed. It must implement `__len__`.
+ """
+ data_collator = self.data_collator
+
+ if is_datasets_available() and isinstance(test_dataset, datasets.Dataset):
+ test_dataset = self._remove_unused_columns(test_dataset, description="test")
+ else:
+ data_collator = self._get_collator_with_removed_columns(data_collator, description="test")
+
+ if isinstance(test_dataset, torch.utils.data.IterableDataset):
+ if self.args.world_size > 1:
+ test_dataset = IterableDatasetShard(
+ test_dataset,
+ batch_size=self.args.eval_batch_size,
+ drop_last=self.args.dataloader_drop_last,
+ num_processes=self.args.world_size,
+ process_index=self.args.process_index,
+ )
+ return DataLoader(
+ test_dataset,
+ batch_size=self.args.eval_batch_size,
+ collate_fn=data_collator,
+ num_workers=self.args.dataloader_num_workers,
+ pin_memory=self.args.dataloader_pin_memory,
+ )
+
+ test_sampler = self._get_eval_sampler(test_dataset)
+
+ # We use the same batch_size as for eval.
+ return DataLoader(
+ test_dataset,
+ sampler=test_sampler,
+ batch_size=self.args.eval_batch_size,
+ collate_fn=data_collator,
+ drop_last=self.args.dataloader_drop_last,
+ num_workers=self.args.dataloader_num_workers,
+ pin_memory=self.args.dataloader_pin_memory,
+ )
+
+ def create_optimizer_and_scheduler(self, num_training_steps: int):
+ """
+ Setup the optimizer and the learning rate scheduler.
+
+ We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
+ Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or
+ `create_scheduler`) in a subclass.
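+
+ Example (a minimal sketch of the `optimizers` alternative; `model`, `training_args` and `train_ds` are
+ assumed to be defined elsewhere):
+
+ ```python
+ import torch
+
+ optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
+ lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda step: 1.0)
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_ds,
+     optimizers=(optimizer, lr_scheduler),
+ )
+ ```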
+ """
+ self.create_optimizer()
+ if IS_SAGEMAKER_MP_POST_1_10 and smp.state.cfg.fp16:
+ # If smp >= 1.10 and fp16 is enabled, we unwrap the optimizer
+ optimizer = self.optimizer.optimizer
+ else:
+ optimizer = self.optimizer
+ self.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)
+
+ def create_optimizer(self):
+ """
+ Setup the optimizer.
+
+ We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
+ Trainer's init through `optimizers`, or subclass and override this method in a subclass.
+ """
+ opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
+
+ if self.optimizer is None:
+ decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
+ decay_parameters = [name for name in decay_parameters if "bias" not in name]
+ optimizer_grouped_parameters = [
+ {
+ "params": [
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
+ ],
+ "weight_decay": self.args.weight_decay,
+ },
+ {
+ "params": [
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
+ ],
+ "weight_decay": 0.0,
+ },
+ ]
+
+ optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
+
+ if self.sharded_ddp == ShardedDDPOption.SIMPLE:
+ self.optimizer = OSS(
+ params=optimizer_grouped_parameters,
+ optim=optimizer_cls,
+ **optimizer_kwargs,
+ )
+ else:
+ self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
+ if optimizer_cls.__name__ == "Adam8bit":
+ import bitsandbytes
+
+ manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
+
+ skipped = 0
+ for module in opt_model.modules():
+ if isinstance(module, nn.Embedding):
+ skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
+ print(f"skipped {module}: {skipped/2**20}M params")
+ manager.register_module_override(module, "weight", {"optim_bits": 32})
+ logger.debug(f"bitsandbytes: will optimize {module} in fp32")
+ print(f"skipped: {skipped/2**20}M params")
+
+ if is_sagemaker_mp_enabled():
+ self.optimizer = smp.DistributedOptimizer(self.optimizer)
+
+ return self.optimizer
+
+ @staticmethod
+ def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]:
+ """
+ Returns the optimizer class and optimizer parameters based on the training arguments.
+
+ Args:
+ args (`transformers.training_args.TrainingArguments`):
+ The training arguments for the training session.
+
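+ Example (a minimal usage sketch; `training_args` and `model` are assumed to be defined elsewhere):
+
+ ```python
+ optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+ optimizer = optimizer_cls(model.parameters(), **optimizer_kwargs)
+ ```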
+ """
+
+ # parse args.optim_args
+ optim_args = {}
+ if args.optim_args:
+ for mapping in args.optim_args.replace(" ", "").split(","):
+ key, value = mapping.split("=")
+ optim_args[key] = value
+
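+ # The parsing above turns a hypothetical `args.optim_args = "use_kahan_summation=True,momentum_dtype=bfloat16"`
+ # into {"use_kahan_summation": "True", "momentum_dtype": "bfloat16"} (the values stay strings at this point).
+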
+ optimizer_kwargs = {"lr": args.learning_rate}
+
+ adam_kwargs = {
+ "betas": (args.adam_beta1, args.adam_beta2),
+ "eps": args.adam_epsilon,
+ }
+ if args.optim == OptimizerNames.ADAFACTOR:
+ optimizer_cls = Adafactor
+ optimizer_kwargs.update({"scale_parameter": False, "relative_step": False})
+ elif args.optim == OptimizerNames.ADAMW_HF:
+ from transformers.optimization import AdamW
+
+ optimizer_cls = AdamW
+ optimizer_kwargs.update(adam_kwargs)
+ elif args.optim in [OptimizerNames.ADAMW_TORCH, OptimizerNames.ADAMW_TORCH_FUSED]:
+ from torch.optim import AdamW
+
+ optimizer_cls = AdamW
+ optimizer_kwargs.update(adam_kwargs)
+ if args.optim == OptimizerNames.ADAMW_TORCH_FUSED:
+ optimizer_kwargs.update({"fused": True})
+ elif args.optim == OptimizerNames.ADAMW_TORCH_XLA:
+ try:
+ from torch_xla.amp.syncfree import AdamW
+
+ optimizer_cls = AdamW
+ optimizer_kwargs.update(adam_kwargs)
+ except ImportError:
+ raise ValueError("Trainer failed to import syncfree AdamW from torch_xla.")
+ elif args.optim == OptimizerNames.ADAMW_APEX_FUSED:
+ try:
+ from apex.optimizers import FusedAdam
+
+ optimizer_cls = FusedAdam
+ optimizer_kwargs.update(adam_kwargs)
+ except ImportError:
+ raise ValueError("Trainer tried to instantiate apex FusedAdam but apex is not installed!")
+ elif args.optim == OptimizerNames.ADAMW_BNB:
+ try:
+ from bitsandbytes.optim import Adam8bit
+
+ optimizer_cls = Adam8bit
+ optimizer_kwargs.update(adam_kwargs)
+ except ImportError:
+ raise ValueError("Trainer tried to instantiate bnb Adam8bit but bnb is not installed!")
+ elif args.optim == OptimizerNames.ADAMW_ANYPRECISION:
+ try:
+ from torchdistx.optimizers import AnyPrecisionAdamW
+
+ optimizer_cls = AnyPrecisionAdamW
+ optimizer_kwargs.update(adam_kwargs)
+
+ # TODO Change dtypes back to M=FP32, Var = BF16, Kahan = False once they can be cast together in torchdistx.
+ optimizer_kwargs.update(
+ {
+ "use_kahan_summation": strtobool(optim_args.get("use_kahan_summation", "False")),
+ "momentum_dtype": getattr(torch, optim_args.get("momentum_dtype", "float32")),
+ "variance_dtype": getattr(torch, optim_args.get("variance_dtype", "float32")),
+ "compensation_buffer_dtype": getattr(
+ torch, optim_args.get("compensation_buffer_dtype", "bfloat16")
+ ),
+ }
+ )
+ except ImportError:
+ raise ValueError("Please install https://github.com/pytorch/torchdistx")
+ elif args.optim == OptimizerNames.SGD:
+ optimizer_cls = torch.optim.SGD
+ elif args.optim == OptimizerNames.ADAGRAD:
+ optimizer_cls = torch.optim.Adagrad
+ else:
+ raise ValueError(f"Trainer cannot instantiate unsupported optimizer: {args.optim}")
+ return optimizer_cls, optimizer_kwargs
+
+ def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None):
+ """
+ Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
+ passed as an argument.
+
+ Args:
+ num_training_steps (int): The number of training steps to do.
+ """
+ if self.lr_scheduler is None:
+ self.lr_scheduler = get_scheduler(
+ self.args.lr_scheduler_type,
+ optimizer=self.optimizer if optimizer is None else optimizer,
+ num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
+ num_training_steps=num_training_steps,
+ )
+ return self.lr_scheduler
+
+ def num_examples(self, dataloader: DataLoader) -> int:
+ """
+ Helper to get the number of samples in a [`~torch.utils.data.DataLoader`] by accessing its dataset. When
+ dataloader.dataset does not exist or has no length, estimates as best it can.
+ """
+ try:
+ dataset = dataloader.dataset
+ # Special case for IterableDatasetShard, we need to dig deeper
+ if isinstance(dataset, IterableDatasetShard):
+ return len(dataloader.dataset.dataset)
+ return len(dataloader.dataset)
+ except (NameError, AttributeError, TypeError): # no dataset or length, estimate by length of dataloader
+ return len(dataloader) * self.args.per_device_train_batch_size
+
+ def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]):
+ """HP search setup code"""
+ self._trial = trial
+
+ if self.hp_search_backend is None or trial is None:
+ return
+ if self.hp_search_backend == HPSearchBackend.OPTUNA:
+ params = self.hp_space(trial)
+ elif self.hp_search_backend == HPSearchBackend.RAY:
+ params = trial
+ params.pop("wandb", None)
+ elif self.hp_search_backend == HPSearchBackend.SIGOPT:
+ params = {k: int(v) if isinstance(v, str) else v for k, v in trial.assignments.items()}
+ elif self.hp_search_backend == HPSearchBackend.WANDB:
+ params = trial
+
+ for key, value in params.items():
+ if not hasattr(self.args, key):
+ logger.warning(
+ f"Trying to set {key} in the hyperparameter search but there is no corresponding field in"
+ " `TrainingArguments`."
+ )
+ continue
+ old_attr = getattr(self.args, key, None)
+ # Casting value to the proper type
+ if old_attr is not None:
+ value = type(old_attr)(value)
+ setattr(self.args, key, value)
+ if self.hp_search_backend == HPSearchBackend.OPTUNA:
+ logger.info(f"Trial: {trial.params}")
+ if self.hp_search_backend == HPSearchBackend.SIGOPT:
+ logger.info(f"SigOpt Assignments: {trial.assignments}")
+ if self.hp_search_backend == HPSearchBackend.WANDB:
+ logger.info(f"W&B Sweep parameters: {trial}")
+ if self.args.deepspeed:
+ # Rebuild the deepspeed config to reflect the updated training parameters
+ from transformers.deepspeed import HfTrainerDeepSpeedConfig
+
+ self.args.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.args.deepspeed)
+ self.args.hf_deepspeed_config.trainer_config_process(self.args)
+
+ def _report_to_hp_search(self, trial: Union["optuna.Trial", Dict[str, Any]], step: int, metrics: Dict[str, float]):
+ if self.hp_search_backend is None or trial is None:
+ return
+ self.objective = self.compute_objective(metrics.copy())
+ if self.hp_search_backend == HPSearchBackend.OPTUNA:
+ import optuna
+
+ trial.report(self.objective, step)
+ if trial.should_prune():
+ self.callback_handler.on_train_end(self.args, self.state, self.control)
+ raise optuna.TrialPruned()
+ elif self.hp_search_backend == HPSearchBackend.RAY:
+ from ray import tune
+
+ if self.control.should_save:
+ self._tune_save_checkpoint()
+ tune.report(objective=self.objective, **metrics)
+
+ def _tune_save_checkpoint(self):
+ from ray import tune
+
+ if not self.use_tune_checkpoints:
+ return
+ with tune.checkpoint_dir(step=self.state.global_step) as checkpoint_dir:
+ output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
+ self.save_model(output_dir, _internal_call=True)
+ if self.args.should_save:
+ self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
+ torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
+ torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
+
+ def call_model_init(self, trial=None):
+ model_init_argcount = number_of_arguments(self.model_init)
+ if model_init_argcount == 0:
+ model = self.model_init()
+ elif model_init_argcount == 1:
+ model = self.model_init(trial)
+ else:
+ raise RuntimeError("model_init should have 0 or 1 argument.")
+
+ if model is None:
+ raise RuntimeError("model_init should not return None.")
+
+ return model
+
+ def torch_jit_model_eval(self, model, dataloader, training=False):
+ if not training:
+ if dataloader is None:
+ logger.warning("failed to use PyTorch jit mode due to current dataloader is none.")
+ return model
+ example_batch = next(iter(dataloader))
+ example_batch = self._prepare_inputs(example_batch)
+ try:
+ jit_model = model.eval()
+ with ContextManagers([self.autocast_smart_context_manager(cache_enabled=False), torch.no_grad()]):
+ if version.parse(version.parse(torch.__version__).base_version) >= version.parse("1.14.0"):
+ if isinstance(example_batch, dict):
+ jit_model = torch.jit.trace(jit_model, example_kwarg_inputs=example_batch, strict=False)
+ else:
+ jit_model = torch.jit.trace(
+ jit_model,
+ example_kwarg_inputs={key: example_batch[key] for key in example_batch},
+ strict=False,
+ )
+ else:
+ jit_inputs = []
+ for key in example_batch:
+ example_tensor = torch.ones_like(example_batch[key])
+ jit_inputs.append(example_tensor)
+ jit_inputs = tuple(jit_inputs)
+ jit_model = torch.jit.trace(jit_model, jit_inputs, strict=False)
+ jit_model = torch.jit.freeze(jit_model)
+ with torch.no_grad():
+ jit_model(**example_batch)
+ jit_model(**example_batch)
+ model = jit_model
+ self.use_cpu_amp = False
+ self.use_cuda_amp = False
+ except (RuntimeError, TypeError, ValueError, NameError, IndexError) as e:
+ logger.warning(f"failed to use PyTorch jit mode due to: {e}.")
+
+ return model
+
+ def ipex_optimize_model(self, model, training=False, dtype=torch.float32):
+ if not is_ipex_available():
+ raise ImportError(
+ "Using IPEX but IPEX is not installed or IPEX's version does not match current PyTorch, please refer"
+ " to https://github.com/intel/intel-extension-for-pytorch."
+ )
+
+ import intel_extension_for_pytorch as ipex
+
+ if not training:
+ model.eval()
+ dtype = torch.bfloat16 if not self.is_in_train and self.args.bf16_full_eval else dtype
+ # conv_bn_folding is disabled as it fails in symbolic tracing, resulting in ipex warnings
+ model = ipex.optimize(model, dtype=dtype, level="O1", conv_bn_folding=False, inplace=not self.is_in_train)
+ else:
+ if not model.training:
+ model.train()
+ model, self.optimizer = ipex.optimize(
+ model, dtype=dtype, optimizer=self.optimizer, inplace=True, level="O1"
+ )
+
+ return model
+
+ def _wrap_model(self, model, training=True, dataloader=None):
+ if self.args.torch_compile:
+ model = torch.compile(model, backend=self.args.torch_compile_backend, mode=self.args.torch_compile_mode)
+
+ if self.args.use_ipex:
+ dtype = torch.bfloat16 if self.use_cpu_amp else torch.float32
+ model = self.ipex_optimize_model(model, training, dtype=dtype)
+
+ if is_sagemaker_mp_enabled():
+ # Wrapping the base model twice in a DistributedModel will raise an error.
+ if isinstance(self.model_wrapped, smp.model.DistributedModel):
+ return self.model_wrapped
+ return smp.DistributedModel(model, backward_passes_per_step=self.args.gradient_accumulation_steps)
+
+ # already initialized its own DDP and AMP
+ if self.deepspeed:
+ return self.deepspeed
+
+ # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again
+ if unwrap_model(model) is not model:
+ return model
+
+ # Mixed precision training with apex (torch < 1.6)
+ if self.use_apex and training:
+ model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)
+
+ # Multi-gpu training (should be after apex fp16 initialization)
+ if self.args.n_gpu > 1:
+ model = nn.DataParallel(model)
+
+ if self.args.jit_mode_eval:
+ start_time = time.time()
+ model = self.torch_jit_model_eval(model, dataloader, training)
+ self.jit_compilation_time = round(time.time() - start_time, 4)
+
+ # Note: in torch.distributed mode, there's no point in wrapping the model
+ # inside a DistributedDataParallel as we'll be under `no_grad` anyways.
+ if not training:
+ return model
+
+ # Distributed training (should be after apex fp16 initialization)
+ if self.sharded_ddp is not None:
+ # Sharded DDP!
+ if self.sharded_ddp == ShardedDDPOption.SIMPLE:
+ model = ShardedDDP(model, self.optimizer)
+ else:
+ mixed_precision = self.args.fp16 or self.args.bf16
+ cpu_offload = ShardedDDPOption.OFFLOAD in self.args.sharded_ddp
+ zero_3 = self.sharded_ddp == ShardedDDPOption.ZERO_DP_3
+ # XXX: Breaking the self.model convention but I see no way around it for now.
+ if ShardedDDPOption.AUTO_WRAP in self.args.sharded_ddp:
+ model = auto_wrap(model)
+ self.model = model = FullyShardedDDP(
+ model,
+ mixed_precision=mixed_precision,
+ reshard_after_forward=zero_3,
+ cpu_offload=cpu_offload,
+ ).to(self.args.device)
+ # Distributed training using PyTorch FSDP
+ elif self.fsdp is not None:
+ if not self.args.fsdp_config["xla"]:
+ # PyTorch FSDP!
+ from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision
+ from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
+ from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy
+
+ if FSDPOption.OFFLOAD in self.args.fsdp:
+ cpu_offload = CPUOffload(offload_params=True)
+ else:
+ cpu_offload = CPUOffload(offload_params=False)
+
+ auto_wrap_policy = None
+
+ if FSDPOption.AUTO_WRAP in self.args.fsdp:
+ if self.args.fsdp_config["fsdp_min_num_params"] > 0:
+ auto_wrap_policy = functools.partial(
+ size_based_auto_wrap_policy, min_num_params=self.args.fsdp_config["fsdp_min_num_params"]
+ )
+ elif self.args.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None:
+ transformer_cls_to_wrap = set()
+ for layer_class in self.args.fsdp_config["fsdp_transformer_layer_cls_to_wrap"]:
+ transformer_cls = get_module_class_from_name(model, layer_class)
+ if transformer_cls is None:
+ raise Exception("Could not find the transformer layer class to wrap in the model.")
+ else:
+ transformer_cls_to_wrap.add(transformer_cls)
+ auto_wrap_policy = functools.partial(
+ transformer_auto_wrap_policy,
+ # Transformer layer class to wrap
+ transformer_layer_cls=transformer_cls_to_wrap,
+ )
+ mixed_precision_policy = None
+ dtype = None
+ if self.args.fp16:
+ dtype = torch.float16
+ elif self.args.bf16:
+ dtype = torch.bfloat16
+ if dtype is not None:
+ mixed_precision_policy = MixedPrecision(param_dtype=dtype, reduce_dtype=dtype, buffer_dtype=dtype)
+ if type(model) != FSDP:
+ # XXX: Breaking the self.model convention but I see no way around it for now.
+ self.model = model = FSDP(
+ model,
+ sharding_strategy=self.fsdp,
+ cpu_offload=cpu_offload,
+ auto_wrap_policy=auto_wrap_policy,
+ mixed_precision=mixed_precision_policy,
+ device_id=self.args.device,
+ backward_prefetch=self.backward_prefetch,
+ forward_prefetch=self.forword_prefetch,
+ limit_all_gathers=self.limit_all_gathers,
+ )
+ else:
+ try:
+ from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
+ from torch_xla.distributed.fsdp import checkpoint_module
+ from torch_xla.distributed.fsdp.wrap import (
+ size_based_auto_wrap_policy,
+ transformer_auto_wrap_policy,
+ )
+ except ImportError:
+ raise ImportError("Missing XLA FSDP related module; please make sure to use torch-xla >= 2.0.")
+ auto_wrap_policy = None
+ auto_wrapper_callable = None
+ if self.args.fsdp_config["fsdp_min_num_params"] > 0:
+ auto_wrap_policy = functools.partial(
+ size_based_auto_wrap_policy, min_num_params=self.args.fsdp_config["fsdp_min_num_params"]
+ )
+ elif self.args.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None:
+ transformer_cls_to_wrap = set()
+ for layer_class in self.args.fsdp_config["fsdp_transformer_layer_cls_to_wrap"]:
+ transformer_cls = get_module_class_from_name(model, layer_class)
+ if transformer_cls is None:
+ raise Exception("Could not find the transformer layer class to wrap in the model.")
+ else:
+ transformer_cls_to_wrap.add(transformer_cls)
+ auto_wrap_policy = functools.partial(
+ transformer_auto_wrap_policy,
+ # Transformer layer class to wrap
+ transformer_layer_cls=transformer_cls_to_wrap,
+ )
+ fsdp_kwargs = self.args.xla_fsdp_config
+ if self.args.fsdp_config["xla_fsdp_grad_ckpt"]:
+ # Apply gradient checkpointing to auto-wrapped sub-modules if specified
+ def auto_wrapper_callable(m, *args, **kwargs):
+ return FSDP(checkpoint_module(m), *args, **kwargs)
+
+ # Wrap the base model with an outer FSDP wrapper
+ self.model = model = FSDP(
+ model,
+ auto_wrap_policy=auto_wrap_policy,
+ auto_wrapper_callable=auto_wrapper_callable,
+ **fsdp_kwargs,
+ )
+
+ # Patch `xm.optimizer_step` so it does not reduce gradients in this case,
+ # as FSDP does not need gradient reduction over sharded parameters.
+ def patched_optimizer_step(optimizer, barrier=False, optimizer_args={}):
+ loss = optimizer.step(**optimizer_args)
+ if barrier:
+ xm.mark_step()
+ return loss
+
+ xm.optimizer_step = patched_optimizer_step
+ elif is_sagemaker_dp_enabled():
+ model = nn.parallel.DistributedDataParallel(
+ model, device_ids=[int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))]
+ )
+ elif self.args.local_rank != -1:
+ kwargs = {}
+ if self.args.ddp_find_unused_parameters is not None:
+ kwargs["find_unused_parameters"] = self.args.ddp_find_unused_parameters
+ elif isinstance(model, PreTrainedModel):
+ # find_unused_parameters breaks checkpointing as per
+ # https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021
+ kwargs["find_unused_parameters"] = not model.is_gradient_checkpointing
+ else:
+ kwargs["find_unused_parameters"] = True
+
+ if self.args.ddp_bucket_cap_mb is not None:
+ kwargs["bucket_cap_mb"] = self.args.ddp_bucket_cap_mb
+ if is_torch_neuroncore_available():
+ return model
+ model = nn.parallel.DistributedDataParallel(
+ model,
+ device_ids=[self.args.local_rank] if self.args._n_gpu != 0 else None,
+ output_device=self.args.local_rank if self.args._n_gpu != 0 else None,
+ **kwargs,
+ )
+
+ return model
+
+ def train(
+ self,
+ resume_from_checkpoint: Optional[Union[str, bool]] = None,
+ trial: Union["optuna.Trial", Dict[str, Any]] = None,
+ ignore_keys_for_eval: Optional[List[str]] = None,
+ **kwargs,
+ ):
+ """
+ Main training entry point.
+
+ Args:
+ resume_from_checkpoint (`str` or `bool`, *optional*):
+ If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a
+ `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance
+ of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here.
+ trial (`optuna.Trial` or `Dict[str, Any]`, *optional*):
+ The trial run or the hyperparameter dictionary for hyperparameter search.
+ ignore_keys_for_eval (`List[str]`, *optional*):
+ A list of keys in the output of your model (if it is a dictionary) that should be ignored when
+ gathering predictions for evaluation during the training.
+ kwargs:
+ Additional keyword arguments used to hide deprecated arguments.
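+
+ Example (a minimal usage sketch; the checkpoint path is illustrative):
+
+ ```python
+ # Resume from the most recent checkpoint found in `args.output_dir`.
+ trainer.train(resume_from_checkpoint=True)
+
+ # Or resume from an explicit checkpoint directory.
+ trainer.train(resume_from_checkpoint="output/checkpoint-500")
+ ```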
+ """
+ if resume_from_checkpoint is False:
+ resume_from_checkpoint = None
+
+ # memory metrics - must set up as early as possible
+ self._memory_tracker.start()
+
+ args = self.args
+
+ self.is_in_train = True
+
+ # do_train is not a reliable argument, as it might not be set and .train() still called, so
+ # the following is a workaround:
+ if (args.fp16_full_eval or args.bf16_full_eval) and not args.do_train:
+ self._move_model_to_device(self.model, args.device)
+
+ if "model_path" in kwargs:
+ resume_from_checkpoint = kwargs.pop("model_path")
+ warnings.warn(
+ "`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` "
+ "instead.",
+ FutureWarning,
+ )
+ if len(kwargs) > 0:
+ raise TypeError(f"train() received got unexpected keyword arguments: {', '.join(list(kwargs.keys()))}.")
+ # This might change the seed so needs to run first.
+ self._hp_search_setup(trial)
+ self._train_batch_size = self.args.train_batch_size
+
+ # Model re-init
+ model_reloaded = False
+ if self.model_init is not None:
+ # Seed must be set before instantiating the model when using model_init.
+ enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
+ self.model = self.call_model_init(trial)
+ model_reloaded = True
+ # Reinitializes optimizer and scheduler
+ self.optimizer, self.lr_scheduler = None, None
+
+ # Load potential model checkpoint
+ if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint:
+ resume_from_checkpoint = get_last_checkpoint(args.output_dir)
+ if resume_from_checkpoint is None:
+ raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})")
+
+ if resume_from_checkpoint is not None and not is_sagemaker_mp_enabled() and args.deepspeed is None:
+ self._load_from_checkpoint(resume_from_checkpoint)
+
+ # If model was re-initialized, put it on the right device and update self.model_wrapped
+ if model_reloaded:
+ if self.place_model_on_device:
+ self._move_model_to_device(self.model, args.device)
+ self.model_wrapped = self.model
+
+ inner_training_loop = find_executable_batch_size(
+ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
+ )
+ return inner_training_loop(
+ args=args,
+ resume_from_checkpoint=resume_from_checkpoint,
+ trial=trial,
+ ignore_keys_for_eval=ignore_keys_for_eval,
+ )
+
+ def _inner_training_loop(
+ self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
+ ):
+ self._train_batch_size = batch_size
+ # Data loader and number of training steps
+ train_dataloader = self.get_train_dataloader()
+
+ # Setting up training control variables:
+ # number of training epochs: num_train_epochs
+ # number of training steps per epoch: num_update_steps_per_epoch
+ # total number of training steps to execute: max_steps
+ total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size
+
+ len_dataloader = None
+ if has_length(train_dataloader):
+ len_dataloader = len(train_dataloader)
+ num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps
+ num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1)
+ num_examples = self.num_examples(train_dataloader)
+ if args.max_steps > 0:
+ max_steps = args.max_steps
+ num_train_epochs = args.max_steps // num_update_steps_per_epoch + int(
+ args.max_steps % num_update_steps_per_epoch > 0
+ )
+ # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's
+ # the best we can do.
+ num_train_samples = args.max_steps * total_train_batch_size
+ else:
+ max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)
+ num_train_epochs = math.ceil(args.num_train_epochs)
+ num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs
+ elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size
+ max_steps = args.max_steps
+ # Setting a very large number of epochs so we go as many times as necessary over the iterator.
+ num_train_epochs = sys.maxsize
+ num_update_steps_per_epoch = max_steps
+ num_examples = total_train_batch_size * args.max_steps
+ num_train_samples = args.max_steps * total_train_batch_size
+ else:
+ raise ValueError(
+ "args.max_steps must be set to a positive value if dataloader does not have a length, was"
+ f" {args.max_steps}"
+ )
+
+ if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
+ if self.args.n_gpu > 1:
+ # nn.DataParallel(model) replicates the model, creating new variables, and the module
+ # references registered here no longer work on the other gpus, breaking the module.
+ raise ValueError(
+ "Currently --debug underflow_overflow is not supported under DP. Please use DDP"
+ " (torch.distributed.launch)."
+ )
+ else:
+ debug_overflow = DebugUnderflowOverflow(self.model) # noqa
+
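+ # For sharded DDP (other than the simple mode), SageMaker model parallelism and FSDP, the optimizer has to be
+ # created after the model is wrapped/sharded, so its creation is delayed until after `_wrap_model` runs below.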
+ delay_optimizer_creation = (
+ self.sharded_ddp is not None
+ and self.sharded_ddp != ShardedDDPOption.SIMPLE
+ or is_sagemaker_mp_enabled()
+ or self.fsdp is not None
+ )
+ if args.deepspeed:
+ deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
+ self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint
+ )
+ self.model = deepspeed_engine.module
+ self.model_wrapped = deepspeed_engine
+ self.deepspeed = deepspeed_engine
+ self.optimizer = optimizer
+ self.lr_scheduler = lr_scheduler
+ elif not delay_optimizer_creation:
+ self.create_optimizer_and_scheduler(num_training_steps=max_steps)
+
+ self.state = TrainerState()
+ self.state.is_hyper_param_search = trial is not None
+
+ # Activate gradient checkpointing if needed
+ if args.gradient_checkpointing:
+ self.model.gradient_checkpointing_enable()
+
+ model = self._wrap_model(self.model_wrapped)
+
+ if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None:
+ self._load_from_checkpoint(resume_from_checkpoint, model)
+
+ # for the rest of this function `model` is the outside model, whether it was wrapped or not
+ if model is not self.model:
+ self.model_wrapped = model
+
+ if delay_optimizer_creation:
+ self.create_optimizer_and_scheduler(num_training_steps=max_steps)
+
+ # Check if saved optimizer or scheduler states exist
+ self._load_optimizer_and_scheduler(resume_from_checkpoint)
+
+ # important: at this point:
+ # self.model is the Transformers Model
+ # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc.
+
+ # Train!
+ logger.info("***** Running training *****")
+ logger.info(f" Num examples = {num_examples}")
+ logger.info(f" Num Epochs = {num_train_epochs}")
+ logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Total optimization steps = {max_steps}")
+ logger.info(
+ f" Number of trainable parameters = {sum(p.numel() for p in model.parameters() if p.requires_grad)}"
+ )
+
+ self.state.epoch = 0
+ start_time = time.time()
+ epochs_trained = 0
+ steps_trained_in_current_epoch = 0
+ steps_trained_progress_bar = None
+
+ # Check if continuing training from a checkpoint
+ if resume_from_checkpoint is not None and os.path.isfile(
+ os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)
+ ):
+ self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
+ epochs_trained = self.state.global_step // num_update_steps_per_epoch
+ if not args.ignore_data_skip:
+ steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
+ steps_trained_in_current_epoch *= args.gradient_accumulation_steps
+ else:
+ steps_trained_in_current_epoch = 0
+
+ logger.info(" Continuing training from checkpoint, will skip to saved global_step")
+ logger.info(f" Continuing training from epoch {epochs_trained}")
+ logger.info(f" Continuing training from global step {self.state.global_step}")
+ if not args.ignore_data_skip:
+ if skip_first_batches is None:
+ logger.info(
+ f" Will skip the first {epochs_trained} epochs then the first"
+ f" {steps_trained_in_current_epoch} batches in the first epoch. If this takes a lot of time,"
+ " you can install the latest version of Accelerate with `pip install -U accelerate`.You can"
+ " also add the `--ignore_data_skip` flag to your launch command, but you will resume the"
+ " training on data already seen by your model."
+ )
+ else:
+ logger.info(
+ f" Will skip the first {epochs_trained} epochs then the first"
+ f" {steps_trained_in_current_epoch} batches in the first epoch."
+ )
+ if self.is_local_process_zero() and not args.disable_tqdm and skip_first_batches is None:
+ steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch)
+ steps_trained_progress_bar.set_description("Skipping the first batches")
+
+ # Update the references
+ self.callback_handler.model = self.model
+ self.callback_handler.optimizer = self.optimizer
+ self.callback_handler.lr_scheduler = self.lr_scheduler
+ self.callback_handler.train_dataloader = train_dataloader
+ if self.hp_name is not None and self._trial is not None:
+ # use self._trial because the SigOpt/Optuna HPO only calls `_hp_search_setup(trial)` instead of passing the
+ # trial parameter to `train` when using DDP.
+ self.state.trial_name = self.hp_name(self._trial)
+ if trial is not None:
+ assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial
+ self.state.trial_params = hp_params(assignments)
+ else:
+ self.state.trial_params = None
+ # This should be the same if the state has been saved but in case the training arguments changed, it's safer
+ # to set this after the load.
+ self.state.max_steps = max_steps
+ self.state.num_train_epochs = num_train_epochs
+ self.state.is_local_process_zero = self.is_local_process_zero()
+ self.state.is_world_process_zero = self.is_world_process_zero()
+
+ # tr_loss is a tensor to avoid synchronization of TPUs through .item()
+ tr_loss = torch.tensor(0.0).to(args.device)
+ # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses
+ self._total_loss_scalar = 0.0
+ self._globalstep_last_logged = self.state.global_step
+ model.zero_grad()
+
+ self.control = self.callback_handler.on_train_begin(args, self.state, self.control)
+
+ # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
+ if not args.ignore_data_skip:
+ for epoch in range(epochs_trained):
+ is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance(
+ train_dataloader.sampler, RandomSampler
+ )
+ if is_torch_less_than_1_11 or not is_random_sampler:
+ # We just need to begin an iteration to create the randomization of the sampler.
+ # That was enough before PyTorch 1.11, however...
+ for _ in train_dataloader:
+ break
+ else:
+ # Otherwise we need to call the whooooole sampler cause there is some random operation added
+ # AT THE VERY END!
+ _ = list(train_dataloader.sampler)
+
+ total_batched_samples = 0
+ for epoch in range(epochs_trained, num_train_epochs):
+ if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
+ train_dataloader.sampler.set_epoch(epoch)
+ elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard):
+ train_dataloader.dataset.set_epoch(epoch)
+
+ if is_torch_tpu_available():
+ parallel_loader = pl.ParallelLoader(train_dataloader, [args.device]).per_device_loader(args.device)
+ epoch_iterator = parallel_loader
+ else:
+ epoch_iterator = train_dataloader
+
+ # Reset the past mems state at the beginning of each epoch if necessary.
+ if args.past_index >= 0:
+ self._past = None
+
+ steps_in_epoch = (
+ len(epoch_iterator)
+ if len_dataloader is not None
+ else args.max_steps * args.gradient_accumulation_steps
+ )
+ self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
+
+ if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0:
+ self._load_rng_state(resume_from_checkpoint)
+
+ rng_to_sync = False
+ steps_skipped = 0
+ if skip_first_batches is not None and steps_trained_in_current_epoch > 0:
+ epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch)
+ steps_skipped = steps_trained_in_current_epoch
+ steps_trained_in_current_epoch = 0
+ rng_to_sync = True
+
+ step = -1
+ for step, inputs in enumerate(epoch_iterator):
+ total_batched_samples += 1
+ if rng_to_sync:
+ self._load_rng_state(resume_from_checkpoint)
+ rng_to_sync = False
+
+ # Skip past any already trained steps if resuming training
+ if steps_trained_in_current_epoch > 0:
+ steps_trained_in_current_epoch -= 1
+ if steps_trained_progress_bar is not None:
+ steps_trained_progress_bar.update(1)
+ if steps_trained_in_current_epoch == 0:
+ self._load_rng_state(resume_from_checkpoint)
+ continue
+ elif steps_trained_progress_bar is not None:
+ steps_trained_progress_bar.close()
+ steps_trained_progress_bar = None
+
+ if step % args.gradient_accumulation_steps == 0:
+ self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
+
+ if (
+ (total_batched_samples % args.gradient_accumulation_steps != 0)
+ and args.local_rank != -1
+ and args._no_sync_in_gradient_accumulation
+ ):
+ # Avoid unnecessary DDP synchronization since there will be no backward pass on this example.
+ with model.no_sync():
+ tr_loss_step = self.training_step(model, inputs)
+ else:
+ tr_loss_step = self.training_step(model, inputs)
+
+ if (
+ args.logging_nan_inf_filter
+ and not is_torch_tpu_available()
+ and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
+ ):
+ # if loss is nan or inf simply add the average of previous logged losses
+ tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
+ else:
+ tr_loss += tr_loss_step
+
+ self.current_flos += float(self.floating_point_ops(inputs))
+
+ # Optimizer step for deepspeed must be called on every step regardless of the value of gradient_accumulation_steps
+ if self.deepspeed:
+ self.deepspeed.step()
+
+ if total_batched_samples % args.gradient_accumulation_steps == 0 or (
+ # last step in epoch but step is always smaller than gradient_accumulation_steps
+ steps_in_epoch <= args.gradient_accumulation_steps
+ and (step + 1) == steps_in_epoch
+ ):
+ # Gradient clipping
+ if args.max_grad_norm is not None and args.max_grad_norm > 0 and not self.deepspeed:
+ # deepspeed does its own clipping
+
+ if self.do_grad_scaling:
+ # Reduce gradients first for XLA
+ if is_torch_tpu_available():
+ gradients = xm._fetch_gradients(self.optimizer)
+ xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size())
+ # AMP: gradients need unscaling
+ self.scaler.unscale_(self.optimizer)
+
+ if is_sagemaker_mp_enabled() and args.fp16:
+ self.optimizer.clip_master_grads(args.max_grad_norm)
+ elif hasattr(self.optimizer, "clip_grad_norm"):
+ # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping
+ self.optimizer.clip_grad_norm(args.max_grad_norm)
+ elif hasattr(model, "clip_grad_norm_"):
+ # Some models (like FullyShardedDDP) have a specific way to do gradient clipping
+ model.clip_grad_norm_(args.max_grad_norm)
+ else:
+ # Revert to normal clipping otherwise, handling Apex or full precision
+ nn.utils.clip_grad_norm_(
+ amp.master_params(self.optimizer) if self.use_apex else model.parameters(),
+ args.max_grad_norm,
+ )
+
+ # Optimizer step
+ optimizer_was_run = True
+ if self.deepspeed:
+ pass # called outside the loop
+ elif is_torch_tpu_available():
+ if self.do_grad_scaling:
+ self.scaler.step(self.optimizer)
+ self.scaler.update()
+ else:
+ xm.optimizer_step(self.optimizer)
+ elif self.do_grad_scaling:
+ scale_before = self.scaler.get_scale()
+ self.scaler.step(self.optimizer)
+ self.scaler.update()
+ scale_after = self.scaler.get_scale()
+ optimizer_was_run = scale_before <= scale_after
+ else:
+ self.optimizer.step()
+
+ if optimizer_was_run and not self.deepspeed:
+ self.lr_scheduler.step()
+
+ model.zero_grad()
+ self.state.global_step += 1
+ self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch
+ self.control = self.callback_handler.on_step_end(args, self.state, self.control)
+
+ self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
+ else:
+ self.control = self.callback_handler.on_substep_end(args, self.state, self.control)
+
+ if self.control.should_epoch_stop or self.control.should_training_stop:
+ break
+ if step < 0:
+ logger.warning(
+ "There seems to be not a single sample in your epoch_iterator, stopping training at step"
+ f" {self.state.global_step}! This is expected if you're using an IterableDataset and set"
+ f" num_steps ({max_steps}) higher than the number of available samples."
+ )
+ self.control.should_training_stop = True
+
+ self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
+ self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
+
+ if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
+ if is_torch_tpu_available():
+ # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
+ xm.master_print(met.metrics_report())
+ else:
+ logger.warning(
+ "You enabled PyTorch/XLA debug metrics but you don't have a TPU "
+ "configured. Check your training configuration if this is unexpected."
+ )
+ if self.control.should_training_stop:
+ break
+
+ if args.past_index and hasattr(self, "_past"):
+ # Clean the state at the end of training
+ delattr(self, "_past")
+
+ logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
+ if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
+ # Wait for everyone to get here so we are sure the model has been saved by process 0.
+ if is_torch_tpu_available():
+ xm.rendezvous("load_best_model_at_end")
+ elif args.local_rank != -1:
+ dist.barrier()
+ elif is_sagemaker_mp_enabled():
+ smp.barrier()
+
+ self._load_best_model()
+
+ # add remaining tr_loss
+ self._total_loss_scalar += tr_loss.item()
+ train_loss = self._total_loss_scalar / self.state.global_step
+
+ metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps)
+ self.store_flos()
+ metrics["total_flos"] = self.state.total_flos
+ metrics["train_loss"] = train_loss
+
+ self.is_in_train = False
+
+ self._memory_tracker.stop_and_update_metrics(metrics)
+
+ self.log(metrics)
+
+ run_dir = self._get_output_dir(trial)
+ checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)
+
+ # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save.
+ if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
+ for checkpoint in checkpoints_sorted:
+ if checkpoint != self.state.best_model_checkpoint:
+ logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
+ shutil.rmtree(checkpoint)
+
+ self.control = self.callback_handler.on_train_end(args, self.state, self.control)
+
+ return TrainOutput(self.state.global_step, train_loss, metrics)
+
+ def _get_output_dir(self, trial):
+ if self.hp_search_backend is not None and trial is not None:
+ if self.hp_search_backend == HPSearchBackend.OPTUNA:
+ run_id = trial.number
+ elif self.hp_search_backend == HPSearchBackend.RAY:
+ from ray import tune
+
+ run_id = tune.get_trial_id()
+ elif self.hp_search_backend == HPSearchBackend.SIGOPT:
+ run_id = trial.id
+ elif self.hp_search_backend == HPSearchBackend.WANDB:
+ import wandb
+
+ run_id = wandb.run.id
+ run_name = self.hp_name(trial) if self.hp_name is not None else f"run-{run_id}"
+ run_dir = os.path.join(self.args.output_dir, run_name)
+ else:
+ run_dir = self.args.output_dir
+ return run_dir
+
+ def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
+ if model is None:
+ model = self.model
+
+ if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)) and not os.path.isfile(
+ os.path.join(resume_from_checkpoint, WEIGHTS_INDEX_NAME)
+ ):
+ raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")
+
+ logger.info(f"Loading model from {resume_from_checkpoint}.")
+
+ if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)):
+ config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME))
+ checkpoint_version = config.transformers_version
+ if checkpoint_version is not None and checkpoint_version != __version__:
+ logger.warning(
+ f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
+ f"Transformers but your current version is {__version__}. This is not recommended and could "
+ "yield to errors or unwanted behaviors."
+ )
+
+ if os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)):
+ # If the model is on the GPU, it still works!
+ if is_sagemaker_mp_enabled():
+ if os.path.isfile(os.path.join(resume_from_checkpoint, "user_content.pt")):
+ # If the 'user_content.pt' file exists, load with the new smp api.
+ # Checkpoint must have been saved with the new smp api.
+ smp.resume_from_checkpoint(
+ path=resume_from_checkpoint, tag=WEIGHTS_NAME, partial=False, load_optimizer=False
+ )
+ else:
+ # If the 'user_content.pt' file does NOT exist, load with the old smp api.
+ # Checkpoint must have been saved with the old smp api.
+ if hasattr(self.args, "fp16") and self.args.fp16 is True:
+ logger.warning(
+ "Enabling FP16 and loading from smp < 1.10 checkpoint together is not suppported."
+ )
+ state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu")
+ # Required for smp to not auto-translate state_dict from hf to smp (is already smp).
+ state_dict["_smp_is_partial"] = False
+ load_result = model.load_state_dict(state_dict, strict=True)
+ # release memory
+ del state_dict
+ else:
+ # We load the model state dict on the CPU to avoid an OOM error.
+ state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu")
+ # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
+ # which takes *args instead of **kwargs
+ load_result = model.load_state_dict(state_dict, False)
+ # release memory
+ del state_dict
+ self._issue_warnings_after_load(load_result)
+ else:
+ # We load the sharded checkpoint
+ load_result = load_sharded_checkpoint(model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled())
+ if not is_sagemaker_mp_enabled():
+ self._issue_warnings_after_load(load_result)
+
+ def _load_best_model(self):
+ logger.info(f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric}).")
+ best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)
+ model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
+ if os.path.exists(best_model_path):
+ if self.deepspeed:
+ if self.model_wrapped is not None:
+ # this removes the pre-hooks from the previous engine
+ self.model_wrapped.destroy()
+ self.model_wrapped = None
+
+ # temp hack until Deepspeed fixes the problem with resume from an existing engine that did some stepping
+ deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
+ self,
+ num_training_steps=self.args.max_steps,
+ resume_from_checkpoint=self.state.best_model_checkpoint,
+ )
+ self.model = deepspeed_engine.module
+ self.model_wrapped = deepspeed_engine
+ self.deepspeed = deepspeed_engine
+ self.optimizer = optimizer
+ self.lr_scheduler = lr_scheduler
+ else:
+ if is_sagemaker_mp_enabled():
+ if os.path.isfile(os.path.join(self.state.best_model_checkpoint, "user_content.pt")):
+ # If the 'user_content.pt' file exists, load with the new smp api.
+ # Checkpoint must have been saved with the new smp api.
+ smp.resume_from_checkpoint(
+ path=self.state.best_model_checkpoint,
+ tag=WEIGHTS_NAME,
+ partial=False,
+ load_optimizer=False,
+ )
+ else:
+ # If the 'user_content.pt' file does NOT exist, load with the old smp api.
+ # Checkpoint must have been saved with the old smp api.
+ state_dict = torch.load(best_model_path, map_location="cpu")
+ state_dict["_smp_is_partial"] = False
+ load_result = model.load_state_dict(state_dict, strict=True)
+ else:
+ # We load the model state dict on the CPU to avoid an OOM error.
+ state_dict = torch.load(best_model_path, map_location="cpu")
+ # If the model is on the GPU, it still works!
+ # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
+ # which takes *args instead of **kwargs
+ load_result = model.load_state_dict(state_dict, False)
+ if not is_sagemaker_mp_enabled():
+ self._issue_warnings_after_load(load_result)
+ elif os.path.exists(os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)):
+ load_result = load_sharded_checkpoint(
+ model, self.state.best_model_checkpoint, strict=is_sagemaker_mp_enabled()
+ )
+ if not is_sagemaker_mp_enabled():
+ self._issue_warnings_after_load(load_result)
+ else:
+ logger.warning(
+ f"Could not locate the best model at {best_model_path}, if you are running a distributed training "
+ "on multiple nodes, you should activate `--save_on_each_node`."
+ )
+
+ def _issue_warnings_after_load(self, load_result):
+ if len(load_result.missing_keys) != 0:
+ if self.model._keys_to_ignore_on_save is not None and set(load_result.missing_keys) == set(
+ self.model._keys_to_ignore_on_save
+ ):
+ self.model.tie_weights()
+ else:
+ logger.warning(f"There were missing keys in the checkpoint model loaded: {load_result.missing_keys}.")
+ if len(load_result.unexpected_keys) != 0:
+ logger.warning(
+ f"There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}."
+ )
+
+ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval):
+ if self.control.should_log:
+ if is_torch_tpu_available():
+ xm.mark_step()
+
+ logs: Dict[str, float] = {}
+
+ # all_gather + mean() to get average loss over all processes
+ tr_loss_scalar = self._nested_gather(tr_loss).mean().item()
+
+ # reset tr_loss to zero
+ tr_loss -= tr_loss
+
+ logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4)
+ logs["learning_rate"] = self._get_learning_rate()
+
+ self._total_loss_scalar += tr_loss_scalar
+ self._globalstep_last_logged = self.state.global_step
+ self.store_flos()
+
+ self.log(logs)
+
+ metrics = None
+ if self.control.should_evaluate:
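+ # When `eval_dataset` is a dict of datasets (e.g. a hypothetical {"squad": squad_ds, "mnli": mnli_ds}), each
+ # entry is evaluated separately and its metrics are prefixed with its name (eval_squad_*, eval_mnli_*, ...).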
+ if isinstance(self.eval_dataset, dict):
+ for eval_dataset_name, eval_dataset in self.eval_dataset.items():
+ metrics = self.evaluate(
+ eval_dataset=eval_dataset,
+ ignore_keys=ignore_keys_for_eval,
+ metric_key_prefix=f"eval_{eval_dataset_name}",
+ )
+ else:
+ metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
+ self._report_to_hp_search(trial, self.state.global_step, metrics)
+
+ if self.control.should_save:
+ self._save_checkpoint(model, trial, metrics=metrics)
+ self.control = self.callback_handler.on_save(self.args, self.state, self.control)
+
+ def _load_rng_state(self, checkpoint):
+ # Load RNG states from `checkpoint`
+ if checkpoint is None:
+ return
+
+ if self.args.world_size > 1:
+ process_index = self.args.process_index
+ rng_file = os.path.join(checkpoint, f"rng_state_{process_index}.pth")
+ if not os.path.isfile(rng_file):
+ logger.info(
+ f"Didn't find an RNG file for process {process_index}, if you are resuming a training that "
+ "wasn't launched in a distributed fashion, reproducibility is not guaranteed."
+ )
+ return
+ else:
+ rng_file = os.path.join(checkpoint, "rng_state.pth")
+ if not os.path.isfile(rng_file):
+ logger.info(
+ "Didn't find an RNG file, if you are resuming a training that was launched in a distributed "
+ "fashion, reproducibility is not guaranteed."
+ )
+ return
+
+ checkpoint_rng_state = torch.load(rng_file)
+ random.setstate(checkpoint_rng_state["python"])
+ np.random.set_state(checkpoint_rng_state["numpy"])
+ torch.random.set_rng_state(checkpoint_rng_state["cpu"])
+ if torch.cuda.is_available():
+ if self.args.local_rank != -1:
+ torch.cuda.random.set_rng_state(checkpoint_rng_state["cuda"])
+ else:
+ try:
+ torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"])
+ except Exception as e:
+ logger.info(
+ f"Didn't manage to set back the RNG states of the GPU because of the following error:\n {e}"
+ "\nThis won't yield the same results as if the training had not been interrupted."
+ )
+ if is_torch_tpu_available():
+ xm.set_rng_state(checkpoint_rng_state["xla"])
+
+ def _save_checkpoint(self, model, trial, metrics=None):
+ # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we
+ # want to save except FullyShardedDDP.
+ # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"
+
+ # Save model checkpoint
+ checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
+
+ if self.hp_search_backend is None and trial is None:
+ self.store_flos()
+
+ run_dir = self._get_output_dir(trial=trial)
+ output_dir = os.path.join(run_dir, checkpoint_folder)
+ self.save_model(output_dir, _internal_call=True)
+ if self.deepspeed:
+ # under zero3 the model file itself doesn't get saved since it's bogus, unless the deepspeed
+ # config `stage3_gather_16bit_weights_on_model_save` is True
+ self.deepspeed.save_checkpoint(output_dir)
+
+ # Save optimizer and scheduler
+ if self.sharded_ddp == ShardedDDPOption.SIMPLE:
+ self.optimizer.consolidate_state_dict()
+
+ if is_torch_tpu_available():
+ xm.rendezvous("saving_optimizer_states")
+ xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
+ with warnings.catch_warnings(record=True) as caught_warnings:
+ xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
+ reissue_pt_warnings(caught_warnings)
+ elif is_sagemaker_mp_enabled():
+ opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False)
+ smp.barrier()
+ if smp.rdp_rank() == 0 or smp.state.cfg.shard_optimizer_state:
+ smp.save(
+ opt_state_dict,
+ os.path.join(output_dir, OPTIMIZER_NAME),
+ partial=True,
+ v3=smp.state.cfg.shard_optimizer_state,
+ )
+ if self.args.should_save:
+ with warnings.catch_warnings(record=True) as caught_warnings:
+ torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
+ reissue_pt_warnings(caught_warnings)
+ if self.do_grad_scaling:
+ torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
+ elif self.args.should_save and not self.deepspeed:
+ # deepspeed.save_checkpoint above saves model/optim/sched
+ torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
+ with warnings.catch_warnings(record=True) as caught_warnings:
+ torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
+ reissue_pt_warnings(caught_warnings)
+ if self.do_grad_scaling:
+ torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
+
+ # Determine the new best metric / best model checkpoint
+ if metrics is not None and self.args.metric_for_best_model is not None:
+ metric_to_check = self.args.metric_for_best_model
+ if not metric_to_check.startswith("eval_"):
+ metric_to_check = f"eval_{metric_to_check}"
+ metric_value = metrics[metric_to_check]
+
+ operator = np.greater if self.args.greater_is_better else np.less
+ if (
+ self.state.best_metric is None
+ or self.state.best_model_checkpoint is None
+ or operator(metric_value, self.state.best_metric)
+ ):
+ self.state.best_metric = metric_value
+ self.state.best_model_checkpoint = output_dir
+
+ # Save the Trainer state
+ if self.args.should_save:
+ self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
+
+ # Save RNG state in non-distributed training
+ rng_states = {
+ "python": random.getstate(),
+ "numpy": np.random.get_state(),
+ "cpu": torch.random.get_rng_state(),
+ }
+ if torch.cuda.is_available():
+ if self.args.local_rank == -1:
+ # In non distributed, we save the global CUDA RNG state (will take care of DataParallel)
+ rng_states["cuda"] = torch.cuda.random.get_rng_state_all()
+ else:
+ rng_states["cuda"] = torch.cuda.random.get_rng_state()
+
+ if is_torch_tpu_available():
+ rng_states["xla"] = xm.get_rng_state()
+
+ # A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may
+ # not yet exist.
+ os.makedirs(output_dir, exist_ok=True)
+
+ if self.args.world_size <= 1:
+ torch.save(rng_states, os.path.join(output_dir, "rng_state.pth"))
+ else:
+ torch.save(rng_states, os.path.join(output_dir, f"rng_state_{self.args.process_index}.pth"))
+
+ if self.args.push_to_hub:
+ self._push_from_checkpoint(output_dir)
+
+ # Maybe delete some older checkpoints.
+ if self.args.should_save:
+ self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
+
+ def _load_optimizer_and_scheduler(self, checkpoint):
+ """If optimizer and scheduler states exist, load them."""
+ if checkpoint is None:
+ return
+
+ if self.deepspeed:
+ # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init
+ return
+
+ checkpoint_file_exists = (
+ glob.glob(os.path.join(checkpoint, OPTIMIZER_NAME) + "_*")
+ if is_sagemaker_mp_enabled()
+ else os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME))
+ )
+ if checkpoint_file_exists and os.path.isfile(os.path.join(checkpoint, SCHEDULER_NAME)):
+ # Load in optimizer and scheduler states
+ if is_torch_tpu_available():
+ # On TPU we have to take some extra precautions to properly load the states on the right device.
+ optimizer_state = torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location="cpu")
+ with warnings.catch_warnings(record=True) as caught_warnings:
+ lr_scheduler_state = torch.load(os.path.join(checkpoint, SCHEDULER_NAME), map_location="cpu")
+ reissue_pt_warnings(caught_warnings)
+
+ xm.send_cpu_data_to_device(optimizer_state, self.args.device)
+ xm.send_cpu_data_to_device(lr_scheduler_state, self.args.device)
+
+ self.optimizer.load_state_dict(optimizer_state)
+ self.lr_scheduler.load_state_dict(lr_scheduler_state)
+ else:
+ map_location = "cpu" if is_sagemaker_mp_enabled() else self.args.device
+ if is_sagemaker_mp_enabled():
+ if os.path.isfile(os.path.join(checkpoint, "user_content.pt")):
+ # Optimizer checkpoint was saved with smp >= 1.10
+ def opt_load_hook(mod, opt):
+ opt.load_state_dict(smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True))
+
+ else:
+ # Optimizer checkpoint was saved with smp < 1.10
+ def opt_load_hook(mod, opt):
+ if IS_SAGEMAKER_MP_POST_1_10:
+ opt.load_state_dict(
+ smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True, back_compat=True)
+ )
+ else:
+ opt.load_state_dict(smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True))
+
+ self.model_wrapped.register_post_step_hook(opt_load_hook)
+ else:
+ self.optimizer.load_state_dict(
+ torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
+ )
+ with warnings.catch_warnings(record=True) as caught_warnings:
+ self.lr_scheduler.load_state_dict(torch.load(os.path.join(checkpoint, SCHEDULER_NAME)))
+ reissue_pt_warnings(caught_warnings)
+ if self.do_grad_scaling and os.path.isfile(os.path.join(checkpoint, SCALER_NAME)):
+ self.scaler.load_state_dict(torch.load(os.path.join(checkpoint, SCALER_NAME)))
+
+ def hyperparameter_search(
+ self,
+ hp_space: Optional[Callable[["optuna.Trial"], Dict[str, float]]] = None,
+ compute_objective: Optional[Callable[[Dict[str, float]], float]] = None,
+ n_trials: int = 20,
+ direction: str = "minimize",
+ backend: Optional[Union["str", HPSearchBackend]] = None,
+ hp_name: Optional[Callable[["optuna.Trial"], str]] = None,
+ **kwargs,
+ ) -> BestRun:
+ """
+ Launch a hyperparameter search using `optuna`, `Ray Tune`, `SigOpt` or `W&B`. The optimized quantity is
+ determined by `compute_objective`, which defaults to a function returning the evaluation loss when no metric
+ is provided, and the sum of all metrics otherwise.
+
+
+
+ To use this method, you need to have provided a `model_init` when initializing your [`Trainer`]: we need to
+ reinitialize the model at each new run. This is incompatible with the `optimizers` argument, so you need to
+ subclass [`Trainer`] and override the method [`~Trainer.create_optimizer_and_scheduler`] for custom
+ optimizer/scheduler.
+
+
+
+ Args:
+ hp_space (`Callable[["optuna.Trial"], Dict[str, float]]`, *optional*):
+ A function that defines the hyperparameter search space. Will default to
+ [`~trainer_utils.default_hp_space_optuna`] or [`~trainer_utils.default_hp_space_ray`] or
+ [`~trainer_utils.default_hp_space_sigopt`] depending on your backend.
+ compute_objective (`Callable[[Dict[str, float]], float]`, *optional*):
+ A function computing the objective to minimize or maximize from the metrics returned by the `evaluate`
+ method. Will default to [`~trainer_utils.default_compute_objective`].
+ n_trials (`int`, *optional*, defaults to 20):
+ The number of trial runs to test.
+ direction (`str`, *optional*, defaults to `"minimize"`):
+ Whether to optimize for a greater or lower objective. Can be `"minimize"` or `"maximize"`; you should pick
+ `"minimize"` when optimizing the validation loss and `"maximize"` when optimizing one or several metrics.
+ backend (`str` or [`~trainer_utils.HPSearchBackend`], *optional*):
+ The backend to use for hyperparameter search. Will default to optuna or Ray Tune or SigOpt, depending
+ on which one is installed. If all are installed, will default to optuna.
+ hp_name (`Callable[["optuna.Trial"], str]`, *optional*):
+ A function that defines the trial/run name. Will default to None.
+ kwargs (`Dict[str, Any]`, *optional*):
+ Additional keyword arguments passed along to `optuna.create_study` or `ray.tune.run`. For more
+ information see:
+
+ - the documentation of
+ [optuna.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html)
+ - the documentation of [tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run)
+ - the documentation of [sigopt](https://app.sigopt.com/docs/endpoints/experiments/create)
+
+ Returns:
+ [`trainer_utils.BestRun`]: All the information about the best run. Experiment summary can be found in
+ `run_summary` attribute for Ray backend.
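+
+ Example (an illustrative sketch, not part of this repo; it assumes `trainer` was built with a `model_init`
+ and that `optuna` is installed):
+
+ ```python
+ def optuna_hp_space(trial):
+     return {
+         "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
+         "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64]),
+     }
+
+ best_run = trainer.hyperparameter_search(
+     direction="minimize",
+     backend="optuna",
+     hp_space=optuna_hp_space,
+     n_trials=20,
+ )
+ ```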
+ """
+ if backend is None:
+ backend = default_hp_search_backend()
+ if backend is None:
+ raise RuntimeError(
+ "At least one of optuna or ray should be installed. "
+ "To install optuna run `pip install optuna`. "
+ "To install ray run `pip install ray[tune]`. "
+ "To install sigopt run `pip install sigopt`."
+ )
+ backend = HPSearchBackend(backend)
+ if backend == HPSearchBackend.OPTUNA and not is_optuna_available():
+ raise RuntimeError("You picked the optuna backend, but it is not installed. Use `pip install optuna`.")
+ if backend == HPSearchBackend.RAY and not is_ray_tune_available():
+ raise RuntimeError(
+ "You picked the Ray Tune backend, but it is not installed. Use `pip install 'ray[tune]'`."
+ )
+ if backend == HPSearchBackend.SIGOPT and not is_sigopt_available():
+ raise RuntimeError("You picked the sigopt backend, but it is not installed. Use `pip install sigopt`.")
+ if backend == HPSearchBackend.WANDB and not is_wandb_available():
+ raise RuntimeError("You picked the wandb backend, but it is not installed. Use `pip install wandb`.")
+ self.hp_search_backend = backend
+ if self.model_init is None:
+ raise RuntimeError(
+ "To use hyperparameter search, you need to pass your model through a model_init function."
+ )
+
+ self.hp_space = default_hp_space[backend] if hp_space is None else hp_space
+ self.hp_name = hp_name
+ self.compute_objective = default_compute_objective if compute_objective is None else compute_objective
+
+ backend_dict = {
+ HPSearchBackend.OPTUNA: run_hp_search_optuna,
+ HPSearchBackend.RAY: run_hp_search_ray,
+ HPSearchBackend.SIGOPT: run_hp_search_sigopt,
+ HPSearchBackend.WANDB: run_hp_search_wandb,
+ }
+ best_run = backend_dict[backend](self, n_trials, direction, **kwargs)
+
+ self.hp_search_backend = None
+ return best_run
+
+ def log(self, logs: Dict[str, float]) -> None:
+ """
+ Log `logs` on the various objects watching training.
+
+ Subclass and override this method to inject custom behavior.
+
+ Args:
+ logs (`Dict[str, float]`):
+ The values to log.
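+
+ Example (an illustrative sketch of a subclass override; the extra `gpu_mem_gb` key is hypothetical):
+
+ ```python
+ class MyTrainer(Trainer):
+     def log(self, logs):
+         # attach a custom value to every log entry before it reaches the callbacks
+         if torch.cuda.is_available():
+             logs["gpu_mem_gb"] = torch.cuda.memory_allocated() / 1e9
+         super().log(logs)
+ ```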
+ """
+ if self.state.epoch is not None:
+ logs["epoch"] = round(self.state.epoch, 2)
+
+ output = {**logs, **{"step": self.state.global_step}}
+ self.state.log_history.append(output)
+ self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
+
+ def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]:
+ """
+ Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
+ """
+ if isinstance(data, Mapping):
+ return type(data)({k: self._prepare_input(v) for k, v in data.items()})
+ elif isinstance(data, (tuple, list)):
+ return type(data)(self._prepare_input(v) for v in data)
+ elif isinstance(data, torch.Tensor):
+ kwargs = {"device": self.args.device}
+ if self.deepspeed and (torch.is_floating_point(data) or torch.is_complex(data)):
+ # NLP models inputs are int/uint and those get adjusted to the right dtype of the
+ # embedding. Other models such as wav2vec2's inputs are already float and thus
+ # may need special handling to match the dtypes of the model
+ kwargs.update({"dtype": self.args.hf_deepspeed_config.dtype()})
+ return data.to(**kwargs)
+ return data
+
+ def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]:
+ """
+ Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
+ handling potential state.
+ """
+ inputs = self._prepare_input(inputs)
+ if len(inputs) == 0:
+ raise ValueError(
+ "The batch received was empty, your model won't be able to train on it. Double-check that your "
+ f"training dataset contains keys expected by the model: {','.join(self._signature_columns)}."
+ )
+ if self.args.past_index >= 0 and self._past is not None:
+ inputs["mems"] = self._past
+
+ return inputs
+
+ def compute_loss_context_manager(self):
+ """
+ A helper wrapper to group together context managers.
+ """
+ return self.autocast_smart_context_manager()
+
+ def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True):
+ """
+ A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
+ arguments, depending on the situation.
+ """
+ if self.use_cuda_amp or self.use_cpu_amp:
+ if is_torch_greater_or_equal_than_1_10:
+ ctx_manager = (
+ torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
+ if self.use_cpu_amp
+ else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
+ )
+ else:
+ ctx_manager = torch.cuda.amp.autocast()
+ else:
+ ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress()
+
+ return ctx_manager
+
+ def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
+ """
+ Perform a training step on a batch of inputs.
+
+ Subclass and override to inject custom behavior.
+
+ Args:
+ model (`nn.Module`):
+ The model to train.
+ inputs (`Dict[str, Union[torch.Tensor, Any]]`):
+ The inputs and targets of the model.
+
+ The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+ argument `labels`. Check your model's documentation for all accepted arguments.
+
+ Return:
+ `torch.Tensor`: The tensor with training loss on this batch.
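+
+ Example (an illustrative sketch of a subclass override; `self.step_losses` is a hypothetical list created
+ in the subclass `__init__`):
+
+ ```python
+ class MyTrainer(Trainer):
+     def training_step(self, model, inputs):
+         loss = super().training_step(model, inputs)
+         # keep a per-step history of the detached training loss
+         self.step_losses.append(loss.item())
+         return loss
+ ```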
+ """
+ model.train()
+ inputs = self._prepare_inputs(inputs)
+
+ if is_sagemaker_mp_enabled():
+ loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
+ return loss_mb.reduce_mean().detach().to(self.args.device)
+
+ with self.compute_loss_context_manager():
+ loss = self.compute_loss(model, inputs)
+
+ if self.args.n_gpu > 1:
+ loss = loss.mean() # mean() to average on multi-gpu parallel training
+
+ if self.args.gradient_accumulation_steps > 1 and not self.deepspeed:
+ # deepspeed handles loss scaling by gradient_accumulation_steps in its `backward`
+ loss = loss / self.args.gradient_accumulation_steps
+
+ if self.do_grad_scaling:
+ self.scaler.scale(loss).backward()
+ elif self.use_apex:
+ with amp.scale_loss(loss, self.optimizer) as scaled_loss:
+ scaled_loss.backward()
+ elif self.deepspeed:
+ # loss gets scaled under gradient_accumulation_steps in deepspeed
+ loss = self.deepspeed.backward(loss)
+ else:
+ loss.backward()
+
+ return loss.detach()
+
+ def compute_loss(self, model, inputs, return_outputs=False):
+ """
+ How the loss is computed by Trainer. By default, all models return the loss in the first element.
+
+ Subclass and override for custom behavior.
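+
+ Example (an illustrative sketch of a weighted-loss override for a two-label classification model; the class
+ weights are made up):
+
+ ```python
+ class WeightedLossTrainer(Trainer):
+     def compute_loss(self, model, inputs, return_outputs=False):
+         labels = inputs.pop("labels")
+         outputs = model(**inputs)
+         logits = outputs.logits
+         # up-weight the second class; the weights here are purely illustrative
+         loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0], device=logits.device))
+         loss = loss_fct(logits.view(-1, 2), labels.view(-1))
+         return (loss, outputs) if return_outputs else loss
+ ```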
+ """
+ if self.label_smoother is not None and "labels" in inputs:
+ labels = inputs.pop("labels")
+ else:
+ labels = None
+ outputs = model(**inputs)
+ # Save past state if it exists
+ # TODO: this needs to be fixed and made cleaner later.
+ if self.args.past_index >= 0:
+ self._past = outputs[self.args.past_index]
+
+ if labels is not None:
+ if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
+ loss = self.label_smoother(outputs, labels, shift_labels=True)
+ else:
+ loss = self.label_smoother(outputs, labels)
+ else:
+ if isinstance(outputs, dict) and "loss" not in outputs:
+ raise ValueError(
+ "The model did not return a loss from the inputs, only the following keys: "
+ f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
+ )
+ # We don't use .loss here since the model may return tuples instead of ModelOutput.
+ loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
+
+ return (loss, outputs) if return_outputs else loss
+
+ def is_local_process_zero(self) -> bool:
+ """
+ Whether or not this process is the local main process (e.g., the main process on its machine, when training
+ in a distributed fashion on several machines).
+ """
+ return self.args.local_process_index == 0
+
+ def is_world_process_zero(self) -> bool:
+ """
+ Whether or not this process is the global main process (when training in a distributed fashion on several
+ machines, this is only going to be `True` for one process).
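+
+ Example (an illustrative sketch; `trainer` is assumed to be an existing `Trainer` instance):
+
+ ```python
+ if trainer.is_world_process_zero():
+     # only the single global main process reaches this branch
+     print("evaluation finished")
+ ```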
+ """
+ # Special case for SageMaker ModelParallel since there process_index is dp_process_index, not the global
+ # process index.
+ if is_sagemaker_mp_enabled():
+ return smp.rank() == 0
+ else:
+ return self.args.process_index == 0
+
+ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
+ """
+ Will save the model, so you can reload it using `from_pretrained()`.
+
+ Will only save from the main process.
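+
+ Example (an illustrative sketch; the output path and `AutoModel` class are placeholders for whatever model
+ type was trained):
+
+ ```python
+ trainer.save_model("my_output_dir")
+ reloaded = AutoModel.from_pretrained("my_output_dir")
+ ```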
+ """
+
+ if output_dir is None:
+ output_dir = self.args.output_dir
+
+ if is_torch_tpu_available():
+ self._save_tpu(output_dir)
+ elif is_sagemaker_mp_enabled():
+ # Calling the state_dict needs to be done on the wrapped model and on all processes.
+ os.makedirs(output_dir, exist_ok=True)
+ state_dict = self.model_wrapped.state_dict()
+ if self.args.should_save:
+ self._save(output_dir, state_dict=state_dict)
+ if IS_SAGEMAKER_MP_POST_1_10:
+ # 'user_content.pt' indicates model state_dict saved with smp >= 1.10
+ Path(os.path.join(output_dir, "user_content.pt")).touch()
+ elif (
+ ShardedDDPOption.ZERO_DP_2 in self.args.sharded_ddp
+ or ShardedDDPOption.ZERO_DP_3 in self.args.sharded_ddp
+ or self.fsdp is not None
+ ):
+ state_dict = self.model.state_dict()
+
+ if self.args.should_save:
+ self._save(output_dir, state_dict=state_dict)
+ elif self.deepspeed:
+ # this takes care of everything as long as we aren't under zero3
+ if self.args.should_save:
+ self._save(output_dir)
+
+ if is_deepspeed_zero3_enabled():
+ # It's too complicated to try to override the different places where the weights dump gets
+ # saved, so since under zero3 the file is bogus, simply delete it. The user should either
+ # use the deepspeed checkpoint to resume, or recover the full weights with the
+ # zero_to_fp32.py script stored in the checkpoint.
+ if self.args.should_save:
+ file = os.path.join(output_dir, WEIGHTS_NAME)
+ if os.path.isfile(file):
+ # logger.info(f"deepspeed zero3: removing {file}, see zero_to_fp32.py to recover weights")
+ os.remove(file)
+
+ # now save the real model if stage3_gather_16bit_weights_on_model_save=True
+ # if false it will not be saved.
+ # This must be called on all ranks
+ if not self.deepspeed.save_16bit_model(output_dir, WEIGHTS_NAME):
+ logger.warning(
+ "deepspeed.save_16bit_model didn't save the model, since"
+ " stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead, use"
+ " zero_to_fp32.py to recover weights"
+ )
+ self.deepspeed.save_checkpoint(output_dir)
+
+ elif self.args.should_save:
+ self._save(output_dir)
+
+ # Push to the Hub when `save_model` is called by the user.
+ if self.args.push_to_hub and not _internal_call:
+ self.push_to_hub(commit_message="Model save")
+
+ def _save_tpu(self, output_dir: Optional[str] = None):
+ output_dir = output_dir if output_dir is not None else self.args.output_dir
+ logger.info(f"Saving model checkpoint to {output_dir}")
+
+ if xm.is_master_ordinal():
+ os.makedirs(output_dir, exist_ok=True)
+ torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
+
+ # Save a trained model and configuration using `save_pretrained()`.
+ # They can then be reloaded using `from_pretrained()`
+ xm.rendezvous("saving_checkpoint")
+ if not isinstance(self.model, PreTrainedModel):
+ if isinstance(unwrap_model(self.model), PreTrainedModel):
+ unwrap_model(self.model).save_pretrained(
+ output_dir,
+ is_main_process=self.args.should_save,
+ state_dict=self.model.state_dict(),
+ save_function=xm.save,
+ )
+ else:
+ logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
+ state_dict = self.model.state_dict()
+ xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
+ else:
+ self.model.save_pretrained(output_dir, is_main_process=self.args.should_save, save_function=xm.save)
+ if self.tokenizer is not None and self.args.should_save:
+ self.tokenizer.save_pretrained(output_dir)
+
+ def _save(self, output_dir: Optional[str] = None, state_dict=None):
+ # If we are executing this function, we are the process zero, so we don't check for that.
+ output_dir = output_dir if output_dir is not None else self.args.output_dir
+ os.makedirs(output_dir, exist_ok=True)
+ logger.info(f"Saving model checkpoint to {output_dir}")
+ # Save a trained model and configuration using `save_pretrained()`.
+ # They can then be reloaded using `from_pretrained()`
+ if not isinstance(self.model, PreTrainedModel):
+ if isinstance(unwrap_model(self.model), PreTrainedModel):
+ if state_dict is None:
+ state_dict = self.model.state_dict()
+ unwrap_model(self.model).save_pretrained(output_dir, state_dict=state_dict)
+ else:
+ logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
+ if state_dict is None:
+ state_dict = self.model.state_dict()
+ torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
+ else:
+ if self.save_prefixencoder:
+ print("Saving PrefixEncoder")
+ state_dict = self.model.state_dict()
+ filtered_state_dict = {}
+ for k, v in self.model.named_parameters():
+ if v.requires_grad:
+ filtered_state_dict[k] = state_dict[k]
+ self.model.save_pretrained(output_dir, state_dict=filtered_state_dict)
+ else:
+ print("Saving the whole model")
+ self.model.save_pretrained(output_dir, state_dict=state_dict)
+ if self.tokenizer is not None:
+ self.tokenizer.save_pretrained(output_dir)
+
+ # Good practice: save your training arguments together with the trained model
+ torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
+
+ def store_flos(self):
+ # Storing the number of floating-point operations that went into the model
+ if self.args.local_rank != -1:
+ self.state.total_flos += (
+ distributed_broadcast_scalars([self.current_flos], device=self.args.device).sum().item()
+ )
+ self.current_flos = 0
+ else:
+ self.state.total_flos += self.current_flos
+ self.current_flos = 0
+
+ def _sorted_checkpoints(
+ self, output_dir=None, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False
+ ) -> List[str]:
+ ordering_and_checkpoint_path = []
+
+ glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]
+
+ for path in glob_checkpoints:
+ if use_mtime:
+ ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
+ else:
+ regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
+ if regex_match is not None and regex_match.groups() is not None:
+ ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
+
+ checkpoints_sorted = sorted(ordering_and_checkpoint_path)
+ checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
+ # Make sure we don't delete the best model.
+ if self.state.best_model_checkpoint is not None:
+ best_model_index = checkpoints_sorted.index(str(Path(self.state.best_model_checkpoint)))
+ for i in range(best_model_index, len(checkpoints_sorted) - 2):
+ checkpoints_sorted[i], checkpoints_sorted[i + 1] = checkpoints_sorted[i + 1], checkpoints_sorted[i]
+ return checkpoints_sorted
+
+ def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None:
+ if self.args.save_total_limit is None or self.args.save_total_limit <= 0:
+ return
+
+ # Check if we should delete older checkpoint(s)
+ checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime, output_dir=output_dir)
+ if len(checkpoints_sorted) <= self.args.save_total_limit:
+ return
+
+ # If save_total_limit=1 with load_best_model_at_end=True, we could end up deleting the last checkpoint, which
+ # we don't do to allow resuming.
+ save_total_limit = self.args.save_total_limit
+ if (
+ self.state.best_model_checkpoint is not None
+ and self.args.save_total_limit == 1
+ and checkpoints_sorted[-1] != self.state.best_model_checkpoint
+ ):
+ save_total_limit = 2
+
+ number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
+ checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
+ for checkpoint in checkpoints_to_be_deleted:
+ logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
+ shutil.rmtree(checkpoint, ignore_errors=True)
+
+ def evaluate(
+ self,
+ eval_dataset: Optional[Dataset] = None,
+ ignore_keys: Optional[List[str]] = None,
+ metric_key_prefix: str = "eval",
+ ) -> Dict[str, float]:
+ """
+ Run evaluation and returns metrics.
+
+ The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
+ (pass it to the init `compute_metrics` argument).
+
+ You can also subclass and override this method to inject custom behavior.
+
+ Args:
+ eval_dataset (`Dataset`, *optional*):
+ Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns
+ not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__`
+ method.
+ ignore_keys (`List[str]`, *optional*):
+ A list of keys in the output of your model (if it is a dictionary) that should be ignored when
+ gathering predictions.
+ metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
+ An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
+ "eval_bleu" if the prefix is "eval" (default)
+
+ Returns:
+ A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
+ dictionary also contains the epoch number which comes from the training state.
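+
+ Example (an illustrative sketch; `my_eval_dataset` is a placeholder dataset and the loss key follows the
+ chosen prefix):
+
+ ```python
+ metrics = trainer.evaluate(eval_dataset=my_eval_dataset, metric_key_prefix="dev")
+ print(metrics["dev_loss"])
+ ```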
+ """
+ # memory metrics - must set up as early as possible
+ self._memory_tracker.start()
+
+ eval_dataloader = self.get_eval_dataloader(eval_dataset)
+ start_time = time.time()
+
+ eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
+ output = eval_loop(
+ eval_dataloader,
+ description="Evaluation",
+ # No point gathering the predictions if there are no metrics, otherwise we defer to
+ # self.args.prediction_loss_only
+ prediction_loss_only=True if self.compute_metrics is None else None,
+ ignore_keys=ignore_keys,
+ metric_key_prefix=metric_key_prefix,
+ )
+
+ total_batch_size = self.args.eval_batch_size * self.args.world_size
+ if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
+ start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
+ output.metrics.update(
+ speed_metrics(
+ metric_key_prefix,
+ start_time,
+ num_samples=output.num_samples,
+ num_steps=math.ceil(output.num_samples / total_batch_size),
+ )
+ )
+
+ self.log(output.metrics)
+
+ if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
+ # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
+ xm.master_print(met.metrics_report())
+
+ self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)
+
+ self._memory_tracker.stop_and_update_metrics(output.metrics)
+
+ return output.metrics
+
+ def predict(
+ self, test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "test"
+ ) -> PredictionOutput:
+ """
+ Run prediction and returns predictions and potential metrics.
+
+ Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
+ will also return metrics, like in `evaluate()`.
+
+ Args:
+ test_dataset (`Dataset`):
+ Dataset to run the predictions on. If it is a `datasets.Dataset`, columns not accepted by the
+ `model.forward()` method are automatically removed. Has to implement the method `__len__`
+ ignore_keys (`List[str]`, *optional*):
+ A list of keys in the output of your model (if it is a dictionary) that should be ignored when
+ gathering predictions.
+ metric_key_prefix (`str`, *optional*, defaults to `"test"`):
+ An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
+ "test_bleu" if the prefix is "test" (default)
+
+
+
+ If your predictions or labels have different sequence lengths (for instance because you're doing dynamic
+ padding in a token classification task), the predictions will be padded (on the right) to allow for
+ concatenation into one array. The padding index is -100.
+
+
+
+ Returns: *NamedTuple* A namedtuple with the following keys:
+
+ - predictions (`np.ndarray`): The predictions on `test_dataset`.
+ - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
+ - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
+ labels).
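+
+ Example (an illustrative sketch for a classification task; `test_dataset` is a placeholder):
+
+ ```python
+ output = trainer.predict(test_dataset)
+ preds = np.argmax(output.predictions, axis=-1)
+ print(output.metrics)
+ ```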
+ """
+ # memory metrics - must set up as early as possible
+ self._memory_tracker.start()
+
+ test_dataloader = self.get_test_dataloader(test_dataset)
+ start_time = time.time()
+
+ eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
+ output = eval_loop(
+ test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
+ )
+ total_batch_size = self.args.eval_batch_size * self.args.world_size
+ if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
+ start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
+ output.metrics.update(
+ speed_metrics(
+ metric_key_prefix,
+ start_time,
+ num_samples=output.num_samples,
+ num_steps=math.ceil(output.num_samples / total_batch_size),
+ )
+ )
+
+ self.control = self.callback_handler.on_predict(self.args, self.state, self.control, output.metrics)
+ self._memory_tracker.stop_and_update_metrics(output.metrics)
+
+ return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics)
+
+ def evaluation_loop(
+ self,
+ dataloader: DataLoader,
+ description: str,
+ prediction_loss_only: Optional[bool] = None,
+ ignore_keys: Optional[List[str]] = None,
+ metric_key_prefix: str = "eval",
+ ) -> EvalLoopOutput:
+ """
+ Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
+
+ Works both with or without labels.
+ """
+ args = self.args
+
+ prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only
+
+ # if eval is called w/o train init deepspeed here
+ if args.deepspeed and not self.deepspeed:
+ # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval
+ # from the checkpoint eventually
+ deepspeed_engine, _, _ = deepspeed_init(
+ self, num_training_steps=0, resume_from_checkpoint=None, inference=True
+ )
+ self.model = deepspeed_engine.module
+ self.model_wrapped = deepspeed_engine
+ self.deepspeed = deepspeed_engine
+
+ model = self._wrap_model(self.model, training=False, dataloader=dataloader)
+
+ # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
+ # while ``train`` is running, cast it to the right dtype first and then put on device
+ if not self.is_in_train:
+ if args.fp16_full_eval:
+ model = model.to(dtype=torch.float16, device=args.device)
+ elif args.bf16_full_eval:
+ model = model.to(dtype=torch.bfloat16, device=args.device)
+
+ batch_size = self.args.eval_batch_size
+
+ logger.info(f"***** Running {description} *****")
+ if has_length(dataloader):
+ logger.info(f" Num examples = {self.num_examples(dataloader)}")
+ else:
+ logger.info(" Num examples: Unknown")
+ logger.info(f" Batch size = {batch_size}")
+
+ model.eval()
+
+ self.callback_handler.eval_dataloader = dataloader
+ # Do this before wrapping.
+ eval_dataset = getattr(dataloader, "dataset", None)
+
+ if is_torch_tpu_available():
+ dataloader = pl.ParallelLoader(dataloader, [args.device]).per_device_loader(args.device)
+
+ if args.past_index >= 0:
+ self._past = None
+
+ # Initialize containers
+ # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps)
+ losses_host = None
+ preds_host = None
+ labels_host = None
+ inputs_host = None
+
+ # losses/preds/labels on CPU (final containers)
+ all_losses = None
+ all_preds = None
+ all_labels = None
+ all_inputs = None
+ # Will be useful when we have an iterable dataset so don't know its length.
+
+ observed_num_examples = 0
+ # Main evaluation loop
+ for step, inputs in enumerate(dataloader):
+ # Update the observed num examples
+ observed_batch_size = find_batch_size(inputs)
+ if observed_batch_size is not None:
+ observed_num_examples += observed_batch_size
+ # For batch samplers, batch_size is not known by the dataloader in advance.
+ if batch_size is None:
+ batch_size = observed_batch_size
+
+ # Prediction step
+ loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
+ inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None
+
+ if is_torch_tpu_available():
+ xm.mark_step()
+
+ # Update containers on host
+ if loss is not None:
+ losses = self._nested_gather(loss.repeat(batch_size))
+ losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
+ if labels is not None:
+ labels = self._pad_across_processes(labels)
+ labels = self._nested_gather(labels)
+ labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
+ if inputs_decode is not None:
+ inputs_decode = self._pad_across_processes(inputs_decode)
+ inputs_decode = self._nested_gather(inputs_decode)
+ inputs_host = (
+ inputs_decode
+ if inputs_host is None
+ else nested_concat(inputs_host, inputs_decode, padding_index=-100)
+ )
+ if logits is not None:
+ logits = self._pad_across_processes(logits)
+ logits = self._nested_gather(logits)
+ if self.preprocess_logits_for_metrics is not None:
+ logits = self.preprocess_logits_for_metrics(logits, labels)
+ preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
+ self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
+
+ # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
+ if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
+ if losses_host is not None:
+ losses = nested_numpify(losses_host)
+ all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
+ if preds_host is not None:
+ logits = nested_numpify(preds_host)
+ all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
+ if inputs_host is not None:
+ inputs_decode = nested_numpify(inputs_host)
+ all_inputs = (
+ inputs_decode
+ if all_inputs is None
+ else nested_concat(all_inputs, inputs_decode, padding_index=-100)
+ )
+ if labels_host is not None:
+ labels = nested_numpify(labels_host)
+ all_labels = (
+ labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
+ )
+
+ # Set back to None to begin a new accumulation
+ losses_host, preds_host, inputs_host, labels_host = None, None, None, None
+
+ if args.past_index and hasattr(self, "_past"):
+ # Clean the state at the end of the evaluation loop
+ delattr(self, "_past")
+
+ # Gather all remaining tensors and put them back on the CPU
+ if losses_host is not None:
+ losses = nested_numpify(losses_host)
+ all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
+ if preds_host is not None:
+ logits = nested_numpify(preds_host)
+ all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
+ if inputs_host is not None:
+ inputs_decode = nested_numpify(inputs_host)
+ all_inputs = (
+ inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100)
+ )
+ if labels_host is not None:
+ labels = nested_numpify(labels_host)
+ all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
+
+ # Number of samples
+ if has_length(eval_dataset):
+ num_samples = len(eval_dataset)
+ # The instance check is weird and does not actually check for the type, but whether the dataset has the right
+ # methods. Therefore we need to make sure it also has the attribute.
+ elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0:
+ num_samples = eval_dataset.num_examples
+ else:
+ if has_length(dataloader):
+ num_samples = self.num_examples(dataloader)
+ else: # both len(dataloader.dataset) and len(dataloader) fail
+ num_samples = observed_num_examples
+ if num_samples == 0 and observed_num_examples > 0:
+ num_samples = observed_num_examples
+
+ # The number of losses has been rounded to a multiple of batch_size and, in distributed training, the number
+ # of samples has been rounded to a multiple of batch_size as well, so we truncate.
+ if all_losses is not None:
+ all_losses = all_losses[:num_samples]
+ if all_preds is not None:
+ all_preds = nested_truncate(all_preds, num_samples)
+ if all_labels is not None:
+ all_labels = nested_truncate(all_labels, num_samples)
+ if all_inputs is not None:
+ all_inputs = nested_truncate(all_inputs, num_samples)
+
+ # Metrics!
+ if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
+ if args.include_inputs_for_metrics:
+ metrics = self.compute_metrics(
+ EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs)
+ )
+ else:
+ metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
+ else:
+ metrics = {}
+
+ # To be JSON-serializable, we need to remove numpy types or zero-d tensors
+ metrics = denumpify_detensorize(metrics)
+
+ if all_losses is not None:
+ metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()
+ if hasattr(self, "jit_compilation_time"):
+ metrics[f"{metric_key_prefix}_jit_compilation_time"] = self.jit_compilation_time
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)
+
+ def _nested_gather(self, tensors, name=None):
+ """
+ Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before
+ concatenating them to `gathered`
+ """
+ if tensors is None:
+ return
+ if is_torch_tpu_available():
+ if name is None:
+ name = "nested_gather"
+ tensors = nested_xla_mesh_reduce(tensors, name)
+ elif is_sagemaker_mp_enabled():
+ tensors = smp_gather(tensors)
+ elif self.args.local_rank != -1:
+ tensors = distributed_concat(tensors)
+ return tensors
+
+ # Copied from Accelerate.
+ def _pad_across_processes(self, tensor, pad_index=-100):
+ """
+ Recursively pad the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size so
+ they can safely be gathered.
+ """
+ if isinstance(tensor, (list, tuple)):
+ return type(tensor)(self._pad_across_processes(t, pad_index=pad_index) for t in tensor)
+ elif isinstance(tensor, dict):
+ return type(tensor)({k: self._pad_across_processes(v, pad_index=pad_index) for k, v in tensor.items()})
+ elif not isinstance(tensor, torch.Tensor):
+ raise TypeError(
+ f"Can't pad the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors."
+ )
+
+ if len(tensor.shape) < 2:
+ return tensor
+ # Gather all sizes
+ size = torch.tensor(tensor.shape, device=tensor.device)[None]
+ sizes = self._nested_gather(size).cpu()
+
+ max_size = max(s[1] for s in sizes)
+ # When extracting XLA graphs for compilation, max_size is 0,
+ # so use inequality to avoid errors.
+ if tensor.shape[1] >= max_size:
+ return tensor
+
+ # Then pad to the maximum size
+ old_size = tensor.shape
+ new_size = list(old_size)
+ new_size[1] = max_size
+ new_tensor = tensor.new_zeros(tuple(new_size)) + pad_index
+ new_tensor[:, : old_size[1]] = tensor
+ return new_tensor
+
+ def prediction_step(
+ self,
+ model: nn.Module,
+ inputs: Dict[str, Union[torch.Tensor, Any]],
+ prediction_loss_only: bool,
+ ignore_keys: Optional[List[str]] = None,
+ ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
+ """
+ Perform an evaluation step on `model` using `inputs`.
+
+ Subclass and override to inject custom behavior.
+
+ Args:
+ model (`nn.Module`):
+ The model to evaluate.
+ inputs (`Dict[str, Union[torch.Tensor, Any]]`):
+ The inputs and targets of the model.
+
+ The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+ argument `labels`. Check your model's documentation for all accepted arguments.
+ prediction_loss_only (`bool`):
+ Whether or not to return the loss only.
+ ignore_keys (`List[str]`, *optional*):
+ A list of keys in the output of your model (if it is a dictionary) that should be ignored when
+ gathering predictions.
+
+ Return:
+ Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
+ logits and labels (each being optional).
+ """
+ has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names)
+ # For CLIP-like models capable of returning loss values.
+ # If `return_loss` is not specified in `inputs` or is `None`, we check whether the default value of
+ # `return_loss` is `True` in `model.forward`.
+ return_loss = inputs.get("return_loss", None)
+ if return_loss is None:
+ return_loss = self.can_return_loss
+ loss_without_labels = True if len(self.label_names) == 0 and return_loss else False
+
+ inputs = self._prepare_inputs(inputs)
+ if ignore_keys is None:
+ if hasattr(self.model, "config"):
+ ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
+ else:
+ ignore_keys = []
+
+ # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
+ if has_labels or loss_without_labels:
+ labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
+ if len(labels) == 1:
+ labels = labels[0]
+ else:
+ labels = None
+
+ with torch.no_grad():
+ if is_sagemaker_mp_enabled():
+ raw_outputs = smp_forward_only(model, inputs)
+ if has_labels or loss_without_labels:
+ if isinstance(raw_outputs, dict):
+ loss_mb = raw_outputs["loss"]
+ logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"])
+ else:
+ loss_mb = raw_outputs[0]
+ logits_mb = raw_outputs[1:]
+
+ loss = loss_mb.reduce_mean().detach().cpu()
+ logits = smp_nested_concat(logits_mb)
+ else:
+ loss = None
+ if isinstance(raw_outputs, dict):
+ logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys)
+ else:
+ logits_mb = raw_outputs
+ logits = smp_nested_concat(logits_mb)
+ else:
+ if has_labels or loss_without_labels:
+ with self.compute_loss_context_manager():
+ loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
+ loss = loss.mean().detach()
+
+ if isinstance(outputs, dict):
+ logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
+ else:
+ logits = outputs[1:]
+ else:
+ loss = None
+ with self.compute_loss_context_manager():
+ outputs = model(**inputs)
+ if isinstance(outputs, dict):
+ logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
+ else:
+ logits = outputs
+ # TODO: this needs to be fixed and made cleaner later.
+ if self.args.past_index >= 0:
+ self._past = outputs[self.args.past_index - 1]
+
+ if prediction_loss_only:
+ return (loss, None, None)
+
+ logits = nested_detach(logits)
+ if len(logits) == 1:
+ logits = logits[0]
+
+ return (loss, logits, labels)
+
+ def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any]]):
+ """
+ For models that inherit from [`PreTrainedModel`], uses that method to compute the number of floating point
+ operations for every backward + forward pass. If using another model, either implement such a method in the
+ model or subclass and override this method.
+
+ Args:
+ inputs (`Dict[str, Union[torch.Tensor, Any]]`):
+ The inputs and targets of the model.
+
+ Returns:
+ `int`: The number of floating-point operations.
+ """
+ if hasattr(self.model, "floating_point_ops"):
+ return self.model.floating_point_ops(inputs)
+ else:
+ return 0
+
+ def init_git_repo(self, at_init: bool = False):
+ """
+ Initializes a git repo in `self.args.hub_model_id`.
+
+ Args:
+ at_init (`bool`, *optional*, defaults to `False`):
+ Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is
+ `True` and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped
+ out.
+ """
+ if not self.is_world_process_zero():
+ return
+ if self.args.hub_model_id is None:
+ repo_name = Path(self.args.output_dir).absolute().name
+ else:
+ repo_name = self.args.hub_model_id
+ if "/" not in repo_name:
+ repo_name = get_full_repo_name(repo_name, token=self.args.hub_token)
+
+ # Make sure the repo exists.
+ create_repo(repo_name, token=self.args.hub_token, private=self.args.hub_private_repo, exist_ok=True)
+ try:
+ self.repo = Repository(self.args.output_dir, clone_from=repo_name, token=self.args.hub_token)
+ except EnvironmentError:
+ if self.args.overwrite_output_dir and at_init:
+ # Try again after wiping output_dir
+ shutil.rmtree(self.args.output_dir)
+ self.repo = Repository(self.args.output_dir, clone_from=repo_name, token=self.args.hub_token)
+ else:
+ raise
+
+ self.repo.git_pull()
+
+ # By default, ignore the checkpoint folders
+ if (
+ not os.path.exists(os.path.join(self.args.output_dir, ".gitignore"))
+ and self.args.hub_strategy != HubStrategy.ALL_CHECKPOINTS
+ ):
+ with open(os.path.join(self.args.output_dir, ".gitignore"), "w", encoding="utf-8") as writer:
+ writer.writelines(["checkpoint-*/"])
+
+ # Add "*.sagemaker" to .gitignore if using SageMaker
+ if os.environ.get("SM_TRAINING_ENV"):
+ self._add_sm_patterns_to_gitignore()
+
+ self.push_in_progress = None
+
+ def create_model_card(
+ self,
+ language: Optional[str] = None,
+ license: Optional[str] = None,
+ tags: Union[str, List[str], None] = None,
+ model_name: Optional[str] = None,
+ finetuned_from: Optional[str] = None,
+ tasks: Union[str, List[str], None] = None,
+ dataset_tags: Union[str, List[str], None] = None,
+ dataset: Union[str, List[str], None] = None,
+ dataset_args: Union[str, List[str], None] = None,
+ ):
+ """
+ Creates a draft of a model card using the information available to the `Trainer`.
+
+ Args:
+ language (`str`, *optional*):
+ The language of the model (if applicable)
+ license (`str`, *optional*):
+ The license of the model. Will default to the license of the pretrained model used, if the original
+ model given to the `Trainer` comes from a repo on the Hub.
+ tags (`str` or `List[str]`, *optional*):
+ Some tags to be included in the metadata of the model card.
+ model_name (`str`, *optional*):
+ The name of the model.
+ finetuned_from (`str`, *optional*):
+ The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo
+ of the original model given to the `Trainer` (if it comes from the Hub).
+ tasks (`str` or `List[str]`, *optional*):
+ One or several task identifiers, to be included in the metadata of the model card.
+ dataset_tags (`str` or `List[str]`, *optional*):
+ One or several dataset tags, to be included in the metadata of the model card.
+ dataset (`str` or `List[str]`, *optional*):
+ One or several dataset identifiers, to be included in the metadata of the model card.
+ dataset_args (`str` or `List[str]`, *optional*):
+ One or several dataset arguments, to be included in the metadata of the model card.
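+
+ Example (an illustrative sketch; all values are placeholders):
+
+ ```python
+ trainer.create_model_card(
+     language="en",
+     license="apache-2.0",
+     model_name="my-finetuned-model",
+     finetuned_from="bert-base-uncased",
+     tasks="text-classification",
+ )
+ ```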
+ """
+ if not self.is_world_process_zero():
+ return
+
+ training_summary = TrainingSummary.from_trainer(
+ self,
+ language=language,
+ license=license,
+ tags=tags,
+ model_name=model_name,
+ finetuned_from=finetuned_from,
+ tasks=tasks,
+ dataset_tags=dataset_tags,
+ dataset=dataset,
+ dataset_args=dataset_args,
+ )
+ model_card = training_summary.to_model_card()
+ with open(os.path.join(self.args.output_dir, "README.md"), "w") as f:
+ f.write(model_card)
+
+ def _push_from_checkpoint(self, checkpoint_folder):
+ # Only push from one node.
+ if not self.is_world_process_zero() or self.args.hub_strategy == HubStrategy.END:
+ return
+ # If we haven't finished the last push, we don't do this one.
+ if self.push_in_progress is not None and not self.push_in_progress.is_done:
+ return
+
+ output_dir = self.args.output_dir
+ # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder
+ modeling_files = [CONFIG_NAME, WEIGHTS_NAME]
+ for modeling_file in modeling_files:
+ if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)):
+ shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file))
+ # Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure.
+ if self.tokenizer is not None:
+ self.tokenizer.save_pretrained(output_dir)
+ # Same for the training arguments
+ torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
+
+ try:
+ if self.args.hub_strategy == HubStrategy.CHECKPOINT:
+ # Temporarily move the checkpoint just saved for the push
+ tmp_checkpoint = os.path.join(output_dir, "last-checkpoint")
+ # We have to remove the "last-checkpoint" dir if it exists, otherwise the checkpoint is moved as a
+ # subfolder.
+ if os.path.isdir(tmp_checkpoint):
+ shutil.rmtree(tmp_checkpoint)
+ shutil.move(checkpoint_folder, tmp_checkpoint)
+
+ if self.args.save_strategy == IntervalStrategy.STEPS:
+ commit_message = f"Training in progress, step {self.state.global_step}"
+ else:
+ commit_message = f"Training in progress, epoch {int(self.state.epoch)}"
+ _, self.push_in_progress = self.repo.push_to_hub(
+ commit_message=commit_message, blocking=False, auto_lfs_prune=True
+ )
+ finally:
+ if self.args.hub_strategy == HubStrategy.CHECKPOINT:
+ # Move back the checkpoint to its place
+ shutil.move(tmp_checkpoint, checkpoint_folder)
+
+ def push_to_hub(self, commit_message: Optional[str] = "End of training", blocking: bool = True, **kwargs) -> str:
+ """
+ Upload *self.model* and *self.tokenizer* to the 🤗 model hub on the repo *self.args.hub_model_id*.
+
+ Parameters:
+ commit_message (`str`, *optional*, defaults to `"End of training"`):
+ Message to commit while pushing.
+ blocking (`bool`, *optional*, defaults to `True`):
+ Whether the function should return only when the `git push` has finished.
+ kwargs:
+ Additional keyword arguments passed along to [`~Trainer.create_model_card`].
+
+ Returns:
+ The URL of the commit of your model in the given repository if `blocking=True`, or a tuple with the URL of
+ the commit and an object to track the progress of the commit if `blocking=False`.
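+
+ Example (an illustrative sketch; assumes the Hub repository and credentials are already configured):
+
+ ```python
+ trainer.push_to_hub(commit_message="Training complete", blocking=True)
+ ```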
+ """
+ # If a user calls manually `push_to_hub` with `self.args.push_to_hub = False`, we try to create the repo but
+ # it might fail.
+ if not hasattr(self, "repo"):
+ self.init_git_repo()
+
+ model_name = kwargs.pop("model_name", None)
+ if model_name is None and self.args.should_save:
+ if self.args.hub_model_id is None:
+ model_name = Path(self.args.output_dir).name
+ else:
+ model_name = self.args.hub_model_id.split("/")[-1]
+
+ # Needs to be executed on all processes for TPU training, but will only save on the process determined by
+ # self.args.should_save.
+ self.save_model(_internal_call=True)
+
+ # Only push from one node.
+ if not self.is_world_process_zero():
+ return
+
+ # Cancel any async push in progress if blocking=True. The commits will all be pushed together.
+ if blocking and self.push_in_progress is not None and not self.push_in_progress.is_done:
+ self.push_in_progress._process.kill()
+ self.push_in_progress = None
+
+ git_head_commit_url = self.repo.push_to_hub(
+ commit_message=commit_message, blocking=blocking, auto_lfs_prune=True
+ )
+ # push the model card separately so that it stays independent from the rest of the model
+ if self.args.should_save:
+ self.create_model_card(model_name=model_name, **kwargs)
+ try:
+ self.repo.push_to_hub(
+ commit_message="update model card README.md", blocking=blocking, auto_lfs_prune=True
+ )
+ except EnvironmentError as exc:
+ logger.error(f"Error pushing update to the model card. Please read logs and retry.\n${exc}")
+
+ return git_head_commit_url
+
+ #
+ # Deprecated code
+ #
+
+ def prediction_loop(
+ self,
+ dataloader: DataLoader,
+ description: str,
+ prediction_loss_only: Optional[bool] = None,
+ ignore_keys: Optional[List[str]] = None,
+ metric_key_prefix: str = "eval",
+ ) -> EvalLoopOutput:
+ """
+ Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
+
+ Works both with or without labels.
+ """
+ args = self.args
+
+ if not has_length(dataloader):
+ raise ValueError("dataloader must implement a working __len__")
+
+ prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only
+
+ # if eval is called w/o train init deepspeed here
+ if args.deepspeed and not self.deepspeed:
+ # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval
+ # from the checkpoint eventually
+ deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None)
+ self.model = deepspeed_engine.module
+ self.model_wrapped = deepspeed_engine
+ self.deepspeed = deepspeed_engine
+ # XXX: we don't need optim/sched for inference, but this needs to be sorted out, since
+ # for example the Z3-optimizer is a must for zero3 to work even for inference - what we
+ # don't need is the deepspeed basic optimizer which is self.optimizer.optimizer
+ deepspeed_engine.optimizer.optimizer = None
+ deepspeed_engine.lr_scheduler = None
+
+ model = self._wrap_model(self.model, training=False, dataloader=dataloader)
+
+ # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
+ # while ``train`` is running, cast it to the right dtype first and then put on device
+ if not self.is_in_train:
+ if args.fp16_full_eval:
+ model = model.to(dtype=torch.float16, device=args.device)
+ elif args.bf16_full_eval:
+ model = model.to(dtype=torch.bfloat16, device=args.device)
+
+ batch_size = dataloader.batch_size
+ num_examples = self.num_examples(dataloader)
+ logger.info(f"***** Running {description} *****")
+ logger.info(f" Num examples = {num_examples}")
+ logger.info(f" Batch size = {batch_size}")
+ losses_host: torch.Tensor = None
+ preds_host: Union[torch.Tensor, List[torch.Tensor]] = None
+ labels_host: Union[torch.Tensor, List[torch.Tensor]] = None
+ inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None
+
+ world_size = max(1, args.world_size)
+
+ eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size)
+ if not prediction_loss_only:
+ # The actual number of eval samples can be greater than num_examples in distributed settings (when we
+ # pass a batch size to the sampler).
+ make_multiple_of = None
+ if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler):
+ make_multiple_of = dataloader.sampler.batch_size
+ preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
+ labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
+ inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
+
+ model.eval()
+
+ if is_torch_tpu_available():
+ dataloader = pl.ParallelLoader(dataloader, [args.device]).per_device_loader(args.device)
+
+ if args.past_index >= 0:
+ self._past = None
+
+ self.callback_handler.eval_dataloader = dataloader
+
+ for step, inputs in enumerate(dataloader):
+ loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
+ inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None
+
+ if loss is not None:
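+ # Repeat the (possibly scalar) batch loss once per example so the gatherer collects one entry per sample.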
+ losses = loss.repeat(batch_size)
+ losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
+ if logits is not None:
+ preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
+ if labels is not None:
+ labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
+ if inputs_decode is not None:
+ inputs_host = (
+ inputs_decode
+ if inputs_host is None
+ else nested_concat(inputs_host, inputs_decode, padding_index=-100)
+ )
+ self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
+
+ # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
+ if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
+ eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
+ if not prediction_loss_only:
+ preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
+ labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
+ inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids"))
+
+ # Set back to None to begin a new accumulation
+ losses_host, preds_host, labels_host, inputs_host = None, None, None, None
+
+ if args.past_index and hasattr(self, "_past"):
+ # Clean the state at the end of the evaluation loop
+ delattr(self, "_past")
+
+ # Gather all remaining tensors and put them back on the CPU
+ eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
+ if not prediction_loss_only:
+ preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
+ labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
+ inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids"))
+
+ eval_loss = eval_losses_gatherer.finalize()
+ preds = preds_gatherer.finalize() if not prediction_loss_only else None
+ label_ids = labels_gatherer.finalize() if not prediction_loss_only else None
+ inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None
+
+ if self.compute_metrics is not None and preds is not None and label_ids is not None:
+ if args.include_inputs_for_metrics:
+ metrics = self.compute_metrics(
+ EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids)
+ )
+ else:
+ metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
+ else:
+ metrics = {}
+
+ # To be JSON-serializable, we need to remove numpy types or zero-d tensors
+ metrics = denumpify_detensorize(metrics)
+
+ if eval_loss is not None:
+ metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item()
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ return EvalLoopOutput(predictions=preds, label_ids=label_ids, metrics=metrics, num_samples=num_examples)
+
+ def _gather_and_numpify(self, tensors, name):
+ """
+ Gather the values in `tensors` (a tensor or a list/tuple of nested tensors) and convert them to numpy
+ arrays before concatenating them into `gathered`.
+ """
+ if tensors is None:
+ return
+ if is_torch_tpu_available():
+ tensors = nested_xla_mesh_reduce(tensors, name)
+ elif is_sagemaker_mp_enabled():
+ tensors = smp_gather(tensors)
+ elif self.args.local_rank != -1:
+ tensors = distributed_concat(tensors)
+
+ return nested_numpify(tensors)
+
+ def _add_sm_patterns_to_gitignore(self) -> None:
+ """Add SageMaker Checkpointing patterns to .gitignore file."""
+ # Make sure we only do this on the main process
+ if not self.is_world_process_zero():
+ return
+
+ patterns = ["*.sagemaker-uploading", "*.sagemaker-uploaded"]
+
+ # Get current .gitignore content
+ if os.path.exists(os.path.join(self.repo.local_dir, ".gitignore")):
+ with open(os.path.join(self.repo.local_dir, ".gitignore"), "r") as f:
+ current_content = f.read()
+ else:
+ current_content = ""
+
+ # Add the patterns to .gitignore
+ content = current_content
+ for pattern in patterns:
+ if pattern not in content:
+ if content.endswith("\n"):
+ content += pattern
+ else:
+ content += f"\n{pattern}"
+
+ # Write the .gitignore file if it has changed
+ if content != current_content:
+ with open(os.path.join(self.repo.local_dir, ".gitignore"), "w") as f:
+ logger.debug(f"Writing .gitignore file. Content: {content}")
+ f.write(content)
+
+ self.repo.git_add(".gitignore")
+
+ # avoid race condition with git status
+ time.sleep(0.5)
+
+ if not self.repo.is_repo_clean():
+ self.repo.git_commit("Add *.sagemaker patterns to .gitignore.")
+ self.repo.git_push()
diff --git a/ptuning/trainer_seq2seq.py b/ptuning/trainer_seq2seq.py
new file mode 100644
index 0000000000000000000000000000000000000000..19d5cf12a274944a3ea3ce689414eab72636e0bd
--- /dev/null
+++ b/ptuning/trainer_seq2seq.py
@@ -0,0 +1,247 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.utils.data import Dataset
+
+from transformers.deepspeed import is_deepspeed_zero3_enabled
+from trainer import Trainer
+from transformers.trainer_utils import PredictionOutput
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Seq2SeqTrainer(Trainer):
+ def evaluate(
+ self,
+ eval_dataset: Optional[Dataset] = None,
+ ignore_keys: Optional[List[str]] = None,
+ metric_key_prefix: str = "eval",
+ **gen_kwargs
+ ) -> Dict[str, float]:
+ """
+ Run evaluation and return the metrics.
+
+ The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
+ (pass it to the init `compute_metrics` argument).
+
+ You can also subclass and override this method to inject custom behavior.
+
+ Args:
+ eval_dataset (`Dataset`, *optional*):
+ Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns
+ not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__`
+ method.
+ ignore_keys (`List[str]`, *optional*):
+ A list of keys in the output of your model (if it is a dictionary) that should be ignored when
+ gathering predictions.
+ metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
+ An optional prefix to be used as the metrics key prefix. For example, the metric "bleu" will be named
+ "eval_bleu" if the prefix is `"eval"` (default).
+ max_length (`int`, *optional*):
+ The maximum target length to use when predicting with the generate method.
+ num_beams (`int`, *optional*):
+ Number of beams for beam search that will be used when predicting with the generate method. 1 means no
+ beam search.
+ gen_kwargs:
+ Additional `generate` specific kwargs.
+
+ Returns:
+ A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
+ dictionary also contains the epoch number which comes from the training state.
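+
+ Example (a minimal, hypothetical sketch; assumes `trainer` is an instance of this class with
+ `predict_with_generate` enabled):
+
+ metrics = trainer.evaluate(max_length=256, num_beams=4)
+ # `metrics` then holds keys prefixed with `metric_key_prefix`, e.g. "eval_loss".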
+ """
+
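+ # Fall back to the generation defaults from the training arguments when the caller does not pass
+ # `max_length`/`max_new_tokens` or `num_beams`, and stash the result for `prediction_step` to reuse.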
+ gen_kwargs = gen_kwargs.copy()
+ if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None:
+ gen_kwargs["max_length"] = self.args.generation_max_length
+ gen_kwargs["num_beams"] = (
+ gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams
+ )
+ self._gen_kwargs = gen_kwargs
+
+ return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
+
+ def predict(
+ self,
+ test_dataset: Dataset,
+ ignore_keys: Optional[List[str]] = None,
+ metric_key_prefix: str = "test",
+ **gen_kwargs
+ ) -> PredictionOutput:
+ """
+ Run prediction and return predictions and potential metrics.
+
+ Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
+ will also return metrics, like in `evaluate()`.
+
+ Args:
+ test_dataset (`Dataset`):
+ Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the
+ `model.forward()` method are automatically removed. Has to implement the method `__len__`.
+ ignore_keys (`List[str]`, *optional*):
+ A list of keys in the output of your model (if it is a dictionary) that should be ignored when
+ gathering predictions.
+ metric_key_prefix (`str`, *optional*, defaults to `"test"`):
+ An optional prefix to be used as the metrics key prefix. For example, the metric "bleu" will be named
+ "test_bleu" if the prefix is `"test"` (default).
+ max_length (`int`, *optional*):
+ The maximum target length to use when predicting with the generate method.
+ num_beams (`int`, *optional*):
+ Number of beams for beam search that will be used when predicting with the generate method. 1 means no
+ beam search.
+ gen_kwargs:
+ Additional `generate` specific kwargs.
+
+
+
+ If your predictions or labels have different sequence lengths (for instance because you're doing dynamic
+ padding in a token classification task) the predictions will be padded (on the right) to allow for
+ concatenation into one array. The padding index is -100.
+
+
+
+ Returns: *NamedTuple* A namedtuple with the following keys:
+
+ - predictions (`np.ndarray`): The predictions on `test_dataset`.
+ - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
+ - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
+ labels).
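+
+ Example (a hypothetical sketch; assumes `trainer` is an instance of this class and `test_dataset` is
+ preprocessed the same way as the training data):
+
+ outputs = trainer.predict(test_dataset, max_new_tokens=64)
+ # outputs.predictions holds the generated token ids; outputs.metrics holds the "test_"-prefixed metrics.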
+ """
+
+ gen_kwargs = gen_kwargs.copy()
+ if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None:
+ gen_kwargs["max_length"] = self.args.generation_max_length
+ gen_kwargs["num_beams"] = (
+ gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams
+ )
+ self._gen_kwargs = gen_kwargs
+
+
+ return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
+
+ def prediction_step(
+ self,
+ model: nn.Module,
+ inputs: Dict[str, Union[torch.Tensor, Any]],
+ prediction_loss_only: bool,
+ ignore_keys: Optional[List[str]] = None,
+ ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
+ """
+ Perform an evaluation step on `model` using `inputs`.
+
+ Subclass and override to inject custom behavior.
+
+ Args:
+ model (`nn.Module`):
+ The model to evaluate.
+ inputs (`Dict[str, Union[torch.Tensor, Any]]`):
+ The inputs and targets of the model.
+
+ The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+ argument `labels`. Check your model's documentation for all accepted arguments.
+ prediction_loss_only (`bool`):
+ Whether or not to return the loss only.
+
+ Return:
+ Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
+ labels (each being optional).
+ """
+
+ if not self.args.predict_with_generate or prediction_loss_only:
+ return super().prediction_step(
+ model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
+ )
+
+ has_labels = "labels" in inputs
+ inputs = self._prepare_inputs(inputs)
+
+ # XXX: adapt synced_gpus for fairscale as well
+ gen_kwargs = self._gen_kwargs.copy()
+ if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None:
+ gen_kwargs["max_length"] = self.model.config.max_length
+ gen_kwargs["num_beams"] = (
+ gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams
+ )
+ default_synced_gpus = is_deepspeed_zero3_enabled()
+ gen_kwargs["synced_gpus"] = (
+ gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus
+ )
+
+ if "attention_mask" in inputs:
+ gen_kwargs["attention_mask"] = inputs.get("attention_mask", None)
+ if "position_ids" in inputs:
+ gen_kwargs["position_ids"] = inputs.get("position_ids", None)
+ if "global_attention_mask" in inputs:
+ gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None)
+
+ # prepare generation inputs
+ # Some encoder-decoder models have an encoder whose main input name differs from the model's own, so
+ # the generation inputs have to be looked up under a varying key.
+ if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name:
+ generation_inputs = inputs[self.model.encoder.main_input_name]
+ else:
+ generation_inputs = inputs[self.model.main_input_name]
+
+ gen_kwargs["input_ids"] = generation_inputs
+ generated_tokens = self.model.generate(**gen_kwargs)
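+ # The generated sequence starts with the prompt tokens, so keep only the newly generated continuation.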
+ generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:]
+
+ # in case the batch is shorter than max length, the output should be padded
+ if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]:
+ generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
+ elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < (
+ gen_kwargs["max_new_tokens"] + 1
+ ):
+ generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1)
+
+ loss = None
+
+ if self.args.prediction_loss_only:
+ return (loss, None, None)
+
+ if has_labels:
+ labels = inputs["labels"]
+ if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]:
+ labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
+ elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < (
+ gen_kwargs["max_new_tokens"] + 1
+ ):
+ labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1))
+ else:
+ labels = None
+
+ return (loss, generated_tokens, labels)
+
+ def _pad_tensors_to_max_len(self, tensor, max_length):
+ if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"):
+ # If the PAD token is not defined, at least the EOS token has to be defined.
+ pad_token_id = (
+ self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
+ )
+ else:
+ if self.model.config.pad_token_id is not None:
+ pad_token_id = self.model.config.pad_token_id
+ else:
+ raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors")
+
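+ # Build a tensor filled with the pad token at the target length, then copy the original values into
+ # its leading positions.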
+ padded_tensor = pad_token_id * torch.ones(
+ (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device
+ )
+ padded_tensor[:, : tensor.shape[-1]] = tensor
+ return padded_tensor
diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..b8ada2bdf39c8297dc2b3159270227c587bd13e9
--- /dev/null
+++ b/pytorch_model.bin.index.json
@@ -0,0 +1,375 @@
+{
+ "metadata": {
+ "total_size": 13744473856
+ },
+ "weight_map": {
+ "lm_head.weight": "pytorch_model-00008-of-00008.bin",
+ "transformer.final_layernorm.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.final_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.0.attention.dense.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.attention.dense.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.attention.query_key_value.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.attention.query_key_value.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.input_layernorm.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.post_attention_layernorm.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.attention.dense.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.attention.dense.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.attention.query_key_value.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.attention.query_key_value.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.input_layernorm.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.1.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.1.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.post_attention_layernorm.bias": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00008.bin",
+ "transformer.layers.10.attention.dense.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.attention.dense.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.attention.query_key_value.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.attention.query_key_value.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.input_layernorm.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.input_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.post_attention_layernorm.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.10.post_attention_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.11.attention.dense.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.11.attention.dense.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.11.attention.query_key_value.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.11.attention.query_key_value.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.11.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.11.input_layernorm.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.11.input_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.11.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.11.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.11.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.11.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.11.post_attention_layernorm.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.11.post_attention_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.attention.dense.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.attention.dense.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.attention.query_key_value.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.attention.query_key_value.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.input_layernorm.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.input_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.post_attention_layernorm.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.12.post_attention_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.attention.dense.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.attention.dense.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.attention.query_key_value.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.attention.query_key_value.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.input_layernorm.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.input_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.post_attention_layernorm.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.13.post_attention_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.attention.dense.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.attention.dense.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.attention.query_key_value.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.attention.query_key_value.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.input_layernorm.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.input_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.post_attention_layernorm.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.14.post_attention_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.attention.dense.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.attention.dense.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.attention.query_key_value.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.attention.query_key_value.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.input_layernorm.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.input_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.post_attention_layernorm.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.15.post_attention_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.16.attention.dense.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.16.attention.dense.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.16.attention.query_key_value.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.16.attention.query_key_value.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.16.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.16.input_layernorm.bias": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.16.input_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+ "transformer.layers.16.mlp.dense_4h_to_h.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.16.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.16.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.16.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.16.post_attention_layernorm.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.16.post_attention_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.attention.dense.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.attention.dense.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.attention.query_key_value.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.attention.query_key_value.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.attention.rotary_emb.inv_freq": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.input_layernorm.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.input_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.mlp.dense_4h_to_h.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.post_attention_layernorm.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.17.post_attention_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.attention.dense.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.attention.dense.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.attention.query_key_value.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.attention.query_key_value.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.attention.rotary_emb.inv_freq": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.input_layernorm.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.input_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.mlp.dense_4h_to_h.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.post_attention_layernorm.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.18.post_attention_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.attention.dense.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.attention.dense.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.attention.query_key_value.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.attention.query_key_value.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.attention.rotary_emb.inv_freq": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.input_layernorm.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.input_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.mlp.dense_4h_to_h.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.post_attention_layernorm.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.19.post_attention_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.2.attention.dense.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.attention.dense.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.attention.query_key_value.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.attention.query_key_value.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.input_layernorm.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.input_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.post_attention_layernorm.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.2.post_attention_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.20.attention.dense.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.20.attention.dense.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.20.attention.query_key_value.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.20.attention.query_key_value.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.20.attention.rotary_emb.inv_freq": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.20.input_layernorm.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.20.input_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.20.mlp.dense_4h_to_h.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.20.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.20.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.20.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.20.post_attention_layernorm.bias": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.20.post_attention_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+ "transformer.layers.21.attention.dense.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.attention.dense.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.attention.query_key_value.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.attention.query_key_value.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.attention.rotary_emb.inv_freq": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.input_layernorm.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.input_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.mlp.dense_4h_to_h.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.mlp.dense_h_to_4h.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.post_attention_layernorm.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.21.post_attention_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.attention.dense.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.attention.dense.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.attention.query_key_value.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.attention.query_key_value.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.attention.rotary_emb.inv_freq": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.input_layernorm.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.input_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.mlp.dense_4h_to_h.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.mlp.dense_h_to_4h.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.post_attention_layernorm.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.22.post_attention_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.attention.dense.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.attention.dense.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.attention.query_key_value.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.attention.query_key_value.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.attention.rotary_emb.inv_freq": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.input_layernorm.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.input_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.mlp.dense_4h_to_h.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.mlp.dense_h_to_4h.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.post_attention_layernorm.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.23.post_attention_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.attention.dense.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.attention.dense.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.attention.query_key_value.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.attention.query_key_value.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.attention.rotary_emb.inv_freq": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.input_layernorm.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.input_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.mlp.dense_4h_to_h.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.mlp.dense_h_to_4h.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.post_attention_layernorm.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.24.post_attention_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.25.attention.dense.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.25.attention.dense.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.25.attention.query_key_value.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.25.attention.query_key_value.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.25.attention.rotary_emb.inv_freq": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.25.input_layernorm.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.25.input_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.25.mlp.dense_4h_to_h.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.25.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.25.mlp.dense_h_to_4h.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.25.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.25.post_attention_layernorm.bias": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.25.post_attention_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+ "transformer.layers.26.attention.dense.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.attention.dense.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.attention.query_key_value.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.attention.query_key_value.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.attention.rotary_emb.inv_freq": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.input_layernorm.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.input_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.mlp.dense_4h_to_h.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.mlp.dense_h_to_4h.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.post_attention_layernorm.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.26.post_attention_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.attention.dense.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.attention.dense.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.attention.query_key_value.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.attention.query_key_value.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.attention.rotary_emb.inv_freq": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.input_layernorm.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.input_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.mlp.dense_4h_to_h.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.mlp.dense_h_to_4h.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.post_attention_layernorm.bias": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.27.post_attention_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+ "transformer.layers.3.attention.dense.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.attention.dense.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.attention.query_key_value.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.attention.query_key_value.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.input_layernorm.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.input_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.post_attention_layernorm.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.3.post_attention_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.attention.dense.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.attention.dense.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.attention.query_key_value.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.attention.query_key_value.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.input_layernorm.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.input_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.post_attention_layernorm.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.4.post_attention_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.attention.dense.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.attention.dense.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.attention.query_key_value.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.attention.query_key_value.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.input_layernorm.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.input_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.post_attention_layernorm.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.5.post_attention_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.6.attention.dense.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.6.attention.dense.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.6.attention.query_key_value.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.6.attention.query_key_value.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.6.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.6.input_layernorm.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.6.input_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.6.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.6.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.6.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.6.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.6.post_attention_layernorm.bias": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.6.post_attention_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+ "transformer.layers.7.attention.dense.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.attention.dense.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.attention.query_key_value.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.attention.query_key_value.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.input_layernorm.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.input_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.post_attention_layernorm.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.7.post_attention_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.attention.dense.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.attention.dense.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.attention.query_key_value.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.attention.query_key_value.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.input_layernorm.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.input_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.post_attention_layernorm.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.8.post_attention_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.attention.dense.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.attention.dense.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.attention.query_key_value.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.attention.query_key_value.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.input_layernorm.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.input_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.post_attention_layernorm.bias": "pytorch_model-00003-of-00008.bin",
+ "transformer.layers.9.post_attention_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+ "transformer.word_embeddings.weight": "pytorch_model-00001-of-00008.bin"
+ }
+}
diff --git a/quantization.py b/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f469f6a25a8233fe881608168daeba0bc809540
--- /dev/null
+++ b/quantization.py
@@ -0,0 +1,201 @@
+from torch.nn import Linear
+from torch.nn.parameter import Parameter
+
+import bz2
+import torch
+import base64
+import ctypes
+from transformers.utils import logging
+
+from typing import List
+from functools import partial
+
+logger = logging.get_logger(__name__)
+
+try:
+ from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up
+
+ class Kernel:
+ def __init__(self, code: bytes, function_names: List[str]):
+ self.code = code
+ self._function_names = function_names
+ self._cmodule = LazyKernelCModule(self.code)
+
+ for name in self._function_names:
+ setattr(self, name, KernelFunction(self._cmodule, name))
+
+ quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+t
e+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzfZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyH
pix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ"
+
+ kernels = Kernel(
+ bz2.decompress(base64.b64decode(quantization_code)),
+ [
+ "int4WeightCompression",
+ "int4WeightExtractionFloat",
+ "int4WeightExtractionHalf",
+ "int8WeightExtractionFloat",
+ "int8WeightExtractionHalf",
+ ],
+ )
+except Exception as exception:
+ kernels = None
+ logger.warning("Failed to load cpm_kernels:" + str(exception))
+
+
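+# The classes and helpers below implement a simple absmax weight-quantization scheme:
+# each weight row is stored as int8 (or as two 4-bit values packed per int8 byte in the
+# 4-bit case) together with a per-row half-precision scale, and is dequantized back to
+# fp16 on the fly inside the matmul, so fp16 activations run against quantized weights.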
+class W8A16Linear(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
+ ctx.inp_shape = inp.size()
+ ctx.weight_bit_width = weight_bit_width
+ out_features = quant_w.size(0)
+ inp = inp.contiguous().view(-1, inp.size(-1))
+ weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
+ ctx.weight_shape = weight.size()
+ output = inp.mm(weight.t())
+ ctx.save_for_backward(inp, quant_w, scale_w)
+ return output.view(*(ctx.inp_shape[:-1] + (out_features,)))
+
+ @staticmethod
+ def backward(ctx, grad_output: torch.Tensor):
+ inp, quant_w, scale_w = ctx.saved_tensors
+ weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
+ grad_output = grad_output.contiguous().view(-1, weight.size(0))
+ grad_input = grad_output.mm(weight)
+ grad_weight = grad_output.t().mm(inp)
+ return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None
+
+
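+# compress_int4_weight packs two 4-bit quantized values into each int8 byte (halving the
+# second dimension), and extract_weight_to_half performs the inverse unpack-and-rescale to
+# fp16 on the GPU through the cpm_kernels CUDA kernels loaded above.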
+def compress_int4_weight(weight: torch.Tensor): # (n, m)
+ with torch.cuda.device(weight.device):
+ n, m = weight.size(0), weight.size(1)
+ assert m % 2 == 0
+ m = m // 2
+ out = torch.empty(n, m, dtype=torch.int8, device="cuda")
+ stream = torch.cuda.current_stream()
+
+ gridDim = (n, 1, 1)
+ blockDim = (min(round_up(m, 32), 1024), 1, 1)
+
+ kernels.int4WeightCompression(
+ gridDim,
+ blockDim,
+ 0,
+ stream,
+ [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
+ )
+ return out
+
+
+def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
+ if source_bit_width == 8:
+ func = kernels.int8WeightExtractionHalf
+ elif source_bit_width == 4:
+ func = kernels.int4WeightExtractionHalf
+ else:
+ assert False, "Unsupported bit-width"
+
+ with torch.cuda.device(weight.device):
+ n, m = weight.size(0), weight.size(1)
+ out = torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda")
+ stream = torch.cuda.current_stream()
+
+ gridDim = (n, 1, 1)
+ blockDim = (min(round_up(m, 32), 1024), 1, 1)
+
+ func(
+ gridDim,
+ blockDim,
+ 0,
+ stream,
+ [
+ ctypes.c_void_p(weight.data_ptr()),
+ ctypes.c_void_p(scale_list.data_ptr()),
+ ctypes.c_void_p(out.data_ptr()),
+ ctypes.c_int32(n),
+ ctypes.c_int32(m),
+ ],
+ )
+ return out
+
+
+class QuantizedLinear(Linear):
+ def __init__(self, weight_bit_width: int, weight_tensor=None, bias_tensor=None, empty_init=False, *args, **kwargs):
+ super(QuantizedLinear, self).__init__(*args, **kwargs)
+ self.weight_bit_width = weight_bit_width
+
+ shape = self.weight.shape
+ del self.weight
+
+ if weight_tensor is None or empty_init:
+ self.weight = torch.empty(
+ shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"]
+ )
+ self.weight_scale = torch.empty(shape[0], dtype=kwargs["dtype"], device=kwargs["device"])
+ else:
+ self.weight_scale = (weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
+ self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
+ if weight_bit_width == 4:
+ self.weight = compress_int4_weight(self.weight)
+
+ self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
+ self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False)
+ if bias_tensor is not None:
+ self.bias = Parameter(bias_tensor.to(kwargs["device"]), requires_grad=False)
+ else:
+ self.bias = None
+
+ def forward(self, input):
+ output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
+ if self.bias is not None:
+ output = output + self.bias
+ return output
+
+
+def quantize(model, weight_bit_width, empty_init=False, **kwargs):
+ """Replace fp16 linear with quantized linear"""
+
+ for layer in model.layers:
+ layer.attention.query_key_value = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.attention.query_key_value.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.attention.query_key_value.bias,
+ in_features=layer.attention.query_key_value.in_features,
+ out_features=layer.attention.query_key_value.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.attention.query_key_value.weight.device,
+ empty_init=empty_init
+ )
+ layer.attention.dense = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.attention.dense.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.attention.dense.bias,
+ in_features=layer.attention.dense.in_features,
+ out_features=layer.attention.dense.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.attention.dense.weight.device,
+ empty_init=empty_init
+ )
+ layer.mlp.dense_h_to_4h = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.mlp.dense_h_to_4h.bias,
+ in_features=layer.mlp.dense_h_to_4h.in_features,
+ out_features=layer.mlp.dense_h_to_4h.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.mlp.dense_h_to_4h.weight.device,
+ empty_init=empty_init
+ )
+ layer.mlp.dense_4h_to_h = QuantizedLinear(
+ weight_bit_width=weight_bit_width,
+ weight_tensor=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
+ bias_tensor=layer.mlp.dense_4h_to_h.bias,
+ in_features=layer.mlp.dense_4h_to_h.in_features,
+ out_features=layer.mlp.dense_4h_to_h.out_features,
+ bias=True,
+ dtype=torch.half,
+ device=layer.mlp.dense_4h_to_h.weight.device,
+ empty_init=empty_init
+ )
+ return model
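+
+
+# Illustrative usage sketch (assumes a ChatGLM-style model whose transformer exposes .layers):
+#   model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+#   model.transformer = quantize(model.transformer, weight_bit_width=4)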
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fb8d79f7519d03793281b87b132d43bd17b85784
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+protobuf
+transformers==4.27.1
+cpm_kernels
+torch>=1.10
+gradio
+mdtex2html
+sentencepiece
+accelerate
\ No newline at end of file
diff --git a/tokenization_chatglm.py b/tokenization_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4f0ba532543b6dbdacdd83d30324b7a6abfad3
--- /dev/null
+++ b/tokenization_chatglm.py
@@ -0,0 +1,430 @@
+"""Tokenization classes for ChatGLM."""
+from typing import List, Optional, Union
+import os
+
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+from typing import Dict
+import sentencepiece as spm
+import numpy as np
+
+logger = logging.get_logger(__name__)
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "THUDM/chatglm-6b": 2048,
+}
+
+
+class TextTokenizer:
+ def __init__(self, model_path):
+ self.sp = spm.SentencePieceProcessor()
+ self.sp.Load(model_path)
+ self.num_tokens = self.sp.vocab_size()
+
+ def encode(self, text):
+ return self.sp.EncodeAsIds(text)
+
+ def decode(self, ids: List[int]):
+ return self.sp.DecodeIds(ids)
+
+ def tokenize(self, text):
+ return self.sp.EncodeAsPieces(text)
+
+ def convert_tokens_to_ids(self, tokens):
+ return [self.sp.PieceToId(token) for token in tokens]
+
+ def convert_token_to_id(self, token):
+ return self.sp.PieceToId(token)
+
+ def convert_id_to_token(self, idx):
+ return self.sp.IdToPiece(idx)
+
+ def __len__(self):
+ return self.num_tokens
+
+
+class SPTokenizer:
+ def __init__(
+ self,
+ vocab_file,
+ num_image_tokens=20000,
+ max_blank_length=80,
+ byte_fallback=True,
+ ):
+ assert vocab_file is not None
+ self.vocab_file = vocab_file
+ self.num_image_tokens = num_image_tokens
+ self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "", "", "", "", ""]
+ self.max_blank_length = max_blank_length
+ self.byte_fallback = byte_fallback
+ self.text_tokenizer = TextTokenizer(vocab_file)
+
+ def _get_text_tokenizer(self):
+ return self.text_tokenizer
+
+ @staticmethod
+ def get_blank_token(length: int):
+ assert length >= 2
+ return f"<|blank_{length}|>"
+
+ @staticmethod
+ def get_tab_token():
+ return f"<|tab|>"
+
+ @property
+ def num_text_tokens(self):
+ return self.text_tokenizer.num_tokens
+
+ @property
+ def num_tokens(self):
+ return self.num_image_tokens + self.num_text_tokens
+
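+ # _encode_whitespaces rewrites tabs and runs of spaces into dedicated tokens, e.g.
+ # "a\t  b" becomes "a<|tab|><|blank_2|>b" before it is passed to the SentencePiece model.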
+ @staticmethod
+ def _encode_whitespaces(text: str, max_len: int = 80):
+ text = text.replace("\t", SPTokenizer.get_tab_token())
+ for i in range(max_len, 1, -1):
+ text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
+ return text
+
+ def _preprocess(self, text: str, linebreak=True, whitespaces=True):
+ if linebreak:
+ text = text.replace("\n", "")
+ if whitespaces:
+ text = self._encode_whitespaces(text, max_len=self.max_blank_length)
+ return text
+
+ def encode(
+ self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+ ) -> List[int]:
+ """
+ @param text: Text to encode.
+ @param linebreak: Whether to encode newline (\n) in text.
+ @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+ @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
+ @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
+ """
+ text = self._preprocess(text, linebreak, whitespaces)
+ if not add_dummy_prefix:
+ text = "" + text
+ tmp = self._get_text_tokenizer().encode(text)
+ tokens = [x + self.num_image_tokens for x in tmp]
+ return tokens if add_dummy_prefix else tokens[2:]
+
+ def decode(self, text_ids: List[int]) -> str:
+ ids = [int(_id) - self.num_image_tokens for _id in text_ids]
+ ids = [_id for _id in ids if _id >= 0]
+ text = self._get_text_tokenizer().decode(ids)
+ text = text.replace("", "\n")
+ text = text.replace(SPTokenizer.get_tab_token(), "\t")
+ for i in range(2, self.max_blank_length + 1):
+ text = text.replace(self.get_blank_token(i), " " * i)
+ return text
+
+ def tokenize(
+ self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+ ) -> List[str]:
+ """
+ @param text: Text to encode.
+ @param linebreak: Whether to encode newline (\n) in text.
+ @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+ @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
+ @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
+ """
+ text = self._preprocess(text, linebreak, whitespaces)
+ if not add_dummy_prefix:
+ text = "" + text
+ tokens = self._get_text_tokenizer().tokenize(text)
+ return tokens if add_dummy_prefix else tokens[2:]
+
+ def __getitem__(self, x: Union[int, str]):
+ if isinstance(x, int):
+ if x < self.num_image_tokens:
+ return "".format(x)
+ else:
+ return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
+ elif isinstance(x, str):
+ if x.startswith("") and x[7:-1].isdigit():
+ return int(x[7:-1])
+ else:
+ return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
+ else:
+ raise ValueError("The key should be str or int.")
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+ """
+ Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ """
+
+ vocab_files_names = {"vocab_file": "ice_text.model"}
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+ def __init__(
+ self,
+ vocab_file,
+ do_lower_case=False,
+ remove_space=False,
+ bos_token='<sop>',
+ eos_token='<eop>',
+ end_token='</s>',
+ mask_token='[MASK]',
+ gmask_token='[gMASK]',
+ padding_side="left",
+ pad_token="",
+ unk_token="",
+ num_image_tokens=20000,
+ **kwargs
+ ) -> None:
+ super().__init__(
+ do_lower_case=do_lower_case,
+ remove_space=remove_space,
+ padding_side=padding_side,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ end_token=end_token,
+ mask_token=mask_token,
+ gmask_token=gmask_token,
+ pad_token=pad_token,
+ unk_token=unk_token,
+ num_image_tokens=num_image_tokens,
+ **kwargs
+ )
+
+ self.do_lower_case = do_lower_case
+ self.remove_space = remove_space
+ self.vocab_file = vocab_file
+
+ self.bos_token = bos_token
+ self.eos_token = eos_token
+ self.end_token = end_token
+ self.mask_token = mask_token
+ self.gmask_token = gmask_token
+
+ self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
+
+ """ Initialisation """
+
+ @property
+ def gmask_token_id(self) -> Optional[int]:
+ if self.gmask_token is None:
+ return None
+ return self.convert_tokens_to_ids(self.gmask_token)
+
+ @property
+ def end_token_id(self) -> Optional[int]:
+ """
+ `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
+ set.
+ """
+ if self.end_token is None:
+ return None
+ return self.convert_tokens_to_ids(self.end_token)
+
+ @property
+ def vocab_size(self):
+ """ Returns vocab size """
+ return self.sp_tokenizer.num_tokens
+
+ def get_vocab(self):
+ """ Returns vocab as a dict """
+ vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def preprocess_text(self, inputs):
+ if self.remove_space:
+ outputs = " ".join(inputs.strip().split())
+ else:
+ outputs = inputs
+
+ if self.do_lower_case:
+ outputs = outputs.lower()
+
+ return outputs
+
+ def _tokenize(self, text, **kwargs):
+ """ Returns a tokenized string. """
+ text = self.preprocess_text(text)
+
+ seq = self.sp_tokenizer.tokenize(text)
+
+ return seq
+
+ def _decode(
+ self,
+ token_ids: Union[int, List[int]],
+ skip_special_tokens: bool = False,
+ clean_up_tokenization_spaces: bool = True,
+ **kwargs
+ ) -> str:
+ if isinstance(token_ids, int):
+ token_ids = [token_ids]
+ if len(token_ids) == 0:
+ return ""
+ if self.pad_token_id in token_ids: # remove pad
+ token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
+ return self.sp_tokenizer.decode(token_ids)
+
+ def _convert_token_to_id(self, token):
+ """ Converts a token (str) in an id using the vocab. """
+ return self.sp_tokenizer[token]
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.sp_tokenizer[index]
+
+ def save_vocabulary(self, save_directory, filename_prefix=None):
+ """
+ Save the vocabulary and special tokens file to a directory.
+
+ Args:
+ save_directory (`str`):
+ The directory in which to save the vocabulary.
+ filename_prefix (`str`, *optional*):
+ An optional prefix to add to the named of the saved files.
+
+ Returns:
+ `Tuple(str)`: Paths to the files saved.
+ """
+ if os.path.isdir(save_directory):
+ vocab_file = os.path.join(
+ save_directory, self.vocab_files_names["vocab_file"]
+ )
+ else:
+ vocab_file = save_directory
+
+ with open(self.vocab_file, 'rb') as fin:
+ proto_str = fin.read()
+
+ with open(vocab_file, "wb") as writer:
+ writer.write(proto_str)
+
+ return (vocab_file,)
+
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Build model inputs from a sequence or a pair of sequences by appending the ChatGLM special tokens.
+ A ChatGLM sequence has the following format:
+
+ - single sequence: `X [gMASK] <sop>`
+ - pair of sequences: `A [gMASK] <sop> B <eop>`
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ gmask_id = self.sp_tokenizer[self.gmask_token]
+ eos_id = self.sp_tokenizer[self.eos_token]
+ token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
+ if token_ids_1 is not None:
+ token_ids_0 = token_ids_0 + token_ids_1 + [eos_id]
+ return token_ids_0
+
+ def _pad(
+ self,
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+ max_length: Optional[int] = None,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ pad_to_multiple_of: Optional[int] = None,
+ return_attention_mask: Optional[bool] = None,
+ ) -> dict:
+ """
+ Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+ Args:
+ encoded_inputs:
+ Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+ max_length: maximum length of the returned list and optionally padding length (see below).
+ Will truncate by taking into account the special tokens.
+ padding_strategy: PaddingStrategy to use for padding.
+
+ - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+ - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+ - PaddingStrategy.DO_NOT_PAD: Do not pad
+ The tokenizer padding sides are defined in self.padding_side:
+
+ - 'left': pads on the left of the sequences
+ - 'right': pads on the right of the sequences
+ pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+ `>= 7.5` (Volta).
+ return_attention_mask:
+ (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+ """
+ # Load from model defaults
+ bos_token_id = self.sp_tokenizer[self.bos_token]
+ mask_token_id = self.sp_tokenizer[self.mask_token]
+ gmask_token_id = self.sp_tokenizer[self.gmask_token]
+ assert self.padding_side == "left"
+
+ required_input = encoded_inputs[self.model_input_names[0]]
+ seq_length = len(required_input)
+
+ if padding_strategy == PaddingStrategy.LONGEST:
+ max_length = len(required_input)
+
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+ needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+ # Initialize attention mask if not present.
+ if max_length is not None:
+ if "attention_mask" not in encoded_inputs:
+ if bos_token_id in required_input:
+ context_length = required_input.index(bos_token_id)
+ else:
+ context_length = seq_length
+ attention_mask = np.ones((1, seq_length, seq_length))
+ attention_mask = np.tril(attention_mask)
+ attention_mask[:, :, :context_length] = 1
+ attention_mask = np.bool_(attention_mask < 0.5)
+ encoded_inputs["attention_mask"] = attention_mask
+
+ if "position_ids" not in encoded_inputs:
+ if bos_token_id in required_input:
+ context_length = required_input.index(bos_token_id)
+ else:
+ context_length = seq_length
+ position_ids = np.arange(seq_length, dtype=np.int64)
+ mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
+ if mask_token in required_input:
+ mask_position = required_input.index(mask_token)
+ position_ids[context_length:] = mask_position
+ block_position_ids = np.concatenate(
+ [np.zeros(context_length, dtype=np.int64),
+ np.arange(1, seq_length - context_length + 1, dtype=np.int64)])
+ encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
+
+ if needs_to_be_padded:
+ difference = max_length - len(required_input)
+
+ if "attention_mask" in encoded_inputs:
+ encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+ pad_width=[(0, 0), (difference, 0), (difference, 0)],
+ mode='constant', constant_values=True)
+ if "token_type_ids" in encoded_inputs:
+ encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+ "token_type_ids"
+ ]
+ if "special_tokens_mask" in encoded_inputs:
+ encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+ if "position_ids" in encoded_inputs:
+ encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+ pad_width=[(0, 0), (difference, 0)])
+ encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+
+ return encoded_inputs
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8221e09d53d36aac30bffac7e25888b2dad3743
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,20 @@
+{
+ "name_or_path": "THUDM/chatglm-6b",
+ "bos_token": "",
+ "eos_token": "",
+ "end_token": "",
+ "gmask_token": "[gMASK]",
+ "mask_token": "[MASK]",
+ "pad_token": "",
+ "unk_token": "",
+ "remove_space": false,
+ "do_lower_case": false,
+ "tokenizer_class": "ChatGLMTokenizer",
+ "num_image_tokens": 0,
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_chatglm.ChatGLMTokenizer",
+ null
+ ]
+ }
+}
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..45015f97b439bce90a813c67dfd304d3dc68cbf5
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,54 @@
+import os
+from typing import Dict, Tuple, Union, Optional
+
+from torch.nn import Module
+from transformers import AutoModel
+
+
+def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
+ # transformer.word_embeddings takes up 1 layer slot
+ # transformer.final_layernorm and lm_head together take up 1 layer slot
+ # transformer.layers takes up 28 layer slots
+ # in total, 30 slots are distributed across num_gpus GPUs
+ num_trans_layers = 28
+ per_gpu_layers = 30 / num_gpus
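+ # For example, with num_gpus=2 this maps transformer.layers.0-12 (together with the
+ # word embeddings, final layernorm and lm_head below) to GPU 0 and layers 13-27 to GPU 1.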
+
+ # bugfix: on Linux, torch.embedding can be called with weight and input on different devices, raising a RuntimeError
+ # on Windows, model.device is set to transformer.word_embeddings.device
+ # on Linux, model.device is set to lm_head.device
+ # when chat or stream_chat is called, input_ids are placed on model.device
+ # if transformer.word_embeddings.device and model.device differ, a RuntimeError is raised
+ # therefore transformer.word_embeddings, transformer.final_layernorm and lm_head are all kept on the first GPU
+ device_map = {'transformer.word_embeddings': 0,
+ 'transformer.final_layernorm': 0, 'lm_head': 0}
+
+ used = 2
+ gpu_target = 0
+ for i in range(num_trans_layers):
+ if used >= per_gpu_layers:
+ gpu_target += 1
+ used = 0
+ assert gpu_target < num_gpus
+ device_map[f'transformer.layers.{i}'] = gpu_target
+ used += 1
+
+ return device_map
+
+
+def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
+ device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
+ if num_gpus < 2 and device_map is None:
+ model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
+ else:
+ from accelerate import dispatch_model
+
+ model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half()
+
+ if device_map is None:
+ device_map = auto_configure_device_map(num_gpus)
+
+ model = dispatch_model(model, device_map=device_map)
+
+ return model
+
+
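+# Illustrative usage sketch (assumes a local ChatGLM-6B checkpoint directory or Hub id):
+#   model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)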
diff --git "a/\346\225\260\346\215\256\345\244\204\347\220\206.ipynb" "b/\346\225\260\346\215\256\345\244\204\347\220\206.ipynb"
new file mode 100644
index 0000000000000000000000000000000000000000..30401a183bcb323ac0ef6a011cde4a89f83c3e03
--- /dev/null
+++ "b/\346\225\260\346\215\256\345\244\204\347\220\206.ipynb"
@@ -0,0 +1,492 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "from random import sample\n",
+ "\n",
+ "N=100#数据条数\n",
+ "\n",
+ "dat=[]\n",
+ "with open('final_test.json','r',encoding='utf-8') as f:\n",
+ " for line in f.readlines():\n",
+ " dat.append(json.loads(line))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def f(l):\n",
+ " s=''\n",
+ " for index,value in enumerate(l):\n",
+ " if index>0:\n",
+ " s+='、'\n",
+ " s+=value\n",
+ " return s"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " crime | \n",
+ " law | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 诈骗 | \n",
+ " 266 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 合同诈骗 | \n",
+ " 224 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 故意伤害 | \n",
+ " 234 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 故意伤害 | \n",
+ " 234 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 拒不支付劳动报酬 | \n",
+ " 276 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 非法吸收公众存款 | \n",
+ " 176 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 非法处置查封、扣押、冻结的财产 | \n",
+ " 314 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 危险驾驶 | \n",
+ " 133 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 盗窃 | \n",
+ " 264 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 交通肇事 | \n",
+ " 133 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 危险驾驶 | \n",
+ " 133 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 盗窃 | \n",
+ " 264 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 诈骗 | \n",
+ " 266 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 盗窃 | \n",
+ " 264 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 危险驾驶 | \n",
+ " 133 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 盗窃 | \n",
+ " 264 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 故意伤害 | \n",
+ " 234 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 故意伤害 | \n",
+ " 234 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 走私、贩卖、运输、制造毒品 | \n",
+ " 347 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 拒不支付劳动报酬 | \n",
+ " 276 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " crime law\n",
+ "0 诈骗 266\n",
+ "1 合同诈骗 224\n",
+ "2 故意伤害 234\n",
+ "3 故意伤害 234\n",
+ "4 拒不支付劳动报酬 276\n",
+ "5 非法吸收公众存款 176\n",
+ "6 非法处置查封、扣押、冻结的财产 314\n",
+ "7 危险驾驶 133\n",
+ "8 盗窃 264\n",
+ "9 交通肇事 133\n",
+ "10 危险驾驶 133\n",
+ "11 盗窃 264\n",
+ "12 诈骗 266\n",
+ "13 盗窃 264\n",
+ "14 危险驾驶 133\n",
+ "15 盗窃 264\n",
+ "16 故意伤害 234\n",
+ "17 故意伤害 234\n",
+ "18 走私、贩卖、运输、制造毒品 347\n",
+ "19 拒不支付劳动报酬 276"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "new_dat=[]\n",
+ "crime_list=[]\n",
+ "law_list=[]\n",
+ "for i in dat:\n",
+ " if len(i['meta']['accusation'])*len(i['meta']['relevant_articles'])==1:\n",
+ " crime_list.append(f(i['meta']['accusation']))\n",
+ " law_list.append(f(i['meta']['relevant_articles']))\n",
+ " new_dat.append(i)\n",
+ "dat=new_dat\n",
+ "dat_frame=pd.DataFrame({'crime':crime_list,'law':law_list})\n",
+ "dat_frame.head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "a=dat_frame.groupby('crime').nunique().sort_values('law')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "select_crime=['信用卡诈骗','交通肇事','危险驾驶','行贿','贪污']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "new_dat=[]\n",
+ "law_list=[]\n",
+ "crime_list=[]\n",
+ "test_dat=[]\n",
+ "num_dic={}\n",
+ "for i in dat:\n",
+ " if i['meta']['accusation'][0] in select_crime:\n",
+ " num_dic[i['meta']['accusation'][0]]=num_dic.get(i['meta']['accusation'][0],0)+1\n",
+ " law_list+=i['meta']['relevant_articles']\n",
+ " crime_list+=i['meta']['accusation']\n",
+ " if num_dic[i['meta']['accusation'][0]]<=N/5:\n",
+ " new_dat.append(i)\n",
+ " else:\n",
+ " test_dat.append(i)\n",
+ "law_set=set(law_list)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "law_dic={\n",
+ " '130':'非法携带枪支、弹药、管制刀具或者爆炸性、易燃性、放射性、毒害性、腐蚀性物品,进入公共场所或者公共交通工具,危及公共安全,情节严重的,处三年以下有期徒刑、拘役或者管制。',\n",
+ " '133':'违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。',\n",
+ " '196':'有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。',\n",
+ " '234':'故意伤害他人身体的,处三年以下有期徒刑、拘役或者管制。',\n",
+ " '266':'诈骗公私财物,数额较大的,处三年以下有期徒刑、拘役或者管制,并处或者单处罚金;数额巨大或者有其他严重情节的,处三年以上十年以下有期徒刑,并处罚金;数额特别巨大或者有其他特别严重情节的,处十年以上有期徒刑或者无期徒刑,并处罚金或者没收财产。本法另有规定的,依照规定。', \n",
+ " '382':'国家工作人员利用职务上的便利,侵吞、窃取、骗取或者以其他手段非法占有公共财物的,是贪污罪。受国家机关、国有公司、企业、事业单位、人民团体委托管理、经营国有财产的人员,利用职务上的便利,侵吞、窃取、骗取或者以其他手段非法占有国有财物的,以贪污论。', \n",
+ " '383':'对犯贪污罪的,根据情节轻重,分别依照下列规定处罚:(一)个人贪污数额在十万元以上的,处十年以上有期徒刑或者无期徒刑,可以并处没收财产;情节特别严重的,处死刑,并处没收财产。(二)个人贪污数额在五万元以上不满十万元的,处五年以上有期徒刑,可以并处没收财产;情节特别严重的,处无期徒刑,并处没收财产。(三)个人贪污数额在五千元以上不满五万元的,处一年以上七年以下有期徒刑;情节严重的,处七年以上十年以下有期徒刑。个人贪污数额在五千元以上不满一万元,犯罪后有悔改表现、积极退赃的,可以减轻处罚或者免予刑事处罚,由其所在单位或者上级主管机关给予行政处分。(四)个人贪污数额不满五千元,情节较重的,处二年以下有期徒刑或者拘役;情节较轻的,由其所在单位或者上级主管机关酌情给予行政处分。对多次贪污未经处理的,按照累计贪污数额处罚。', \n",
+ " '389':'为谋取不正当利益,给予国家工作人员以财物的,是行贿罪。在经济往来中,违反国家规定,给予国家工作人员以财物,数额较大的,或者违反国家规定,给予国家工作人员以各种名义的回扣、手续费的,以行贿论处。因被勒索给予国家工作人员以财物,没有获得不正当利益的,不是行贿。', \n",
+ " '390':'对犯行贿罪的,处五年以下有期徒刑或者拘役;因行贿谋取不正当利益,情节严重的,或者使国家利益遭受重大损失的,处五年以上十年以下有期徒刑;情节特别严重的,处十年以上有期徒刑或者无期徒刑,可以并处没收财产。'\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处张某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处唐1某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处黄某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处肖某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处李某甲危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处高某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处陈某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处张某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处朱某伟危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处杨某乐交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处陈某波危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处吴某东危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处吴某涛危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处张某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处汪某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处魏某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处赵某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处王1某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处张某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处刘某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处王某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处李某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处陶某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处佟某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处杨某危险驾驶罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处付某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处王某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处:柴某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处孔某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处郑某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处谢某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处罗2某、罗3某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处常某交通肇事罪。\n",
+ "根据中华人民共和国刑法第266条,诈骗公私财物,数额较大的,处三年以下有期徒刑、拘役或者管制,并处或者单处罚金;数额巨大或者有其他严重情节的,处三年以上十年以下有期徒刑,并处罚金;数额特别巨大或者有其他特别严重情节的,处十年以上有期徒刑或者无期徒刑,并处罚金或者没收财产。本法另有规定的,依照规定。判处李2某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处李1某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处程某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处闫某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处李某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处苏某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处田某交通肇事罪。\n",
+ "根据中华人民共和国刑法第133条,违反交通运输管理法规,因而发生重大事故,致人重伤、死亡或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸或者有其他特别恶劣情节的,处三年以上七年以下有期徒刑;因逃逸致人死亡的,处七年以上有期徒刑。判处吴某交通肇事罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处肖某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处徐某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处梁某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处潘2某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处刘某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处张2某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第390条,对犯行贿罪的,处五年以下有期徒刑或者拘役;因行贿谋取不正当利益,情节严重的,或者使国家利益遭受重大损失的,处五年以上十年以下有期徒刑;情节特别严重的,处十年以上有期徒刑或者无期徒刑,可以并处没收财产。判处薛某某、黎某行贿罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处邹某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处余某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处钱某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处白某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处方某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第390条,对犯行贿罪的,处五年以下有期徒刑或者拘役;因行贿谋取不正当利益,情节严重的,或者使国家利益遭受重大损失的,处五年以上十年以下有期徒刑;情节特别严重的,处十年以上有期徒刑或者无期徒刑,可以并处没收财产。判处唐某、陈某行贿罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处王某某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处勾志某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处郝某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处崔某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处张某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处邓某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处白某某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第196条,有下列情形之一,进行信用卡诈骗活动,(一)使用伪造的信用卡,或者使用以虚假的身份证明骗领的信用卡的;(二)使用作废的信用卡的;(三)冒用他人信用卡的;(四)恶意透支的。判处詹某信用卡诈骗罪。\n",
+ "根据中华人民共和国刑法第383条,对犯贪污罪的,根据情节轻重,分别依照下列规定处罚:(一)个人贪污数额在十万元以上的,处十年以上有期徒刑或者无期徒刑,可以并处没收财产;情节特别严重的,处死刑,并处没收财产。(二)个人贪污数额在五万元以上不满十万元的,处五年以上有期徒刑,可以并处没收财产;情节特别严重的,处无期徒刑,并处没收财产。(三)个人贪污数额在五千元以上不满五万元的,处一年以上七年以下有期徒刑;情节严重的,处七年以上十年以下有期徒刑。个人贪污数额在五千元以上不满一万元,犯罪后有悔改表现、积极退赃的,可以减轻处罚或者免予刑事处罚,由其所在单位或者上级主管机关给予行政处分。(四)个人贪污数额不满五千元,情节较重的,处二年以下有期徒刑或者拘役;情节较轻的,由其所在单位或者上级主管机关酌情给予行政处分。对多次贪污未经处理的,按照累计贪污数额处罚。判处吴某贪污罪。\n",
+ "根据中华人民共和国刑法第383条,对犯贪污罪的,根据情节轻重,分别依照下列规定处罚:(一)个人贪污数额在十万元以上的,处十年以上有期徒刑或者无期徒刑,可以并处没收财产;情节特别严重的,处死刑,并处没收财产。(二)个人贪污数额在五万元以上不满十万元的,处五年以上有期徒刑,可以并处没收财产;情节特别严重的,处无期徒刑,并处没收财产。(三)个人贪污数额在五千元以上不满五万元的,处一年以上七年以下有期徒刑;情节严重的,处七年以上十年以下有期徒刑。个人贪污数额在五千元以上不满一万元,犯罪后有悔改表现、积极退赃的,可以减轻处罚或者免予刑事处罚,由其所在单位或者上级主管机关给予行政处分。(四)个人贪污数额不满五千元,情节较重的,处二年以下有期徒刑或者拘役;情节较轻的,由其所在单位或者上级主管机关酌情给予行政处分。对多次贪污未经处理的,按照累计贪污数额处罚。判处刘1某贪污罪。\n",
+ "根据中华人民共和国刑法第382条,国家工作人员利用职务上的便利,侵吞、窃取、骗取或者以其他手段非法占有公共财物的,是贪污罪。受国家机关、国有公司、企业、事业单位、人民团体委托管理、经营国有财产的人员,利用职务上的便利,侵吞、窃取、骗取或者以其他手段非法占有国有财物的,以贪污论。判处余某贪污罪。\n",
+ "根据中华人民共和国刑法第389条,为谋取不正当利益,给予国家工作人员以财物的,是行贿罪。在经济往来中,违反国家规定,给予国家工作人员以财物,数额较大的,或者违反国家规定,给予国家工作人员以各种名义的回扣、手续费的,以行贿论处。因被勒索给予国家工作人员以财物,没有获得不正当利益的,不是行贿。判处阴某行贿罪。\n",
+ "根据中华人民共和国刑法第389条,为谋取不正当利益,给予国家工作人员以财物的,是行贿罪。在经济往来中,违反国家规定,给予国家工作人员以财物,数额较大的,或者违反国家规定,给予国家工作人员以各种名义的回扣、手续费的,以行贿论处。因被勒索给予国家工作人员以财物,没有获得不正当利益的,不是行贿。判处张2某行贿罪。\n",
+ "根据中华人民共和国刑法第382条,国家工作人员利用职务上的便利,侵吞、窃取、骗取或者以其他手段非法占有公共财物的,是贪污罪。受国家机关、国有公司、企业、事业单位、人民团体委托管理、经营国有财产的人员,利用职务上的便利,侵吞、窃取、骗取或者以其他手段非法占有国有财物的,以贪污论。判处赵某某贪污罪。\n",
+ "根据中华人民共和国刑法第389条,为谋取不正当利益,给予国家工作人员以财物的,是行贿罪。在经济往来中,违反国家规定,给予国家工作人员以财物,数额较大的,或者违反国家规定,给予国家工作人员以各种名义的回扣、手续费的,以行贿论处。因被勒索给予国家工作人员以财物,没有获得不正当利益的,不是行贿。判处石某行贿罪。\n",
+ "根据中华人民共和国刑法第389条,为谋取不正当利益,给予国家工作人员以财物的,是行贿罪。在经济往来中,违反国家规定,给予国家工作人员以财物,数额较大的,或者违反国家规定,给予国家工作人员以各种名义的回扣、手续费的,以行贿论处。因被勒索给予国家工作人员以财物,没有获得不正当利益的,不是行贿。判处陈某行贿罪。\n"
+ ]
+ }
+ ],
+ "source": [
+ "train=''\n",
+ "for i in new_dat:\n",
+ " temp={}\n",
+ " temp[\"content\"]=i['fact']\n",
+ " s='根据中华人民共和国刑法' \n",
+ " for index,j in enumerate(i['meta']['relevant_articles']):\n",
+ " if index>0:\n",
+ " s+='、'\n",
+ " s+='第'+j+'条,'+law_dic[j]\n",
+ " s+='判处'\n",
+ " for index,j in enumerate(i['meta']['criminals']):\n",
+ " if index>0:\n",
+ " s+='、'\n",
+ " s+=j\n",
+ " s+=i['meta']['accusation'][0]+'罪。'\n",
+ " print(s)\n",
+ " temp[\"summary\"]=s\n",
+ " temp=json.dumps(temp,ensure_ascii=False)\n",
+ " train+=temp+'\\n'\n",
+ "\n",
+ "with open('train.json','w',encoding='utf-8') as f:\n",
+ " f.write(train)\n",
+ "\n",
+ "train=''\n",
+ "for i in test_dat:\n",
+ " temp={}\n",
+ " temp[\"content\"]=i['fact']\n",
+ " s='根据中华人民共和国刑法' \n",
+ " for index,j in enumerate(i['meta']['relevant_articles']):\n",
+ " if index>1:\n",
+ " s+='、'\n",
+ " s+='第'+j+'条,'+law_dic[j]\n",
+ " s+=',判处'\n",
+ " for index,j in enumerate(i['meta']['criminals']):\n",
+ " if index>1:\n",
+ " s+='、'\n",
+ " s+=j\n",
+ " s+=i['meta']['accusation'][0]+'罪。'\n",
+ " temp[\"summary\"]=s\n",
+ " temp=json.dumps(temp,ensure_ascii=False)\n",
+ " train+=temp+'\\n'\n",
+ "\n",
+ "with open('test.json','w',encoding='utf-8') as f:\n",
+ " f.write(train)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if False:\n",
+ " import os\n",
+ " import platform\n",
+ " import signal\n",
+ " from transformers import AutoTokenizer, AutoModel\n",
+ "\n",
+ " tokenizer = AutoTokenizer.from_pretrained(\"G:\\CODE\\Python\\ChatGLM-6B-main\", trust_remote_code=True)\n",
+ " model = AutoModel.from_pretrained(\"G:\\CODE\\Python\\ChatGLM-6B-main\", trust_remote_code=True).quantize(4).half().cuda()\n",
+ " model = model.eval()\n",
+ "\n",
+ " os_name = platform.system()\n",
+ " clear_command = 'cls' if os_name == 'Windows' else 'clear'\n",
+ " stop_stream = False\n",
+ "\n",
+ "\n",
+ " def build_prompt(history):\n",
+ " prompt = \"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\"\n",
+ " for query, response in history:\n",
+ " prompt += f\"\\n\\n用户:{query}\"\n",
+ " prompt += f\"\\n\\nChatGLM-6B:{response}\"\n",
+ " return prompt\n",
+ "\n",
+ "\n",
+ " def signal_handler(signal, frame):\n",
+ " global stop_stream\n",
+ " stop_stream = True\n",
+ "\n",
+ "\n",
+ " def main():\n",
+ " history = []\n",
+ " global stop_stream\n",
+ " print(\"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\")\n",
+ " while True:\n",
+ " query = input(\"\\n用户:\")\n",
+ " if query.strip() == \"stop\":\n",
+ " break\n",
+ " if query.strip() == \"clear\":\n",
+ " history = []\n",
+ " os.system(clear_command)\n",
+ " print(\"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\")\n",
+ " continue\n",
+ " count = 0\n",
+ " for response, history in model.stream_chat(tokenizer, query, history=history):\n",
+ " if stop_stream:\n",
+ " stop_stream = False\n",
+ " break\n",
+ " else:\n",
+ " count += 1\n",
+ " if count % 8 == 0:\n",
+ " os.system(clear_command)\n",
+ " print(build_prompt(history), flush=True)\n",
+ " signal.signal(signal.SIGINT, signal_handler)\n",
+ " os.system(clear_command)\n",
+ " print(build_prompt(history), flush=True)\n",
+ "\n",
+ "\n",
+ " if __name__ == \"__main__\":\n",
+ " main()\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git "a/\346\263\225\345\276\213\351\227\256\347\255\224.ipynb" "b/\346\263\225\345\276\213\351\227\256\347\255\224.ipynb"
new file mode 100644
index 0000000000000000000000000000000000000000..3534eb2c00ac38151143b8d484495e66d0fd89c4
--- /dev/null
+++ "b/\346\263\225\345\276\213\351\227\256\347\255\224.ipynb"
@@ -0,0 +1,812 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# ChatGLM6B"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.\n",
+ "Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.\n",
+ "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9b3296313f724fdf955b4fe41ca5302e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/8 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Some weights of ChatGLMForConditionalGeneration were not initialized from the model checkpoint at G:\\CODE\\Python\\ChatGLM-6B-main and are newly initialized: ['transformer.prefix_encoder.embedding.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "开始int4量化\n",
+ "量化完成\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "根据指控,如果指控成立\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "根据指控,如果指控成立,被告人将被判处相应的刑罚。\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "根据指控,如果指控成立,被告人将被判处相应的刑罚。\n",
+ "\n",
+ "作为公民,我们应该遵守交通规则\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "根据指控,如果指控成立,被告人将被判处相应的刑罚。\n",
+ "\n",
+ "作为公民,我们应该遵守交通规则,确保自己和他人的安全。同时\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "根据指控,如果指控成立,被告人将被判处相应的刑罚。\n",
+ "\n",
+ "作为公民,我们应该遵守交通规则,确保自己和他人的安全。同时,我们应该积极参与交通安全活动,提高交通安全\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:天津市滨海新区人民检察院的指控指控,2016年10月14日18时许,被告人许某醉酒后,无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "根据指控,如果指控成立,被告人将被判处相应的刑罚。\n",
+ "\n",
+ "作为公民,我们应该遵守交通规则,确保自己和他人的安全。同时,我们应该积极参与交通安全活动,提高交通安全意识,减少交通事故。\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:请根据给出的法律文书,判断根据刑法多少条,该判谁什么罪。法律文书如下:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:根据法律文书,可以确定以下结论:\n",
+ "\n",
+ "1. 被告人许某犯有危险驾驶罪。\n",
+ "\n",
+ "根据法律文书,2016年10月14日18时,被告人许某醉酒后无证驾驶无号牌电动二轮车,并驮带李某,沿港城大道逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中的酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "因此,根据刑法第221条,许某的行为符合危险驾驶罪的定义,应当判处有期徒刑三年以下刑罚。\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import platform\n",
+ "import signal\n",
+ "import torch\n",
+ "from transformers import AutoTokenizer, AutoModel, AutoConfig\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\n",
+ " r\"G:\CODE\Python\ChatGLM-6B-main\", trust_remote_code=True)\n",
+ "config = AutoConfig.from_pretrained(\n",
+ " r\"G:\CODE\Python\ChatGLM-6B-main\", trust_remote_code=True, pre_seq_len=128)\n",
+ "model = AutoModel.from_pretrained(\n",
+ " r\"G:\CODE\Python\ChatGLM-6B-main\", config=config, trust_remote_code=True)\n",
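+ "# NOTE: the prefix-encoder loading below is left commented out, so this cell runs the base\n",
+ "# ChatGLM-6B model without any P-Tuning v2 checkpoint (presumably as a baseline for comparison).\n",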
+ "# prefix_state_dict = torch.load(os.path.join(\n",
+ "# 'G:\\CODE\\Python\\ChatGLM-6B-main\\model_2', \"pytorch_model.bin\"))\n",
+ "# new_prefix_state_dict = {}\n",
+ "# for k, v in prefix_state_dict.items():\n",
+ "# if k.startswith(\"transformer.prefix_encoder.\"):\n",
+ "# new_prefix_state_dict[k[len(\"transformer.prefix_encoder.\"):]] = v\n",
+ "# model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)\n",
+ "\n",
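+ "# Quantize the weights to int4 so the 6B model fits in limited GPU memory, then move it to the\n",
+ "# GPU in fp16 ('开始int4量化' = 'starting int4 quantization', '量化完成' = 'quantization done').\n",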
+ "print('开始int4量化')\n",
+ "model = model.quantize(4)\n",
+ "model = model.half().cuda()\n",
+ "# model.transformer.prefix_encoder.float()\n",
+ "model = model.eval()\n",
+ "print('量化完成')\n",
+ "\n",
+ "os_name = platform.system()\n",
+ "clear_command = 'cls' if os_name == 'Windows' else 'clear'\n",
+ "stop_stream = False\n",
+ "\n",
+ "\n",
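+ "# build_prompt re-renders the whole conversation (welcome banner plus every 用户/ChatGLM-6B turn),\n",
+ "# which is why the captured output above repeats the prompt each time the screen is refreshed.\n",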
+ "def build_prompt(history):\n",
+ " prompt = \"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\"\n",
+ " for query, response in history:\n",
+ " prompt += f\"\\n\\n用户:{query}\"\n",
+ " prompt += f\"\\n\\nChatGLM-6B:{response}\"\n",
+ " return prompt\n",
+ "\n",
+ "\n",
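+ "# Ctrl+C (SIGINT) sets stop_stream so the current streaming generation can be interrupted cleanly.\n",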
+ "def signal_handler(signal, frame):\n",
+ " global stop_stream\n",
+ " stop_stream = True\n",
+ "\n",
+ "\n",
+ "def main():\n",
+ " history = []\n",
+ " global stop_stream\n",
+ " print(\"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\")\n",
+ " while True:\n",
+ " query = input(\"\\n用户:\")\n",
+ " if query.strip() == \"stop\":\n",
+ " break\n",
+ " if query.strip() == \"clear\":\n",
+ " history = []\n",
+ " os.system(clear_command)\n",
+ " print(\"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\")\n",
+ " continue\n",
+ " count = 0\n",
+ " for response, history in model.stream_chat(tokenizer, query, history=history):\n",
+ " if stop_stream:\n",
+ " stop_stream = False\n",
+ " break\n",
+ " else:\n",
+ " count += 1\n",
+ " if count % 8 == 0:\n",
+ " os.system(clear_command)\n",
+ " print(build_prompt(history), flush=True)\n",
+ " signal.signal(signal.SIGINT, signal_handler)\n",
+ " os.system(clear_command)\n",
+ " print(build_prompt(history), flush=True)\n",
+ "\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " main()\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# ChatGLM6B-Legal-model1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.\n",
+ "Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.\n",
+ "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "951aa03b169d4bc8ad4810c524a172f8",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/8 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Some weights of ChatGLMForConditionalGeneration were not initialized from the model checkpoint at G:\\CODE\\Python\\ChatGLM-6B-main and are newly initialized: ['transformer.prefix_encoder.embedding.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "开始int4量化\n",
+ "量化完成\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml\n",
+ "\n",
+ "ChatGLM-6B:根据刑法第133条,判处许某危险驾驶罪\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import platform\n",
+ "import signal\n",
+ "import torch\n",
+ "from transformers import AutoTokenizer, AutoModel, AutoConfig\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\n",
+ " r\"G:\CODE\Python\ChatGLM-6B-main\", trust_remote_code=True)\n",
+ "config = AutoConfig.from_pretrained(\n",
+ " r\"G:\CODE\Python\ChatGLM-6B-main\", trust_remote_code=True, pre_seq_len=128)\n",
+ "model = AutoModel.from_pretrained(\n",
+ " r\"G:\CODE\Python\ChatGLM-6B-main\", config=config, trust_remote_code=True)\n",
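+ "# Load the P-Tuning v2 prefix-encoder weights (the 'model_1' checkpoint, presumably the first legal\n",
+ "# fine-tune) and strip the 'transformer.prefix_encoder.' prefix from the keys before loading them.\n",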
+ "prefix_state_dict = torch.load(os.path.join(\n",
+ " r'G:\CODE\Python\ChatGLM-6B-main\model_1', \"pytorch_model.bin\"))\n",
+ "new_prefix_state_dict = {}\n",
+ "for k, v in prefix_state_dict.items():\n",
+ " if k.startswith(\"transformer.prefix_encoder.\"):\n",
+ " new_prefix_state_dict[k[len(\"transformer.prefix_encoder.\"):]] = v\n",
+ "model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)\n",
+ "\n",
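+ "# Quantize to int4 and move to GPU in fp16, but keep the newly loaded prefix encoder in fp32,\n",
+ "# mirroring the ChatGLM-6B P-Tuning v2 inference recipe.\n",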
+ "print('开始int4量化')\n",
+ "model = model.quantize(4)\n",
+ "model = model.half().cuda()\n",
+ "model.transformer.prefix_encoder.float()\n",
+ "model = model.eval()\n",
+ "print('量化完成')\n",
+ "\n",
+ "os_name = platform.system()\n",
+ "clear_command = 'cls' if os_name == 'Windows' else 'clear'\n",
+ "stop_stream = False\n",
+ "\n",
+ "\n",
+ "def build_prompt(history):\n",
+ " prompt = \"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\"\n",
+ " for query, response in history:\n",
+ " prompt += f\"\\n\\n用户:{query}\"\n",
+ " prompt += f\"\\n\\nChatGLM-6B:{response}\"\n",
+ " return prompt\n",
+ "\n",
+ "\n",
+ "def signal_handler(signal, frame):\n",
+ " global stop_stream\n",
+ " stop_stream = True\n",
+ "\n",
+ "\n",
+ "def main():\n",
+ " history = []\n",
+ " global stop_stream\n",
+ " print(\"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\")\n",
+ " while True:\n",
+ " query = input(\"\\n用户:\")\n",
+ " if query.strip() == \"stop\":\n",
+ " break\n",
+ " if query.strip() == \"clear\":\n",
+ " history = []\n",
+ " os.system(clear_command)\n",
+ " print(\"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\")\n",
+ " continue\n",
+ " count = 0\n",
+ " for response, history in model.stream_chat(tokenizer, query, history=history):\n",
+ " if stop_stream:\n",
+ " stop_stream = False\n",
+ " break\n",
+ " else:\n",
+ " count += 1\n",
+ " if count % 8 == 0:\n",
+ " os.system(clear_command)\n",
+ " print(build_prompt(history), flush=True)\n",
+ " signal.signal(signal.SIGINT, signal_handler)\n",
+ " os.system(clear_command)\n",
+ " print(build_prompt(history), flush=True)\n",
+ "\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " main()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# ChatGLM6B-Legal-model2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.\n",
+ "Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.\n",
+ "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b8887b74a6294911b5c523f9738c2746",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/8 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Some weights of ChatGLMForConditionalGeneration were not initialized from the model checkpoint at G:\\CODE\\Python\\ChatGLM-6B-main and are newly initialized: ['transformer.prefix_encoder.embedding.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "开始int4量化\n",
+ "量化完成\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\n",
+ "\n",
+ "用户:天津市滨海新区人民检察院指控,2016年10月14日18时许,被告人许某醉酒后、无证驾驶无号牌电动二轮车驮带李某,沿港城大道由西向东逆向行驶,遇前方顺行的被害人向某超速驾驶的鲁G×××××号思域牌小轿车。被告人许某驾驶的电动二轮车前部与被害人车辆右侧前部相撞,造成许某、李某受伤,两车损坏的交通事故。经鉴定,被告人许某血液中酒精含量为108.1mg/100ml。\n",
+ "\n",
+ "ChatGLM-6B:根据中华人民共和国刑法第133条,判处被告人许某危险驾驶罪。其中被告人许某驾驶无号牌电动二轮车驮带李某沿港城大道由西向东逆向行驶,经鉴定,被告人许某血液中酒精含量为108.1mg/100ml,根据中国的交通法规,血液中酒精含量超过80mg/100ml,即被认定为醉驾。因此被告人许某符合危险驾驶罪中的醉酒驾驶机动车,并且严重超过规定时速行驶。\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import platform\n",
+ "import signal\n",
+ "import torch\n",
+ "from transformers import AutoTokenizer, AutoModel, AutoConfig\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\n",
+ " r\"G:\CODE\Python\ChatGLM-6B-main\", trust_remote_code=True)\n",
+ "config = AutoConfig.from_pretrained(\n",
+ " r\"G:\CODE\Python\ChatGLM-6B-main\", trust_remote_code=True, pre_seq_len=128)\n",
+ "model = AutoModel.from_pretrained(\n",
+ " r\"G:\CODE\Python\ChatGLM-6B-main\", config=config, trust_remote_code=True)\n",
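+ "# Same setup as above, but this cell loads the 'model_2' prefix-encoder checkpoint (presumably the\n",
+ "# second legal fine-tune) on top of the same frozen base model.\n",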
+ "prefix_state_dict = torch.load(os.path.join(\n",
+ " r'G:\CODE\Python\ChatGLM-6B-main\model_2', \"pytorch_model.bin\"))\n",
+ "new_prefix_state_dict = {}\n",
+ "for k, v in prefix_state_dict.items():\n",
+ " if k.startswith(\"transformer.prefix_encoder.\"):\n",
+ " new_prefix_state_dict[k[len(\"transformer.prefix_encoder.\"):]] = v\n",
+ "model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)\n",
+ "\n",
+ "print('开始int4量化')\n",
+ "model = model.quantize(4)\n",
+ "model = model.half().cuda()\n",
+ "model.transformer.prefix_encoder.float()\n",
+ "model = model.eval()\n",
+ "print('量化完成')\n",
+ "\n",
+ "os_name = platform.system()\n",
+ "clear_command = 'cls' if os_name == 'Windows' else 'clear'\n",
+ "stop_stream = False\n",
+ "\n",
+ "\n",
+ "def build_prompt(history):\n",
+ " prompt = \"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\"\n",
+ " for query, response in history:\n",
+ " prompt += f\"\\n\\n用户:{query}\"\n",
+ " prompt += f\"\\n\\nChatGLM-6B:{response}\"\n",
+ " return prompt\n",
+ "\n",
+ "\n",
+ "def signal_handler(signal, frame):\n",
+ " global stop_stream\n",
+ " stop_stream = True\n",
+ "\n",
+ "\n",
+ "def main():\n",
+ " history = []\n",
+ " global stop_stream\n",
+ " print(\"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\")\n",
+ " while True:\n",
+ " query = input(\"\\n用户:\")\n",
+ " if query.strip() == \"stop\":\n",
+ " break\n",
+ " if query.strip() == \"clear\":\n",
+ " history = []\n",
+ " os.system(clear_command)\n",
+ " print(\"欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序\")\n",
+ " continue\n",
+ " count = 0\n",
+ " for response, history in model.stream_chat(tokenizer, query, history=history):\n",
+ " if stop_stream:\n",
+ " stop_stream = False\n",
+ " break\n",
+ " else:\n",
+ " count += 1\n",
+ " if count % 8 == 0:\n",
+ " os.system(clear_command)\n",
+ " print(build_prompt(history), flush=True)\n",
+ " signal.signal(signal.SIGINT, signal_handler)\n",
+ " os.system(clear_command)\n",
+ " print(build_prompt(history), flush=True)\n",
+ "\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " main()\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}