gosshh committed on Feb 5

Commit

42e6757

verified ·

1 Parent(s): e2c082f

Upload 25 files

Browse files

Files changed (25) hide show

.gitattributes +2 -0
LICENSE +94 -0
LICENSE.md +267 -0
README.md +96 -3
attention.py +387 -0
blocks.py +55 -0
config.json +84 -0
configuration_mpt.py +183 -0
fc.py +7 -0
ffn.py +96 -0
flash_attn_triton.py +484 -0
generation_config.json +5 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +653 -0
modeling_mpt.py +540 -0
norm.py +57 -0
special_tokens_map.json +36 -0
tokenizer.json +0 -0
tokenizer_config.json +1758 -0
trainer_state.json +0 -0
training_args.bin +3 -0
warnings.py +22 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/teaser.png filter=lfs diff=lfs merge=lfs -text
+assets/radar.png filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,94 @@

+Krutrim Community License Agreement Version 1.0
+1. Definitions:
+"Software" refers to the code, documentation, models, APIs, libraries, scripts, and any associated materials provided under this license, relating to Krutrim Community License including updates, modifications, enhancements, and derivative works.
+"You" or "Licensee" refers to the individual, organization, or legal entity exercising rights under this license, including its employees, contractors, and affiliates.
+"Modification" means any alteration to the Software, including but not limited to changes, improvements, enhancements, translations, adaptations, or derivative works based on the original Software.
+"Commercial Use" means any use of the Software intended for commercial advantage or monetary compensation where more than 1 million monthly active users of the Licensee use the Software.
+"Distribution" refers to the act of making the Software available to third parties through any means, including but not limited to physical media, downloads, or cloud-based platforms.
+2. Grant of License:
+Subject to the terms and conditions of this Agreement, Krutrim grants you a worldwide, non-exclusive, non-transferable, revocable limited license to:
+Use the Software for permitted purposes as outlined in this license, including research, academic, and personal projects.
+Modify the Software for research and personal use, provided that all changes are clearly documented, and proper attribution is maintained.
+Distribute the Software under specified conditions, ensuring compliance with attribution, modification transparency, and adherence to non-commercial restrictions where applicable.
+This license does not constitute a sale of the Software and does not grant ownership rights. Krutrim retains all intellectual property rights not explicitly granted herein.
+3. Permitted Uses:
+Research and Personal Use: Free to use, modify, and distribute for academic, educational, research, and personal purposes, provided proper attribution to Krutrim is included. This includes use in scientific studies, data analysis, and personal development projects.
+Educational Use: Permitted in teaching, training, academic projects, and coursework without commercial exploitation. Use in online courses, educational programs, and classroom environments is encouraged, provided attribution is maintained.
+Commercial Use: Commercial Use is permitted only through a separate commercial license agreement with Krutrim, which Krutrim may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Krutrim otherwise expressly grants you such rights.
+4. Prohibited Use: You agree you will not use, or allow others to use, Krutrim Community License to:
+4.1. Violate the law or others’ rights, including to:
+a. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
+i. Violence or terrorism
+ii. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
+iii. Human trafficking, exploitation, and sexual violence
+iv. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
+v. Sexual solicitation
+vi. Any other criminal activity
+b. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
+c. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
+d. Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
+e. Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
+f. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using the Krutrim Community License or Software therein
+g. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
+4.2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Krutrim Community License related to the following:
+a. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are prohibited under applicable laws
+b. Guns and illegal weapons (including weapon development)
+c. Illegal drugs and regulated/controlled substances
+d. Operation of critical infrastructure, transportation technologies, or heavy machinery
+e. Self-harm or harm to others, including suicide, cutting, and eating disorders
+f. Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
+4.3. Intentionally deceive or mislead others, including use of Krutrim Community License related to the following:
+a. Generating, promoting, or furthering fraud or the creation or promotion of disinformation
+b. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
+c. Generating, promoting, or further distributing spam
+d. Impersonating another individual without consent, authorization, or legal right
+e. Representing that the use of Krutrim Community License or outputs are human-generated
+f. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
+5. Restriction on Use to compete with Krutrim:
+The Software may not be used, directly or indirectly, by any person or any entity to develop, market, sell, or otherwise support products or services that compete with Krutrim’s core offerings.
+This restriction applies to both internal development activities and any derivative works or products created based on the Software.
+Violation of this clause will result in immediate termination of this license, along with potential legal action to seek damages and injunctive relief.
+6. Modification Rights:
+You may modify the Software for personal and research purposes. All modifications must clearly document the changes made, including the date of modification, nature of the changes, and the author responsible.
+Modified versions must retain this license, including proper attribution to Krutrim. Documentation of modifications should accompany any distributed versions.
+Modified versions may not be distributed for Commercial Use without a separate agreement. Any commercial distribution of modified versions requires prior written consent from Krutrim.
+7. Distribution:
+Non-Commercial Distribution: Allowed for academic and research purposes with proper attribution and inclusion of this license. Redistribution in open-source repositories or educational platforms is encouraged, provided attribution is visible.
+Commercial Distribution: Prohibited unless authorized under a separate commercial agreement. This includes distribution through commercial platforms, integration into proprietary products, or bundling with paid services.
+Distribution Requirements: All distributed copies, whether modified or unmodified, must include a copy of this license, clear attribution to Krutrim, and a description of any modifications. Distributors must ensure recipients are aware of these license terms.
+8. Patent Rights:
+This license does not grant any express or implied patent rights. If your use of the Software involves activities that require patent rights, you must obtain a separate license from the respective patent holders. Krutrim makes no representations regarding third-party patent claims. Users are responsible for ensuring their use does not infringe on existing patents.
+9. Warranty Disclaimer:
+The Software is provided "AS IS," without warranty of any kind, express or implied. This includes, but is not limited to, warranties of merchantability, fitness for a particular purpose, non-infringement, or the absence of latent or other defects. You assume all risks associated with the use of the Software. Krutrim disclaims any responsibility for errors, bugs, or vulnerabilities that may arise.
+10. Attribution Requirements:
+You must provide clear, visible attribution to "Krutrim" in any publication, presentation, distribution, or derivative work related to or connected with the Software. Attribution must appear in prominent locations such as documentation, user interfaces, academic papers, and software metadata.
+The attribution should include the original project name, a link to the source repository or project homepage, and a statement acknowledging Krutrim’s contribution. Failure to provide proper attribution constitutes a breach of this license.
+11. Cloud Deployment:
+Permitted: Deployment for academic, personal, and non-commercial research purposes is allowed, provided proper attribution to Krutrim is maintained. This includes hosting on private or public cloud platforms for non-commercial projects.
+Restricted: Commercial cloud deployment, including SaaS offerings, for Commercial Use requires a separate commercial license agreement. This includes services that generate revenue through subscriptions, advertisements, or other commercial models.
+12. Research Use:
+Permitted for academic, non-commercial research, provided proper attribution to Krutrim is included in any resulting publications, datasets, or presentations. Research may include data analysis, model training, simulations, and academic collaborations. Any research by a Licensee that is exploited commercially for economic gains would be construed as commercial use if it is used by more than the number of users identified under “Commercial Use”.
+Research outputs, such as papers, models, or software, must cite Krutrim where the Software significantly contributed to the results. Failure to provide proper citation may result in license termination.
+13. Limitation of Liability:
+In no event shall Krutrim or its contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages. This includes, but is not limited to, loss of data, profits, business interruptions, or any other commercial damages or losses, arising out of or in connection with the use or inability to use the Software, even if advised of the possibility of such damages. Users assume full responsibility for their reliance on the Software.
+14. Sub-licensing:
+Sub-licensing is strictly prohibited. You may not grant, assign, or otherwise transfer any rights under this license to third parties. You may not impose additional restrictions on the Software beyond those specified in this license. Any attempt to sub-license will render this license null and void.
+15. Derivative Works and Model Training:
+Permitted: You may train derivative models or develop derivative works for non-commercial, academic, or research purposes, provided you comply with attribution requirements. Derivative works should be clearly identified as modifications of the original Software.
+Restricted: Any Commercial Use of derivative models, including monetization, deployment in commercial products, or offering as a service, requires explicit written permission from Krutrim. This includes using trained models in SaaS platforms, APIs, or enterprise software solutions.
+16. Compliance and Audits:
+Krutrim reserves the right to request proof of compliance with this license. This may include, but is not limited to, documentation of modifications, usage logs, and details of distribution. Failure to provide such documentation may result in license termination. Krutrim may conduct audits to ensure compliance, with reasonable notice provided to the Licensee.
+17. Termination:
+This license will terminate automatically if you fail to comply with any of its terms. Violations may include unauthorized commercial use, failure to provide attribution, or non-compliance with distribution requirements.
+Upon termination, you must immediately cease all use, modification, and distribution of the Software and destroy all copies in your possession or control.
+Termination does not relieve you of obligations accrued prior to termination, including any liabilities for breaches that occurred before termination.
+18. Governing Law and Dispute Resolution:
+This Agreement shall be governed by and construed in accordance with the laws of India, without regard to its conflict of law principles.
+Any disputes arising from this Agreement shall be subject to arbitration under the Indian Arbitration and Conciliation Act, 1996. The arbitration will be conducted in English, with the venue in Bangalore, India. The decision of the arbitrator shall be final and binding on all parties.
+19. Severability:
+If any provision of this license is found to be invalid, illegal, or unenforceable, the remaining provisions shall continue in full force and effect. Any invalid provision shall be replaced with a valid one that comes closest to the original intent, ensuring the overall purpose of the license is preserved.
+20. Entire Agreement:
+This license constitutes the complete and exclusive agreement between you and Krutrim concerning the Software. It supersedes any prior or contemporaneous agreements, communications, or understandings, whether written or oral. Any modifications to this license must be in writing and signed by both parties.
+By using the Software, you acknowledge that you have read, understood, and agree to be bound by the terms of this license.

LICENSE.md ADDED Viewed

	@@ -0,0 +1,267 @@

+# **Krutrim Community License Agreement Version 1.0**
+### **1\. Definitions:**
+* "**Software**" refers to the code, documentation, models, APIs, libraries, scripts, and any associated materials provided under this license, relating to Krutrim Community License including updates, modifications, enhancements, and derivative works.
+* "**You**" or "**Licensee**" refers to the individual, organization, or legal entity exercising rights under this license, including its employees, contractors, and affiliates.
+* "**Modification**" means any alteration to the Software, including but not limited to changes, improvements, enhancements, translations, adaptations, or derivative works based on the original Software.
+* "**Commercial** **Use**" means any use of the Software intended for commercial advantage or monetary compensation where more than 1 million monthly active users of the Licensee use the Software.
+* "**Distribution**" refers to the act of making the Software available to third parties through any means, including but not limited to physical media, downloads, or cloud-based platforms.
+### **2\. Grant of License:**
+Subject to the terms and conditions of this Agreement, Krutrim grants you a worldwide, non-exclusive, non-transferable, revocable limited license to:
+* Use the Software for permitted purposes as outlined in this license, including research, academic, and personal projects.
+* Modify the Software for research and personal use, provided that all changes are clearly documented, and proper attribution is maintained.
+* Distribute the Software under specified conditions, ensuring compliance with attribution, modification transparency, and adherence to non-commercial restrictions where applicable.
+This license does not constitute a sale of the Software and does not grant ownership rights. Krutrim retains all intellectual property rights not explicitly granted herein.
+### **3\. Permitted Uses:**
+* Research and Personal Use: Free to use, modify, and distribute for academic, educational, research, and personal purposes, provided proper attribution to Krutrim is included. This includes use in scientific studies, data analysis, and personal development projects.
+* Educational Use: Permitted in teaching, training, academic projects, and coursework without commercial exploitation. Use in online courses, educational programs, and classroom environments is encouraged, provided attribution is maintained.
+* Commercial Use: Commercial Use is permitted only through a separate commercial license agreement with Krutrim, which Krutrim may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Krutrim otherwise expressly grants you such rights.
+**4\. Prohibited Use:** You agree you will not use, or allow others to use, Krutrim Community License to:
+4.1. Violate the law or others’ rights, including to:
+a. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
+i. Violence or terrorism
+ii. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
+iii. Human trafficking, exploitation, and sexual violence
+iv. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
+v. Sexual solicitation
+vi. Any other criminal activity
+b. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
+c. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
+d. Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
+e. Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
+f. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using the Krutrim Community License or Software therein
+g. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
+4.2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Krutrim Community License related to the following:
+a. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are prohibited under applicable laws
+b. Guns and illegal weapons (including weapon development)
+c. Illegal drugs and regulated/controlled substances
+d. Operation of critical infrastructure, transportation technologies, or heavy machinery
+e. Self-harm or harm to others, including suicide, cutting, and eating disorders
+f. Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
+4.3. Intentionally deceive or mislead others, including use of Krutrim Community License related to the following:
+a. Generating, promoting, or furthering fraud or the creation or promotion of disinformation
+b. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
+c. Generating, promoting, or further distributing spam
+d. Impersonating another individual without consent, authorization, or legal right
+e. Representing that the use of Krutrim Community License or outputs are human-generated
+f. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
+### **5\. Restriction on Use to compete with Krutrim:**
+* The Software may not be used, directly or indirectly, by any person or any entity to develop, market, sell, or otherwise support products or services that compete with Krutrim’s core offerings.
+* This restriction applies to both internal development activities and any derivative works or products created based on the Software.
+* Violation of this clause will result in immediate termination of this license, along with potential legal action to seek damages and injunctive relief.
+### **6\. Modification Rights:**
+* You may modify the Software for personal and research purposes. All modifications must clearly document the changes made, including the date of modification, nature of the changes, and the author responsible.
+* Modified versions must retain this license, including proper attribution to Krutrim. Documentation of modifications should accompany any distributed versions.
+* Modified versions may not be distributed for Commercial Use without a separate agreement. Any commercial distribution of modified versions requires prior written consent from Krutrim.
+### **7\. Distribution:**
+* Non-Commercial Distribution: Allowed for academic and research purposes with proper attribution and inclusion of this license. Redistribution in open-source repositories or educational platforms is encouraged, provided attribution is visible.
+* Commercial Distribution: Prohibited unless authorized under a separate commercial agreement. This includes distribution through commercial platforms, integration into proprietary products, or bundling with paid services.
+* Distribution Requirements: All distributed copies, whether modified or unmodified, must include a copy of this license, clear attribution to Krutrim, and a description of any modifications. Distributors must ensure recipients are aware of these license terms.
+### **8\. Patent Rights:**
+This license does not grant any express or implied patent rights. If your use of the Software involves activities that require patent rights, you must obtain a separate license from the respective patent holders. Krutrim makes no representations regarding third-party patent claims. Users are responsible for ensuring their use does not infringe on existing patents.
+### **9\. Warranty Disclaimer:**
+The Software is provided "AS IS," without warranty of any kind, express or implied. This includes, but is not limited to, warranties of merchantability, fitness for a particular purpose, non-infringement, or the absence of latent or other defects. You assume all risks associated with the use of the Software. Krutrim disclaims any responsibility for errors, bugs, or vulnerabilities that may arise.
+### **10\. Attribution Requirements:**
+* You must provide clear, visible attribution to "Krutrim" in any publication, presentation, distribution, or derivative work related to or connected with the Software. Attribution must appear in prominent locations such as documentation, user interfaces, academic papers, and software metadata.
+* The attribution should include the original project name, a link to the source repository or project homepage, and a statement acknowledging Krutrim’s contribution. Failure to provide proper attribution constitutes a breach of this license.
+### **11\. Cloud Deployment:**
+* **Permitted**: Deployment for academic, personal, and non-commercial research purposes is allowed, provided proper attribution to Krutrim is maintained. This includes hosting on private or public cloud platforms for non-commercial projects.
+* **Restricted**: Commercial cloud deployment, including SaaS offerings, for Commercial Use requires a separate commercial license agreement. This includes services that generate revenue through subscriptions, advertisements, or other commercial models.
+### **12\. Research Use:**
+* Permitted for academic, non-commercial research, provided proper attribution to Krutrim is included in any resulting publications, datasets, or presentations. Research may include data analysis, model training, simulations, and academic collaborations. Any research by a Licensee that is exploited commercially for economic gains would be construed as commercial use if it is used by more than the number of users identified under “Commercial Use”.
+* Research outputs, such as papers, models, or software, must cite Krutrim where the Software significantly contributed to the results. Failure to provide proper citation may result in license termination.
+### **13\. Limitation of Liability:**
+In no event shall Krutrim or its contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages. This includes, but is not limited to, loss of data, profits, business interruptions, or any other commercial damages or losses, arising out of or in connection with the use or inability to use the Software, even if advised of the possibility of such damages. Users assume full responsibility for their reliance on the Software.
+### **14\. Sub-licensing:**
+Sub-licensing is strictly prohibited. You may not grant, assign, or otherwise transfer any rights under this license to third parties. You may not impose additional restrictions on the Software beyond those specified in this license. Any attempt to sub-license will render this license null and void.
+### **15\. Derivative Works and Model Training:**
+* **Permitted**: You may train derivative models or develop derivative works for non-commercial, academic, or research purposes, provided you comply with attribution requirements. Derivative works should be clearly identified as modifications of the original Software.
+* **Restricted**: Any Commercial Use of derivative models, including monetization, deployment in commercial products, or offering as a service, requires explicit written permission from Krutrim. This includes using trained models in SaaS platforms, APIs, or enterprise software solutions.
+### **16\. Compliance and Audits:**
+Krutrim reserves the right to request proof of compliance with this license. This may include, but is not limited to, documentation of modifications, usage logs, and details of distribution. Failure to provide such documentation may result in license termination. Krutrim may conduct audits to ensure compliance, with reasonable notice provided to the Licensee.
+### **17\. Termination:**
+* This license will terminate automatically if you fail to comply with any of its terms. Violations may include unauthorized commercial use, failure to provide attribution, or non-compliance with distribution requirements.
+* Upon termination, you must immediately cease all use, modification, and distribution of the Software and destroy all copies in your possession or control.
+* Termination does not relieve you of obligations accrued prior to termination, including any liabilities for breaches that occurred before termination.
+### **18\. Governing Law and Dispute Resolution:**
+* This Agreement shall be governed by and construed in accordance with the laws of India, without regard to its conflict of law principles.
+* Any disputes arising from this Agreement shall be subject to arbitration under the Indian Arbitration and Conciliation Act, 1996\. The arbitration will be conducted in English, with the venue in Bangalore, India. The decision of the arbitrator shall be final and binding on all parties.
+### **19\. Severability:**
+If any provision of this license is found to be invalid, illegal, or unenforceable, the remaining provisions shall continue in full force and effect. Any invalid provision shall be replaced with a valid one that comes closest to the original intent, ensuring the overall purpose of the license is preserved.
+### **20\. Entire Agreement:**
+This license constitutes the complete and exclusive agreement between you and Krutrim concerning the Software. It supersedes any prior or contemporaneous agreements, communications, or understandings, whether written or oral. Any modifications to this license must be in writing and signed by both parties.
+By using the Software, you acknowledge that you have read, understood, and agree to be bound by the terms of this license.
+# **Krutrim Community License Agreement Version 1.0**
+### **1\. Definitions:**
+* "**Software**" refers to the code, documentation, models, APIs, libraries, scripts, and any associated materials provided under this license, relating to Krutrim Community License including updates, modifications, enhancements, and derivative works.
+* "**You**" or "**Licensee**" refers to the individual, organization, or legal entity exercising rights under this license, including its employees, contractors, and affiliates.
+* "**Modification**" means any alteration to the Software, including but not limited to changes, improvements, enhancements, translations, adaptations, or derivative works based on the original Software.
+* "**Commercial** **Use**" means any use of the Software intended for commercial advantage or monetary compensation where more than 1 million monthly active users of the Licensee use the Software.
+* "**Distribution**" refers to the act of making the Software available to third parties through any means, including but not limited to physical media, downloads, or cloud-based platforms.
+### **2\. Grant of License:**
+Subject to the terms and conditions of this Agreement, Krutrim grants you a worldwide, non-exclusive, non-transferable, revocable limited license to:
+* Use the Software for permitted purposes as outlined in this license, including research, academic, and personal projects.
+* Modify the Software for research and personal use, provided that all changes are clearly documented, and proper attribution is maintained.
+* Distribute the Software under specified conditions, ensuring compliance with attribution, modification transparency, and adherence to non-commercial restrictions where applicable.
+This license does not constitute a sale of the Software and does not grant ownership rights. Krutrim retains all intellectual property rights not explicitly granted herein.
+### **3\. Permitted Uses:**
+* Research and Personal Use: Free to use, modify, and distribute for academic, educational, research, and personal purposes, provided proper attribution to Krutrim is included. This includes use in scientific studies, data analysis, and personal development projects.
+* Educational Use: Permitted in teaching, training, academic projects, and coursework without commercial exploitation. Use in online courses, educational programs, and classroom environments is encouraged, provided attribution is maintained.
+* Commercial Use: Commercial Use is permitted only through a separate commercial license agreement with Krutrim, which Krutrim may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Krutrim otherwise expressly grants you such rights.
+### **4\. Prohibited Use:** You agree you will not use, or allow others to use, Krutrim Community License to:
+4.1. Violate the law or others’ rights, including to:
+a. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
+i. Violence or terrorism
+ii. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
+iii. Human trafficking, exploitation, and sexual violence
+iv. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
+v. Sexual solicitation
+vi. Any other criminal activity
+b. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
+c. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
+d. Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
+e. Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
+f. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using the Krutrim Community License or Software therein
+g. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
+4.2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Krutrim Community License related to the following:
+a. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are prohibited under applicable laws
+b. Guns and illegal weapons (including weapon development)
+c. Illegal drugs and regulated/controlled substances
+d. Operation of critical infrastructure, transportation technologies, or heavy machinery
+e. Self-harm or harm to others, including suicide, cutting, and eating disorders
+f. Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
+4.3. Intentionally deceive or mislead others, including use of Krutrim Community License related to the following:
+a. Generating, promoting, or furthering fraud or the creation or promotion of disinformation
+b. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
+c. Generating, promoting, or further distributing spam
+d. Impersonating another individual without consent, authorization, or legal right
+e. Representing that the use of Krutrim Community License or outputs are human-generated
+f. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
+### **5\. Restriction on Use to compete with Krutrim:**
+* The Software may not be used, directly or indirectly, by any person or any entity to develop, market, sell, or otherwise support products or services that compete with Krutrim’s core offerings.
+* This restriction applies to both internal development activities and any derivative works or products created based on the Software.
+* Violation of this clause will result in immediate termination of this license, along with potential legal action to seek damages and injunctive relief.
+### **6\. Modification Rights:**
+* You may modify the Software for personal and research purposes. All modifications must clearly document the changes made, including the date of modification, nature of the changes, and the author responsible.
+* Modified versions must retain this license, including proper attribution to Krutrim. Documentation of modifications should accompany any distributed versions.
+* Modified versions may not be distributed for Commercial Use without a separate agreement. Any commercial distribution of modified versions requires prior written consent from Krutrim.
+### **7\. Distribution:**
+* Non-Commercial Distribution: Allowed for academic and research purposes with proper attribution and inclusion of this license. Redistribution in open-source repositories or educational platforms is encouraged, provided attribution is visible.
+* Commercial Distribution: Prohibited unless authorized under a separate commercial agreement. This includes distribution through commercial platforms, integration into proprietary products, or bundling with paid services.
+* Distribution Requirements: All distributed copies, whether modified or unmodified, must include a copy of this license, clear attribution to Krutrim, and a description of any modifications. Distributors must ensure recipients are aware of these license terms.
+### **8\. Patent Rights:** This license does not grant any express or implied patent rights. If your use of the Software involves activities that require patent rights, you must obtain a separate license from the respective patent holders. Krutrim makes no representations regarding third-party patent claims. Users are responsible for ensuring their use does not infringe on existing patents.
+### **9\. Warranty Disclaimer:**
+The Software is provided "AS IS," without warranty of any kind, express or implied. This includes, but is not limited to, warranties of merchantability, fitness for a particular purpose, non-infringement, or the absence of latent or other defects. You assume all risks associated with the use of the Software. Krutrim disclaims any responsibility for errors, bugs, or vulnerabilities that may arise.
+### **10\. Attribution Requirements:**
+* You must provide clear, visible attribution to "Krutrim" in any publication, presentation, distribution, or derivative work related to or connected with the Software. Attribution must appear in prominent locations such as documentation, user interfaces, academic papers, and software metadata.
+* The attribution should include the original project name, a link to the source repository or project homepage, and a statement acknowledging Krutrim’s contribution. Failure to provide proper attribution constitutes a breach of this license.
+### **11\. Cloud Deployment:**
+* **Permitted**: Deployment for academic, personal, and non-commercial research purposes is allowed, provided proper attribution to Krutrim is maintained. This includes hosting on private or public cloud platforms for non-commercial projects.
+* **Restricted**: Commercial cloud deployment, including SaaS offerings, for Commercial Use requires a separate commercial license agreement. This includes services that generate revenue through subscriptions, advertisements, or other commercial models.
+### **12\. Research Use:**
+* Permitted for academic, non-commercial research, provided proper attribution to Krutrim is included in any resulting publications, datasets, or presentations. Research may include data analysis, model training, simulations, and academic collaborations. Any research by a Licensee that is exploited commercially for economic gains would be construed as commercial use if it is used by more than the number of users identified under “Commercial Use”.
+* Research outputs, such as papers, models, or software, must cite Krutrim where the Software significantly contributed to the results. Failure to provide proper citation may result in license termination.
+### **13\. Limitation of Liability:**
+In no event shall Krutrim or its contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages. This includes, but is not limited to, loss of data, profits, business interruptions, or any other commercial damages or losses, arising out of or in connection with the use or inability to use the Software, even if advised of the possibility of such damages. Users assume full responsibility for their reliance on the Software.
+### **14\. Sub-licensing:**
+Sub-licensing is strictly prohibited. You may not grant, assign, or otherwise transfer any rights under this license to third parties. You may not impose additional restrictions on the Software beyond those specified in this license. Any attempt to sub-license will render this license null and void.
+### **15\. Derivative Works and Model Training:**
+* **Permitted**: You may train derivative models or develop derivative works for non-commercial, academic, or research purposes, provided you comply with attribution requirements. Derivative works should be clearly identified as modifications of the original Software.
+* **Restricted**: Any Commercial Use of derivative models, including monetization, deployment in commercial products, or offering as a service, requires explicit written permission from Krutrim. This includes using trained models in SaaS platforms, APIs, or enterprise software solutions.
+### **16\. Compliance and Audits:**
+Krutrim reserves the right to request proof of compliance with this license. This may include, but is not limited to, documentation of modifications, usage logs, and details of distribution. Failure to provide such documentation may result in license termination. Krutrim may conduct audits to ensure compliance, with reasonable notice provided to the Licensee.
+### **17\. Termination:**
+* This license will terminate automatically if you fail to comply with any of its terms. Violations may include unauthorized commercial use, failure to provide attribution, or non-compliance with distribution requirements.
+* Upon termination, you must immediately cease all use, modification, and distribution of the Software and destroy all copies in your possession or control.
+* Termination does not relieve you of obligations accrued prior to termination, including any liabilities for breaches that occurred before termination.
+### **18\. Governing Law and Dispute Resolution:**
+* This Agreement shall be governed by and construed in accordance with the laws of India, without regard to its conflict of law principles.
+* Any disputes arising from this Agreement shall be subject to arbitration under the Indian Arbitration and Conciliation Act, 1996\. The arbitration will be conducted in English, with the venue in Bangalore, India. The decision of the arbitrator shall be final and binding on all parties.
+### **19\. Severability:**
+If any provision of this license is found to be invalid, illegal, or unenforceable, the remaining provisions shall continue in full force and effect. Any invalid provision shall be replaced with a valid one that comes closest to the original intent, ensuring the overall purpose of the license is preserved.
+### **20\. Entire Agreement:**
+This license constitutes the complete and exclusive agreement between you and Krutrim concerning the Software. It supersedes any prior or contemporaneous agreements, communications, or understandings, whether written or oral. Any modifications to this license must be in writing and signed by both parties.
+By using the Software, you acknowledge that you have read, understood, and agree to be bound by the terms of this license.

README.md CHANGED Viewed

@@ -1,3 +1,96 @@
----
-license: apache-2.0
----

+---
+license: other
+license_name: krutrim-community-license-agreement-version-1.0
+license_link: LICENSE.md
+language:
+- hi
+- bn
+- ta
+- te
+- gu
+- or
+- en
+- as
+- ml
+- mr
+- kn
+---
+# Chitrarth: Bridging Vision and Language for a Billion People
+[![Static Badge](https://img.shields.io/badge/Huggingface-Chitrarth-yellow?logo=huggingface)](https://huggingface.co/krutrim-ai-labs/chitrarth)	[![Static Badge](https://img.shields.io/badge/Github-Chitrarth-green?logo=github)](https://github.com/ola-krutrim/Chitrarth)	[![Static Badge](https://img.shields.io/badge/Krutrim_Cloud-Chitrarth-orange?logo=data:image/png%2bxml;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAADpUlEQVRYCbVXTUhbQRDeRJqi2JSS1BQtgpCa0kiPehdNi6dWbfWgF0s9eGtPFSFG8VDMpSiCB28KQou0NwsS9NA/Dx4qNP1TUIqSmlKSFjQx4vabbXbJz8vLe2kz8GX3zc7MN2/2J/sszLichekN4A7gBZxpcLQ/0gijfQq8BFLAf5ELiBIEfgNEZgSxtA/5liw2eD4EfgJGSLVsyPcBQLFMiR3WIUAraCm6F4hFMQ2JB1afgFKI9Hw+IubVYhnQwvpSBnKZ2GfEvlgoiTMYeFNGcpnEK3AQV548gkYalbslLiGWdEtl2QbOpZ9FMzg4yGprazNVpvrr6+tseXlZy+cXlFeAAzk4i07eW29sbPB/kampqbyYGTzEyagC5wHKJG+v6lWgqamJdXV1wY2xhYUFtr1NBcwWnQqQYRJwUQK3gOeArjidTkakJMfHx6y+vp4tLi6KZ5/Px1ZWVkTf5M9tstcsP/SifFarlQcCAX50dKRm4/T0lPf19ann9vZ2Xl1dzZubm3lVVZVe2XPHxDS8k2Ra7fj4uCKSnUgkwnt7e+Uj393d5ZQUSSqV4sFgMJeo0DNxsx0tYtLR2x8eHorA4XCY19TUqECZCZAB1gDf398XtvTT0dGhbAvFh37Hip9LgKbYbDZWWVkpxtbW1tjBgdo1rKGhQegTiQQbHR1lbreb9fT0qDgtLS2qr9MR3AkYFMyW3pwkGo3yzs5OPjAwwFdXV4WOfra2tpSv3W5X+snJSaXXiU/chaeAHLu7u1VQrQ6VXhJgWyqT/v5+pZfjGu0OdEx3EZJTW1sbX1pa4pgGgZmZGT40NCTIMisgDy5MC3c4HEYSEItwlkjMQi7Cvb095etyufjc3ByfmJhQuiJxiVscREYdlN3w8DA/OTnhsVhM6YqQadndpAToKNZdiLmBvV4vTyaTYgo2Nze5xWLRCl5MR0exOv5NTcPY2Jiaf2zTYkSFxkX56RwgCQBUBUNSUVEh7OicoP3e2trKpqenGf1fGBTi8ufaPoGiULZZ+sbGRh6Px9WWk52RkZEsO514j3PJ6Zlure8BQ0E8Hg+fn58X2zIUCnG/38/r6uqM+L4Fx9/jFZ1cuQzFN8BIoFJsviJ20Xm6DqN4GZKIIqYbMCQOWL0GSnlLLR+6rVBMU0I75B4QAbSCGtF9h+99QO42dM0L3ZRp1Zr9OCWfrFu2FrW8lmuN5erOQuED7gLXAPl5TjHk5/kH9J8BdBc39Hn+BxqB1clokCTRAAAAAElFTkSuQmCC)](https://cloud.olakrutrim.com/console/inference-service?section=models&modelName=Krutrim&artifactName=chitrarth&artifactType=model)	[![Static 
Badge](https://img.shields.io/badge/Krutrim_AI_Labs-Chitrarth-blue?logo=data:image/svg%2bxml;base64,PHN2ZyB3aWR0aD0iMzYiIGhlaWdodD0iMzYiIHZpZXdCb3g9IjAgMCAzNiAzNiIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHJlY3Qgd2lkdGg9IjM2IiBoZWlnaHQ9IjM2IiByeD0iMTgiIGZpbGw9IiMxMEE1NTQiLz4KPHBhdGggZD0iTTI2LjQxNCAxMi41OTE5SDE5LjMzVjE1LjY0OTlDMjAuMDM0IDE1LjIzOTIgMjAuODQwNyAxNS4wMzM5IDIxLjc1IDE1LjAzMzlDMjIuNzkxMyAxNS4wMzM5IDIzLjY0MiAxNS4zNTY1IDI0LjMwMiAxNi4wMDE5QzI0Ljk3NjcgMTYuNjQ3MiAyNS4zMTQgMTcuNTQxOSAyNS4zMTQgMTguNjg1OUMyNS4zMTQgMTkuMzMxMiAyNS4xODkzIDIwLjA0OTkgMjQuOTQgMjAuODQxOUMyNC43MDUzIDIxLjYzMzkgMjQuMzE2NyAyMi40NDA1IDIzLjc3NCAyMy4yNjE5TDIxLjIgMjEuODMxOUMyMS41MzczIDIxLjM3NzIgMjEuODE2IDIwLjkwNzkgMjIuMDM2IDIwLjQyMzlDMjIuMjU2IDE5LjkzOTkgMjIuMzY2IDE5LjQ0MTIgMjIuMzY2IDE4LjkyNzlDMjIuMzY2IDE4LjM4NTIgMjIuMjQ4NyAxOC4wMDM5IDIyLjAxNCAxNy43ODM5QzIxLjc5NCAxNy41NjM5IDIxLjUwMDcgMTcuNDUzOSAyMS4xMzQgMTcuNDUzOUMyMC43OTY3IDE3LjQ1MzkgMjAuMTQ0IDE3Ljc2MTkgMjAuMTQ0IDE3Ljc2MTlDMjAuMTQ0IDE3Ljc2MTkgMTkuMTE0NyAxOC4xODcyIDE4Ljg4IDE4LjQyMTlWMjMuODU1OUgxNi4zODJWMjEuMDYxOUMxNS44OTggMjEuMzQwNSAxNS40MDY3IDIxLjU1MzIgMTQuOTA4IDIxLjY5OTlDMTQuNDI0IDIxLjg0NjUgMTMuODU5MyAyMS45MTk5IDEzLjIxNCAyMS45MTk5QzEyLjQwNzMgMjEuOTE5OSAxMS42NjY3IDIxLjc3MzIgMTAuOTkyIDIxLjQ3OTlDMTAuMzMyIDIxLjE3MTkgOS44MDQgMjAuNzI0NSA5LjQwOCAyMC4xMzc5QzkuMDEyIDE5LjU1MTIgOC44MTQgMTguODE3OSA4LjgxNCAxNy45Mzc5QzguODE0IDE3LjExNjUgOS4wMTIgMTYuNDEyNSA5LjQwOCAxNS44MjU5QzkuODA0IDE1LjIyNDUgMTAuMzU0IDE0Ljc2MjUgMTEuMDU4IDE0LjQzOTlDMTEuNzYyIDE0LjEwMjUgMTIuNTc2IDEzLjkzMzkgMTMuNSAxMy45MzM5QzEzLjkxMDcgMTMuOTMzOSAxNC4zMjEzIDEzLjk0ODUgMTQuNzMyIDEzLjk3NzlDMTUuMTU3MyAxNC4wMDcyIDE1LjQ4NzMgMTQuMDU4NSAxNS43MjIgMTQuMTMxOUwxNS41MDIgMTYuNTczOUMxNS4wMzI3IDE2LjQ1NjUgMTQuNTEyIDE2LjM5NzkgMTMuOTQgMTYuMzk3OUMxMy4yNTA3IDE2LjM5NzkgMTIuNzE1MyAxNi41MzcyIDEyLjMzNCAxNi44MTU5QzExLjk1MjcgMTcuMDc5OSAxMS43NjIgMTcuNDUzOSAxMS43NjIgMTcuOTM3OUMxMS43NjIgMTguNTI0NSAxMS45NDUzIDE4LjkyNzkgMTIuMzEyIDE5LjE0NzlDMTIuNjc4NyAxOS4zNjc5IDEzLjA3NDcgMTkuNDc3OSAxMy41IDE5LjQ3NzlDMTQuMTE2
IDE5LjQ3NzkgMTQuNjU4NyAxOS4zMzg1IDE1LjEyOCAxOS4wNTk5QzE1LjYxMiAxOC43ODEyIDE2LjAzIDE4LjQ1ODUgMTYuMzgyIDE4LjA5MTlWMTIuNTkxOUg4VjEwLjE3MTlIMjYuNDE0VjEyLjU5MTlaIiBmaWxsPSJ3aGl0ZSIvPgo8cGF0aCBkPSJNMjIuMDc0IDI4Ljk4MTlDMjEuNjkyNyAyOS4xNzI1IDIxLjIzOCAyOS4zNDg1IDIwLjcxIDI5LjUwOTlDMjAuMTY3MyAyOS42NzEyIDE5LjUyMiAyOS43NTE5IDE4Ljc3NCAyOS43NTE5QzE4LjA0MDcgMjkuNzUxOSAxNy4zODggMjkuNjEyNSAxNi44MTYgMjkuMzMzOUMxNi4yNDQgMjkuMDY5OSAxNS43OTY3IDI4LjY5NTkgMTUuNDc0IDI4LjIxMTlDMTUuMTM2NyAyNy43NDI1IDE0Ljk2OCAyNy4xOTI1IDE0Ljk2OCAyNi41NjE5QzE0Ljk2OCAyNS41MDU5IDE1LjM0MiAyNC42NjI1IDE2LjA5IDI0LjAzMTlDMTYuODIzMyAyMy40MTU5IDE3LjQyOTMgMjMuMDYzOSAxOC44MDggMjIuOTc1OUwxOS4wNzIgMjUuMjQxOUMxOC4zMjQgMjUuMjg1OSAxOC4yNjA3IDI1LjQyNTIgMTcuOTgyIDI1LjY1OTlDMTcuNzAzMyAyNS45MDkyIDE3LjU2NCAyNi4xOTUyIDE3LjU2NCAyNi41MTc5QzE3LjU2NCAyNy4xOTI1IDE4LjAxMTMgMjcuNTI5OSAxOC45MDYgMjcuNTI5OUMxOS4yNDMzIDI3LjUyOTkgMTkuNTg4IDI3LjQ3ODUgMTkuOTQgMjcuMzc1OUMyMC4yOTIgMjcuMjczMiAyMC43MTczIDI3LjA5NzIgMjEuMjE2IDI2Ljg0NzlMMjIuMDc0IDI4Ljk4MTlaIiBmaWxsPSJ3aGl0ZSIvPgo8L3N2Zz4K)](https://ai-labs.olakrutrim.com/models/Chitrarth-1)
+## 1. Introduction
+Chitrarth (Chitra: Image; Artha: Meaning) is a multilingual VLM that integrates a state-of-the-art multilingual Large Language Model (LLM) with a vision module. This model is trained primarily on multilingual image-text data and is designed to work across 10 prominent Indian languages, including Hindi, Bengali, Telugu, Tamil, Marathi, Gujarati, Kannada, Malayalam, Odia, and Assamese, as well as English.
+[![Chitrarth](https://img.youtube.com/vi/TmzEweLIgsc/0.jpg)](https://www.youtube.com/watch?v=TmzEweLIgsc)
+## 2. Model Summary
+### Key Features
+- **Model:** Krutrim-1 as the base LLM, SigLIP as the visual encoder with a 2-layer MLP
+- **Languages Supported:** 10 Indic languages - Hindi, Bengali, Telugu, Tamil, Marathi, Gujarati, Kannada, Malayalam, Odia, and Assamese, as well as English
+- **Usage:** General purpose VLM
+![model](assets/model.png)
+## 3. API Platform
+Visit [Chitrarth Online](https://cloud.olakrutrim.com/console/inference-service?section=models&modelName=Krutrim&artifactName=chitrarth&artifactType=model) to access the model via the web interface.
+## 4. Inference code
+```
+git clone https://github.com/ola-krutrim/Chitrarth.git
+conda create --name chitrarth python=3.10
+conda activate chitrarth
+cd Chitrarth
+pip install -e .
+python chitrarth/inference.py --model-path "krutrim-ai-labs/chitrarth" --image-file "assets/govt_school.jpeg" --query "Explain the image. "
+```
+## 5. Evaluation Results
+![model](assets/radar.png)
+Performance against SOTA VLMs on different academic multimodal tasks. Our model consistently outperforms IDEFICS 2 (7B) and PALO 7B on different benchmarks while remaining competitive on TextVQA and VizWiz.
+We introduce **BharatBench**, a comprehensive evaluation benchmark suite designed for **10 under-resourced Indic languages** across **3 tasks**. The performance of **Chitrarth** on the BharatBench Evaluation framework sets a strong baseline for future research in this domain. Our model is unique in its ability to handle all included languages.
+Below are the performance results of **Chitrarth** on BharatBench across three evaluation tasks: **POPE**, **LLaVA-Bench**, and **MMVet**.
+| **Language**   | **POPE** | **LLaVA-Bench** | **MMVet** |
+|----------------|----------|-----------------|-----------|
+| **Telugu**     | 79.9     | 54.8            | 43.76     |
+| **Hindi**      | 78.68    | 51.5            | 38.85     |
+| **Bengali**    | 83.24    | 53.7            | 33.24     |
+| **Malayalam**  | 85.29    | 55.5            | 25.36     |
+| **Kannada**    | 85.52    | 58.1            | 46.19     |
+| **Assamese**   | 55.59    | 59.1            | 37.29     |
+| **Tamil**      | 83.28    | 58.3            | 34.31     |
+| **Marathi**    | 79.17    | 52.8            | 40.96     |
+| **Gujarati**   | 84.75    | 55.9            | 39.03     |
+| **Odia**       | 82.03    | 62.8            | 19.67     |
+| **English**    | 87.63    | 67.9            | 30.49     |
+## 6. License
+This code repository and the model weights are licensed under the [Krutrim Community License](LICENSE.md).
+## 7. Citation
+```
+@inproceedings{
+  khan2024chitrarth,
+  title={Chitrarth: Bridging Vision and Language for a Billion People},
+  author={Shaharukh Khan and Ayush Tarun and Abhinav Ravi and Ali Faraz and Praveen Kumar Pokala and Anagha Bhangare and Raja Kolla and Chandra Khatri and Shubham Agarwal},
+  booktitle={NeurIPS Multimodal Algorithmic Reasoning},
+  year={2024},
+}
+```
+## 8. Contact
+Contributions are welcome! If you have any improvements or suggestions, feel free to submit a pull request on GitHub.

attention.py ADDED Viewed

	@@ -0,0 +1,387 @@

+"""Attention layers."""
+import math
+import warnings
+from typing import Any, Optional
+import torch
+import torch.nn as nn
+import transformers
+from einops import rearrange
+from packaging import version
+from torch import nn
+from .fc import FC_CLASS_REGISTRY
+from .norm import NORM_CLASS_REGISTRY
+def is_flash_v2_installed(v2_version: str='2.0.0'):
+    assert version.parse(v2_version) >= version.parse('2.0.0')
+    try:
+        import flash_attn as flash_attn
+    except:
+        return False
+    return version.parse(flash_attn.__version__) >= version.parse(v2_version)
+def is_flash_v1_installed():
+    try:
+        import flash_attn as flash_attn
+    except:
+        return False
+    return version.parse(flash_attn.__version__) < version.parse('2.0.0')
+def is_transformers_version_gte(hf_version: str) -> bool:
+    return version.parse(transformers.__version__) >= version.parse(hf_version)
+def check_alibi_support(attention_impl: str) -> bool:
+    return attention_impl != 'flash' or is_flash_v2_installed(v2_version='v2.4.2')
+# Import-time side effect: when a flash-attn v1 release is detected, replace
+# transformers' availability probe with one that always reports False.
+# NOTE(review): presumably this keeps transformers from taking its own
+# flash-attention code path with a v1 install — confirm against transformers.
+if is_flash_v1_installed():
+    import transformers
+    transformers.utils.is_flash_attn_available = lambda: False
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool) -> bool:
+    if original_is_causal and num_query_tokens != num_key_tokens:
+        if num_query_tokens != 1:
+            raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.')
+        else:
+            return False
+    return original_is_causal
+def repeat_kv_for_gqa(hidden: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """Perform repeat of kv heads along a particular dimension.
+    hidden.shape expected to be: (batch size, seq len, kv_n_heads, head_dim)
+    n_rep: amount of repetitions of kv_n_heads
+    Unlike torch.repeat_interleave, this function avoids allocating new memory.
+    """
+    if n_rep == 1:
+        return hidden
+    b, s, kv_n_heads, d = hidden.shape
+    hidden = hidden[:, :, :, None, :].expand(b, s, kv_n_heads, n_rep, d)
+    return hidden.reshape(b, s, kv_n_heads * n_rep, d)
+def scaled_multihead_dot_product_attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n_heads: int, kv_n_heads: int, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, softmax_scale: Optional[float]=None, attn_bias: Optional[torch.Tensor]=None, key_padding_mask: Optional[torch.Tensor]=None, is_causal: bool=False, dropout_p: float=0.0, training: bool=False, needs_weights: bool=False) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
+    """Softmax attention computed with explicit matmuls.
+
+    Args:
+        query: (b, s, n_heads * d) tensor, split into heads below.
+        key: (b, s, kv_n_heads * d) tensor.
+        value: (b, s, kv_n_heads * d) tensor.
+        n_heads: number of query heads.
+        kv_n_heads: number of key/value heads.
+        past_key_value: ``None`` to disable caching; otherwise a (possibly
+            empty) pair of cached key/value tensors already in the per-head
+            layouts used below.
+        softmax_scale: multiplier for the raw scores; defaults to
+            ``1 / sqrt(d)``.
+        attn_bias: additive bias; it is cropped from the front of its last two
+            axes and must then broadcast against the score tensor.
+        key_padding_mask: boolean mask viewed as (b, 1, 1, s_k); positions
+            where it is False are filled with the dtype minimum.
+        is_causal: apply a lower-triangular mask (skipped for a single query
+            token).
+        dropout_p: dropout probability applied to the attention weights.
+        training: forwarded to dropout.
+        needs_weights: whether to return the attention weights.
+
+    Returns:
+        ``(out, attn_weight or None, past_key_value)`` where ``out`` is
+        (b, s, n_heads * d) and ``past_key_value`` is the updated ``(k, v)``
+        pair, or ``None`` when no cache was passed in.
+
+    Raises:
+        RuntimeError: if the cropped ``attn_bias`` cannot broadcast to the
+            score tensor.
+    """
+    # Split the fused hidden axis into heads. Keys are laid out (b, h, d, s)
+    # so that ``q.matmul(k)`` needs no extra transpose.
+    q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
+    k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads)
+    v = rearrange(value, 'b s (h d) -> b h s d', h=kv_n_heads)
+    if past_key_value is not None:
+        # Append along the sequence axis of each layout: last axis for k,
+        # axis 2 for v. The cache is stored before any GQA head repetition.
+        if len(past_key_value) != 0:
+            k = torch.cat([past_key_value[0], k], dim=3)
+            v = torch.cat([past_key_value[1], v], dim=2)
+        past_key_value = (k, v)
+    b, _, s_q, d = q.shape
+    s_k = k.size(-1)
+    # Grouped-query case only; with a single kv head the head axis is left at
+    # size 1 (presumably relying on matmul broadcasting — confirm).
+    if kv_n_heads > 1 and kv_n_heads < n_heads:
+        # repeat_kv_for_gqa repeats along axis 2, so move the head axis there
+        # and back again.
+        k = repeat_kv_for_gqa(k.transpose(1, 2), n_heads // kv_n_heads).transpose(1, 2)
+        v = repeat_kv_for_gqa(v.transpose(1, 2), n_heads // kv_n_heads).transpose(1, 2)
+    if softmax_scale is None:
+        softmax_scale = 1 / math.sqrt(d)
+    attn_weight = q.matmul(k) * softmax_scale
+    if attn_bias is not None:
+        # Keep only the trailing s_q rows / s_k columns of an oversized bias.
+        _s_q = max(0, attn_bias.size(2) - s_q)
+        _s_k = max(0, attn_bias.size(3) - s_k)
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
+        if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
+            raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
+        attn_weight = attn_weight + attn_bias
+    # Masked positions get the most negative representable value of q's dtype.
+    min_val = torch.finfo(q.dtype).min
+    if key_padding_mask is not None:
+        if attn_bias is not None:
+            warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
+        attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
+    if is_causal and (not q.size(2) == 1):
+        # Build a square lower-triangular mask, invert it so True marks the
+        # positions to hide, then crop to the last s_q rows and s_k columns.
+        s = max(s_q, s_k)
+        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float32)
+        causal_mask = causal_mask.tril()
+        causal_mask = causal_mask.to(torch.bool)
+        causal_mask = ~causal_mask
+        causal_mask = causal_mask[-s_q:, -s_k:]
+        attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    if dropout_p:
+        # NOTE(review): dropout runs in place on the softmax output — confirm
+        # this is compatible with autograd when training with dropout_p > 0.
+        attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
+    out = attn_weight.to(v.dtype).matmul(v)
+    # Merge the heads back into a single hidden axis.
+    out = rearrange(out, 'b h s d -> b s (h d)')
+    if needs_weights:
+        return (out, attn_weight, past_key_value)
+    return (out, None, past_key_value)
+def check_valid_inputs(*tensors: torch.Tensor, valid_dtypes: Optional[list[torch.dtype]]=None):
+    if valid_dtypes is None:
+        valid_dtypes = [torch.float16, torch.bfloat16]
+    for tensor in tensors:
+        if tensor.dtype not in valid_dtypes:
+            raise TypeError(f'tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}.')
+        if not tensor.is_cuda:
+            raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')
+def flash_attn_fn(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n_heads: int, kv_n_heads: int, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, softmax_scale: Optional[float]=None, attn_bias: Optional[torch.Tensor]=None, key_padding_mask: Optional[torch.Tensor]=None, is_causal: bool=False, dropout_p: float=0.0, training: bool=False, needs_weights: bool=False, multiquery: bool=False, should_repeat_kv_for_gqa: Optional[bool]=True, sliding_window_size: int=-1, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
+    if key_padding_mask is not None:
+        raise ValueError('key_padding_mask should be None for flash attn.')
+    del key_padding_mask
+    if flash_attn_padding_info is None:
+        raise ValueError('flash_attn_padding_info is required for flash attn.')
+    try:
+        from flash_attn import bert_padding, flash_attn_interface
+    except:
+        raise RuntimeError('Please install flash-attn==1.0.9 or flash-attn==2.3.6')
+    check_valid_inputs(query, key, value)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            key = torch.cat([past_key_value[0], key], dim=1)
+            value = torch.cat([past_key_value[1], value], dim=1)
+        past_key_value = (key, value)
+    if attn_bias is not None:
+        raise NotImplementedError(f'attn_bias not implemented for flash attn.')
+    batch_size, seqlen = query.shape[:2]
+    indices_q = flash_attn_padding_info['indices_q']
+    indices_k = flash_attn_padding_info['indices_k']
+    indices_v = flash_attn_padding_info['indices_v']
+    cu_seqlens_q = flash_attn_padding_info['cu_seqlens_q']
+    cu_seqlens_k = flash_attn_padding_info['cu_seqlens_k']
+    max_seqlen_q = flash_attn_padding_info['max_seqlen_q']
+    max_seqlen_k = flash_attn_padding_info['max_seqlen_k']
+    query_unpad = bert_padding.index_first_axis(rearrange(query, 'b s ... -> (b s) ...'), indices_q)
+    query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
+    key_unpad = bert_padding.index_first_axis(rearrange(key, 'b s ... -> (b s) ...'), indices_k)
+    key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)
+    value_unpad = bert_padding.index_first_axis(rearrange(value, 'b s ... -> (b s) ...'), indices_v)
+    value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)
+    if kv_n_heads < n_heads and (not is_flash_v2_installed()) and (not should_repeat_kv_for_gqa):
+        raise ValueError('For Grouped Query Attention or Multi Query Attention, should_repeat_kv_for_gqa should be set to True if not using Flash Attention v2.')
+    if should_repeat_kv_for_gqa:
+        if kv_n_heads == 1:
+            key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
+            value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))
+        elif kv_n_heads < n_heads:
+            key_unpad = repeat_kv_for_gqa(key_unpad.view(1, key_unpad.size(0), kv_n_heads, -1), n_heads // kv_n_heads).view(key_unpad.size(0), n_heads, -1)
+            value_unpad = repeat_kv_for_gqa(value_unpad.view(1, value_unpad.size(0), kv_n_heads, -1), n_heads // kv_n_heads).view(value_unpad.size(0), n_heads, -1)
+    dropout_p = dropout_p if training else 0.0
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    if is_flash_v1_installed():
+        output_unpad = flash_attn_interface.flash_attn_unpadded_func(q=query_unpad, k=key_unpad, v=value_unpad, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_q, max_seqlen_k=max_seqlen_k, dropout_p=dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
+    elif is_flash_v2_installed():
+        alibi_kwargs = {}
+        if check_alibi_support('flash'):
+            alibi_kwargs = {'alibi_slopes': alibi_slopes}
+        elif alibi_slopes is not None:
+            raise ValueError('alibi_slopes is only supported for flash-attn>=2.4.2')
+        output_unpad = flash_attn_interface.flash_attn_varlen_func(q=query_unpad, k=key_unpad, v=value_unpad, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_q, max_seqlen_k=max_seqlen_k, dropout_p=dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights, window_size=(sliding_window_size, sliding_window_size), **alibi_kwargs)
+    else:
+        raise RuntimeError('flash-attn==1.0.9 or flash-attn==2.4.2 is required.')
+    output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
+    return (output, None, past_key_value)
+def triton_flash_attn_fn(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n_heads: int, kv_n_heads: int, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, softmax_scale: Optional[float]=None, attn_bias: Optional[torch.Tensor]=None, key_padding_mask: Optional[torch.Tensor]=None, is_causal: bool=False, dropout_p: float=0.0, training: bool=False, needs_weights: bool=False) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
+    try:
+        from .flash_attn_triton import flash_attn_func
+    except:
+        _installed = False
+        if version.parse(torch.__version__) < version.parse('2.0.0'):
+            _installed = True
+            try:
+                from flash_attn.flash_attn_triton import flash_attn_func
+            except:
+                _installed = False
+        if not _installed:
+            raise RuntimeError('Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU ' + 'and `pip install .[gpu]` if installing from llm-foundry source or ' + '`pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` ' + 'if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). ' + 'Note: (1) requires you have CMake and PyTorch already installed.')
+    check_valid_inputs(query, key, value)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            key = torch.cat([past_key_value[0], key], dim=1)
+            value = torch.cat([past_key_value[1], value], dim=1)
+        past_key_value = (key, value)
+    if attn_bias is not None:
+        _s_q = max(0, attn_bias.size(2) - query.size(1))
+        _s_k = max(0, attn_bias.size(3) - key.size(1))
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
+    if dropout_p:
+        raise NotImplementedError(f'Dropout not implemented for attn_impl: triton.')
+    dropout_p = dropout_p if training else 0.0
+    if needs_weights:
+        raise NotImplementedError(f'attn_impl: triton cannot return attn weights.')
+    if key_padding_mask is not None:
+        warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
+        b_size, s_k = key_padding_mask.shape[:2]
+        if attn_bias is None:
+            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
+        attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
+    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
+    key = rearrange(key, 'b s (h d) -> b s h d', h=kv_n_heads)
+    value = rearrange(value, 'b s (h d) -> b s h d', h=kv_n_heads)
+    if kv_n_heads == 1:
+        key = key.repeat(1, 1, n_heads, 1)
+        value = value.repeat(1, 1, n_heads, 1)
+    elif kv_n_heads < n_heads:
+        key = repeat_kv_for_gqa(key, n_heads // kv_n_heads)
+        value = repeat_kv_for_gqa(value, n_heads // kv_n_heads)
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
+    output = attn_output.view(*attn_output.shape[:2], -1)
+    return (output, None, past_key_value)
+class GroupedQueryAttention(nn.Module):
+    """Grouped Query Attention (GQA), a generalization of Multi-head (MHA)
+    and Multi-query attention (MQA).
+    This allows the user to set a variable number of kv_n_heads, rather than
+    just n_heads or 1, as in MHA and MQA. Using torch or triton attention
+    implementation enables user to also use additive bias.
+    """
+    def __init__(self, d_model: int, n_heads: int, kv_n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
+        """Build the fused QKV projection, optional Q/K norms and output projection.
+        Raises:
+            ValueError: If ``kv_n_heads`` is not a positive divisor of ``n_heads``, if both ``qk_ln`` and ``qk_gn`` are set, or if ``attn_impl`` is not one of 'flash', 'triton' or 'torch'.
+        """
+        super().__init__()
+        self.attn_impl = attn_impl
+        self.clip_qkv = clip_qkv
+        self.qk_ln = qk_ln
+        self.qk_gn = qk_gn
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.kv_n_heads = kv_n_heads
+        self.sliding_window_size = sliding_window_size
+        self.head_dim = d_model // n_heads
+        if self.kv_n_heads <= 0:
+            raise ValueError('kv_n_heads should be greater than zero.')
+        if self.kv_n_heads > self.n_heads:
+            raise ValueError('The number of KV heads should be less than or equal to Q heads.')
+        if self.n_heads % self.kv_n_heads != 0:
+            raise ValueError('Each Q head should get the same number of KV heads, so n_heads must be divisible by kv_n_heads.')
+        if qk_ln and qk_gn:
+            raise ValueError('Only one of qk_ln and qk_gn can be set to True.')
+        self.softmax_scale = softmax_scale
+        if self.softmax_scale is None:
+            # Default scale: 1 / sqrt(per-head dimension).
+            self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
+        self.attn_dropout_p = attn_pdrop
+        fc_kwargs: dict[str, Any] = {'bias': bias}
+        fc_kwargs['device'] = device
+        # One fused projection: d_model outputs for Q plus kv_n_heads * head_dim
+        # outputs each for K and V.
+        self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model + 2 * self.kv_n_heads * self.head_dim, **fc_kwargs)
+        # Record every head boundary of the fused output on the module.
+        # NOTE(review): the consumer of `_fused` is not visible here; presumably parameter init.
+        fuse_splits = [i * self.head_dim for i in range(1, self.n_heads + 2 * self.kv_n_heads)]
+        self.Wqkv._fused = (0, fuse_splits)
+        if self.qk_ln or self.qk_gn:
+            norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
+            # qk_gn normalizes each head separately (size head_dim); qk_ln
+            # normalizes the whole Q (d_model) and K (kv_n_heads * head_dim) vectors.
+            norm_size = self.head_dim if qk_gn else d_model
+            self.q_ln = norm_class(norm_size, device=device)
+            if qk_ln:
+                norm_size = self.head_dim * kv_n_heads
+            self.k_ln = norm_class(norm_size, device=device)
+        # Select the attention kernel once, at construction time.
+        if self.attn_impl == 'flash':
+            self.attn_fn = flash_attn_fn
+        elif self.attn_impl == 'triton':
+            self.attn_fn = triton_flash_attn_fn
+        elif self.attn_impl == 'torch':
+            self.attn_fn = scaled_multihead_dot_product_attention
+        else:
+            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
+        self.out_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model, **fc_kwargs)
+        # Marks the output projection as feeding a residual connection.
+        self.out_proj._is_residual = True
+    def forward(self, x: torch.Tensor, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, rotary_emb_w_meta_info: Optional[dict]=None, is_causal: bool=True, needs_weights: bool=False, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
+        """Project ``x`` to Q/K/V, apply optional clipping, norms and rotary embeddings, then attend.
+        Args:
+            x: Input hidden states; projected by ``Wqkv`` and split along dim 2.
+            past_key_value: Optional cached (key, value) pair forwarded to the attention function.
+            attn_bias: Optional additive attention bias forwarded to the attention function.
+            attention_mask: Used as the key padding mask (dropped for the flash implementation).
+            rotary_emb_w_meta_info: Optional dict with keys 'rotary_emb', 'seq_len', 'offset_info' and 'impl' ('dail' or 'hf').
+            is_causal: Forwarded to the attention function.
+            needs_weights: Forwarded to the attention function.
+            alibi_slopes: Passed to the attention function only for the flash implementation.
+            flash_attn_padding_info: Passed to the attention function only for the flash implementation.
+        Returns:
+            ``(out_proj(context), attn_weights, past_key_value)`` as produced by the selected attention function.
+        """
+        qkv = self.Wqkv(x)
+        if self.clip_qkv:
+            qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
+        query, key, value = qkv.split([self.d_model, self.kv_n_heads * self.head_dim, self.kv_n_heads * self.head_dim], dim=2)
+        key_padding_mask = attention_mask
+        if self.qk_ln or self.qk_gn:
+            q_shape, k_shape = (query.shape, key.shape)
+            if self.qk_gn:
+                # Expose the head axis so the norm acts on each head separately.
+                b, s = query.shape[:2]
+                query = query.view(b, s, self.n_heads, -1)
+                key = key.view(b, s, self.kv_n_heads, -1)
+            dtype = query.dtype
+            # Cast back to the input dtype and restore the flat layout.
+            query = self.q_ln(query).to(dtype).view(q_shape)
+            key = self.k_ln(key).to(dtype).view(k_shape)
+        if rotary_emb_w_meta_info is not None:
+            rotary_emb = rotary_emb_w_meta_info['rotary_emb']
+            seq_len = rotary_emb_w_meta_info['seq_len']
+            offset_info = rotary_emb_w_meta_info['offset_info']
+            bsz, seqlen = query.shape[:2]
+            # Rotary embeddings are applied on a per-head layout.
+            query = query.view(bsz, seqlen, -1, self.head_dim)
+            key = key.view(bsz, seqlen, -1, self.head_dim)
+            if rotary_emb_w_meta_info['impl'] == 'dail':
+                # The dail implementation takes K and V stacked together; V is
+                # unstacked and flattened again afterwards.
+                value = value.view(bsz, seqlen, -1, self.head_dim)
+                kv = torch.stack([key, value], dim=2)
+                query, kv = rotary_emb(query, kv, seqlen_offset=offset_info, max_seqlen=seq_len)
+                [key, value] = torch.unbind(kv, dim=2)
+                value = value.view(bsz, seqlen, self.kv_n_heads * self.head_dim)
+            elif rotary_emb_w_meta_info['impl'] == 'hf':
+                cos, sin = rotary_emb(value, seq_len)
+                if is_transformers_version_gte('4.36'):
+                    query, key = apply_rotary_pos_emb(query, key, cos, sin, offset_info, unsqueeze_dim=2)
+                else:
+                    # Older transformers: swap dims 1 and 2 around the call
+                    # instead of passing unsqueeze_dim.
+                    query = query.transpose(1, 2)
+                    key = key.transpose(1, 2)
+                    query, key = apply_rotary_pos_emb(query, key, cos, sin, offset_info)
+                    query = query.transpose(1, 2)
+                    key = key.transpose(1, 2)
+            # Flatten the head axis back into the feature axis.
+            query = query.view(bsz, seqlen, self.d_model)
+            key = key.view(bsz, seqlen, self.kv_n_heads * self.head_dim)
+        extra_attn_kwargs = {}
+        if self.attn_impl == 'flash':
+            # The flash path receives padding via flash_attn_padding_info
+            # rather than a key padding mask.
+            key_padding_mask = None
+            extra_attn_kwargs = {'should_repeat_kv_for_gqa': not is_flash_v2_installed(), 'sliding_window_size': self.sliding_window_size, 'alibi_slopes': alibi_slopes, 'flash_attn_padding_info': flash_attn_padding_info}
+        context, attn_weights, past_key_value = self.attn_fn(query, key, value, self.n_heads, self.kv_n_heads, past_key_value=past_key_value, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, **extra_attn_kwargs)
+        return (self.out_proj(context), attn_weights, past_key_value)
+class MultiheadAttention(GroupedQueryAttention):
+    """Multi-head self attention.
+    Using torch or triton attention implementation enables user to also use
+    additive bias.
+    """
+    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
+        super().__init__(d_model=d_model, n_heads=n_heads, kv_n_heads=n_heads, attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, fc_type=fc_type, device=device, bias=bias, sliding_window_size=sliding_window_size)
+class MultiQueryAttention(GroupedQueryAttention):
+    """Multi-Query self attention.
+    Using torch or triton attention implementation enables user to also use
+    additive bias.
+    """
+    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
+        super().__init__(d_model=d_model, n_heads=n_heads, kv_n_heads=1, attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, fc_type=fc_type, device=device, bias=bias, sliding_window_size=sliding_window_size)
+def attn_bias_shape(attn_impl: str, n_heads: int, seq_len: int, alibi: bool, prefix_lm: bool, causal: bool, use_sequence_id: bool) -> Optional[tuple[int, int, int, int]]:
+    if attn_impl == 'flash':
+        return None
+    elif attn_impl in ['torch', 'triton']:
+        if alibi:
+            if (prefix_lm or not causal) or use_sequence_id:
+                return (1, n_heads, seq_len, seq_len)
+            return (1, n_heads, 1, seq_len)
+        elif prefix_lm or use_sequence_id:
+            return (1, 1, seq_len, seq_len)
+        return None
+    else:
+        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
+def build_attn_bias(attn_impl: str, attn_bias: torch.Tensor, n_heads: int, seq_len: int, causal: bool=False, alibi: bool=False, alibi_bias_max: int=8) -> Optional[torch.Tensor]:
+    if attn_impl == 'flash':
+        return None
+    elif attn_impl in ['torch', 'triton']:
+        if alibi:
+            device, dtype = (attn_bias.device, attn_bias.dtype)
+            attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype))
+        return attn_bias
+    else:
+        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
+def gen_slopes(n_heads: int, alibi_bias_max: int=8, device: Optional[torch.device]=None, return_1d: bool=False) -> torch.Tensor:
+    _n_heads = 2 ** math.ceil(math.log2(n_heads))
+    m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
+    m = m.mul(alibi_bias_max / _n_heads)
+    slopes = 1.0 / torch.pow(2, m)
+    if _n_heads != n_heads:
+        slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]
+    if return_1d:
+        return slopes
+    return slopes.view(1, n_heads, 1, 1)
+def build_alibi_bias(n_heads: int, seq_len: int, full: bool=False, alibi_bias_max: int=8, device: Optional[torch.device]=None, dtype: Optional[torch.dtype]=None) -> torch.Tensor:
+    alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, 1, seq_len)
+    if full:
+        alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, seq_len, 1)
+        alibi_bias = alibi_bias.abs().mul(-1)
+    slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
+    alibi_bias = alibi_bias * slopes
+    return alibi_bias.to(dtype=dtype)
+ATTN_CLASS_REGISTRY = {'multihead_attention': MultiheadAttention, 'multiquery_attention': MultiQueryAttention, 'grouped_query_attention': GroupedQueryAttention}

blocks.py ADDED Viewed

	@@ -0,0 +1,55 @@

+"""GPT Blocks used for the GPT Model."""
+from typing import Any, Dict, Optional, Tuple
+import torch
+import torch.nn as nn
+from .attention import ATTN_CLASS_REGISTRY
+from .ffn import FFN_CLASS_REGISTRY, build_ffn
+from .norm import NORM_CLASS_REGISTRY
+try:
+    from flash_attn.bert_padding import unpad_input, pad_input
+except:
+    unpad_input, pad_input = (None, None)
+attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'qk_gn': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'sliding_window_size': -1, 'alibi': False, 'alibi_bias_max': 8, 'rope': False, 'rope_theta': 10000, 'rope_impl': 'dail', 'rope_dail_config': {'type': 'original', 'pos_idx_in_fp32': True, 'xpos_scale_base': 512}, 'rope_hf_config': {'type': 'no_scaling', 'factor': 1.0}}
+class MPTBlock(nn.Module):
+    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Optional[Dict]=None, ffn_config: Optional[Dict]=None, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, no_bias: bool=False, use_pad_tok_in_ffn: bool=True, **kwargs: Any):
+        if attn_config is None:
+            attn_config = attn_config_defaults
+        if ffn_config is None:
+            ffn_config = {'ffn_type': 'mptmlp'}
+        del kwargs
+        super().__init__()
+        norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
+        assert isinstance(attn_config['attn_type'], str)
+        attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
+        args_to_exclude_in_attn_class = {'attn_type', 'prefix_lm', 'alibi', 'attn_uses_sequence_id', 'alibi_bias_max', 'rope', 'rope_theta', 'rope_impl', 'rope_dail_config', 'rope_hf_config'}
+        attn_config_subset_for_attn_class = {k: v for k, v in attn_config.items() if k not in args_to_exclude_in_attn_class}
+        self.norm_1 = norm_class(d_model, device=device)
+        self.attn = attn_class(d_model=d_model, n_heads=n_heads, fc_type=fc_type, device=device, **attn_config_subset_for_attn_class, bias=not no_bias)
+        self.norm_2 = None
+        if not getattr(FFN_CLASS_REGISTRY[ffn_config['ffn_type']], '_has_norm', False):
+            self.norm_2 = norm_class(d_model, device=device)
+        self.ffn = build_ffn(d_model=d_model, expansion_ratio=expansion_ratio, device=device, bias=not no_bias, **ffn_config)
+        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
+        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
+        self.use_pad_tok_in_ffn = use_pad_tok_in_ffn
+    def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, rotary_emb_w_meta_info: Optional[Dict]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True, output_attentions: bool=False, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        a = self.norm_1(x)
+        b, attn_weights, past_key_value = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
+        x = x + self.resid_attn_dropout(b)
+        m = x
+        if self.norm_2 is not None:
+            m = self.norm_2(x)
+        batch_size, seq_len = m.size()[:2]
+        indices = None
+        if not self.use_pad_tok_in_ffn:
+            assert unpad_input is not None
+            m, indices, _, _ = unpad_input(m, attention_mask)
+        n = self.ffn(m)
+        if not self.use_pad_tok_in_ffn:
+            assert pad_input is not None
+            n = pad_input(n, indices, batch_size, seq_len)
+        x = x + self.resid_ffn_dropout(n)
+        return (x, attn_weights, past_key_value)

config.json ADDED Viewed

	@@ -0,0 +1,84 @@

+{
+  "_name_or_path": "/home/user/shahrukh/models/responder_v2_mpt",
+  "architectures": [
+    "LlavaMPTForCausalLM"
+  ],
+  "attn_config": {
+    "alibi": true,
+    "alibi_bias_max": 8,
+    "attn_impl": "flash",
+    "attn_pdrop": 0.0,
+    "attn_type": "grouped_query_attention",
+    "attn_uses_sequence_id": false,
+    "clip_qkv": 6,
+    "kv_n_heads": 8,
+    "prefix_lm": false,
+    "qk_gn": false,
+    "qk_ln": false,
+    "rope": false,
+    "rope_dail_config": {
+      "pos_idx_in_fp32": true,
+      "type": "original",
+      "xpos_scale_base": 512
+    },
+    "rope_hf_config": {
+      "factor": 1.0,
+      "type": "no_scaling"
+    },
+    "rope_impl": "dail",
+    "rope_theta": 10000,
+    "sliding_window_size": -1,
+    "softmax_scale": null
+  },
+  "auto_map": {
+    "AutoConfig": "configuration_mpt.MPTConfig",
+    "AutoModelForCausalLM": "modeling_mpt.MPTForCausalLM"
+  },
+  "d_model": 4608,
+  "emb_pdrop": 0.0,
+  "embedding_fraction": 1.0,
+  "expansion_ratio": 4,
+  "fc_type": "torch",
+  "ffn_config": {
+    "fc_type": "torch",
+    "ffn_type": "mptmlp"
+  },
+  "freeze_mm_mlp_adapter": false,
+  "hidden_size": 4608,
+  "image_aspect_ratio": "pad",
+  "image_grid_pinpoints": null,
+  "init_config": {
+    "emb_init_std": null,
+    "emb_init_uniform_lim": null,
+    "fan_mode": "fan_in",
+    "init_div_is_residual": true,
+    "init_gain": 0.0,
+    "init_nonlinearity": "relu",
+    "init_std": null,
+    "name": "kaiming_normal_"
+  },
+  "init_device": "cpu",
+  "learned_pos_emb": false,
+  "logit_scale": null,
+  "max_seq_len": 4096,
+  "mm_hidden_size": 1152,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_use_im_patch_token": false,
+  "mm_use_im_start_end": false,
+  "mm_vision_select_feature": "patch",
+  "mm_vision_select_layer": -2,
+  "mm_vision_tower": "google/siglip-so400m-patch14-384",
+  "model_type": "mpt",
+  "n_heads": 48,
+  "n_layers": 32,
+  "no_bias": true,
+  "norm_type": "low_precision_layernorm",
+  "resid_pdrop": 0.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.37.0",
+  "tune_mm_mlp_adapter": false,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_pad_tok_in_ffn": true,
+  "vocab_size": 70400
+}

configuration_mpt.py ADDED Viewed

	@@ -0,0 +1,183 @@

+"""A HuggingFace-style model configuration."""
+import warnings
+from typing import Any, Dict, Optional, Union
+from transformers import PretrainedConfig
+from .attention import check_alibi_support, is_flash_v1_installed, is_flash_v2_installed
+from .blocks import attn_config_defaults
+from .fc import FC_CLASS_REGISTRY
+from .norm import LPLayerNorm
+from .ffn import FFN_CLASS_REGISTRY
+from .warnings import VersionedDeprecationWarning
+ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
+init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
+class MPTConfig(PretrainedConfig):
+    model_type = 'mpt'
+    def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: Union[int, float]=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, ffn_config: Dict=ffn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, fc_type: str='torch', tie_word_embeddings: bool=True, use_pad_tok_in_ffn: bool=True, **kwargs: Any):
+        """The MPT configuration class.
+        Args:
+            d_model (int): The size of the embedding dimension of the model.
+            n_heads (int): The number of attention heads.
+            n_layers (int): The number of layers in the model.
+            expansion_ratio (Union[int, float]): The ratio of the up/down scale in the ffn.
+            max_seq_len (int): The maximum sequence length of the model.
+            vocab_size (int): The size of the vocabulary.
+            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
+            emb_pdrop (float): The dropout probability for the embedding layer.
+            learned_pos_emb (bool): Whether to use learned positional embeddings
+            attn_config (Dict): A dictionary used to configure the model's attention module:
+                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
+                attn_pdrop (float): The dropout probability for the attention layers.
+                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
+                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
+                qk_gn (bool): Whether to apply group normalization to the queries and keys in the attention layer.
+                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
+                    this value.
+                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
+                    use the default scale of ``1/sqrt(d_keys)``.
+                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
+                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
+                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
+                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
+                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
+                    which sub-sequence each token belongs to.
+                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
+                sliding_window_size (int): Window size for sliding window local attention. Defaults to -1, which means no sliding window. Query at position i will only attend to keys between [i + seqlen_k - seqlen_q - window_size, i + seqlen_k - seqlen_q + window_size] inclusive. Only works for flash attention v2.3.0 or higher.
+                alibi (bool): Whether to use the alibi bias instead of position embeddings.
+                alibi_bias_max (int): The maximum value of the alibi bias.
+                rope (bool): Whether to use rotary positional embeddings.
+                rope_theta (int): The base frequency for rope.
+                rope_impl (str): The implementation of rope to use. One of 'hf' (to use the implementation from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py) or 'dail' (to use the implementation from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/layers/rotary.py).
+                rope_dail_config (Dict): The configuration for the dail implementation of rope.
+                    type (str): The type of rotary position embedding to use. Options: 'original' (for https://arxiv.org/pdf/2104.09864.pdf), 'xpos' (for https://arxiv.org/pdf/2212.10554.pdf).
+                    pos_idx_in_fp32 (bool): If True, the position indices [0, ..., seqlen - 1] are in fp32, otherwise they might be in lower precision. A consequence could be, for example, that bf16 rounds position 1995 to 2000, which leads to them having the same positional embedding.
+                    xpos_scale_base (float): The scale base for XPos (if using XPos).
+                rope_hf_config (Dict): A dictionary used to configure rope's scaling behavior (when scaling beyond the training length).
+                    type (str): Can be one of 'no_scaling', 'linear', or 'dynamic'. 'no_scaling' uses the default implementation for rotary embeddings, 'linear' uses linear scaling as proposed by the Reddit user /u/kaiokendev, and 'dynamic' uses Dynamic NTK scaling as proposed by the Reddit users /u/bloc97 and /u/emozilla.
+                    factor (float): Scaling factor to use if using 'linear' or 'dynamic' as rope_scaling.type.
+                kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
+            ffn_config (Dict): A dictionary used to configure the model's ffn module:
+                ffn_type (str): type of ffn to use. Options: mptmlp, mptglu, te_ln_mlp
+            init_device (str): The device to use for parameter initialization.
+            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
+            no_bias (bool): Whether to use bias in all layers.
+            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
+            norm_type (str): choose type of norm to use
+            use_cache (bool): Whether or not the model should return the last key/values attentions
+            init_config (Dict): A dictionary used to configure the model initialization:
+                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
+                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
+                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
+                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
+                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
+                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
+                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
+                init_std (float): The standard deviation of the normal distribution used to initialize the model,
+                    if using the baseline_ parameter initialization scheme.
+                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
+                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
+                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
+                ---
+                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
+            fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
+            tie_word_embeddings (bool): Whether to tie the input embedding and output layers.
+            use_pad_tok_in_ffn (bool): Whether to forward the pad token in the feedforward networks.
+        """
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.expansion_ratio = expansion_ratio
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.learned_pos_emb = learned_pos_emb
+        self.attn_config = attn_config
+        self.ffn_config = ffn_config
+        self.init_device = init_device
+        self.logit_scale = logit_scale
+        self.no_bias = no_bias
+        self.embedding_fraction = embedding_fraction
+        self.norm_type = norm_type
+        self.use_cache = use_cache
+        self.init_config = init_config
+        self.fc_type = fc_type
+        self.use_pad_tok_in_ffn = use_pad_tok_in_ffn
+        if 'name' in kwargs:
+            del kwargs['name']
+        if 'loss_fn' in kwargs:
+            del kwargs['loss_fn']
+        if self.attn_config.get('alibi', False) or self.attn_config.get('rope', False):
+            self.learned_pos_emb = False
+            warnings.warn(f'alibi or rope is turned on, setting `learned_pos_emb` to `False.`')
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        self._validate_config()
+    def _set_config_defaults(self, config: Dict[str, Any], config_defaults: Dict[str, Any]) -> Dict[str, Any]:
+        for k, v in config_defaults.items():
+            if k not in config:
+                config[k] = v
+            elif isinstance(v, dict):
+                config[k] = self._set_config_defaults(config[k] if config[k] is not None else {}, v)
+        return config
+    def _validate_config(self) -> None:
+        self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
+        self.ffn_config = self._set_config_defaults(self.ffn_config, ffn_config_defaults)
+        self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
+        if self.d_model % self.n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads')
+        if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
+            raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
+        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
+            raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
+        if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
+        if self.attn_config['attn_impl'] == 'flash' and is_flash_v1_installed():
+            warnings.warn(VersionedDeprecationWarning('Support for Flash Attention v1 is deprecated. Please upgrade to Flash Attention v2.4.2. To install Flash Attention v2.4.2, please run `pip install -e ".[gpu-flash2]"` from the root directory of the llm-foundry repository.', remove_version='0.6.0'))
+        if self.attn_config['attn_impl'] == 'triton' and (not self.attn_config['prefix_lm']):
+            warnings.warn(UserWarning('If not using a Prefix Language Model, we recommend setting "attn_impl" to "flash" instead of "triton".'))
+        # if self.attn_config['alibi'] and (not check_alibi_support(self.attn_config['attn_impl'])):
+        #     raise NotImplementedError('alibi only implemented with torch, triton, and flash (v2.4.2 or higher) attention.')
+        if self.attn_config['attn_uses_sequence_id'] and (not (self.attn_config['attn_impl'] in ['torch', 'triton'] or (self.attn_config['attn_impl'] == 'flash' and is_flash_v2_installed(v2_version='v2.1.2')))):
+            raise NotImplementedError('attn_uses_sequence_id only implemented with torch, triton, and flash (v2.1.2 or higher) attention.')
+        if self.attn_config['rope'] and self.attn_config['rope_impl'] not in ['dail', 'hf']:
+            raise ValueError('If rope is being used then rope_impl should be either "dail", or "hf".')
+        if self.attn_config['rope'] and self.attn_config['rope_impl'] == 'hf' and (self.attn_config['rope_hf_config']['type'] not in ['no_scaling', 'linear', 'dynamic']):
+            raise ValueError('If using hf implementation of rope, the type should be one of "no_scaling", "linear" or "dynamic".')
+        if self.attn_config['rope'] and self.attn_config['rope_impl'] == 'dail':
+            if self.attn_config['rope_dail_config']['type'] not in ['original', 'xpos']:
+                raise ValueError('If using the dail implementation of rope, the type should be one of "original" or "xpos".')
+            if not is_flash_v2_installed(v2_version='2.0.1'):
+                raise ImportError('If using the dail implementation of rope, the flash_attn library v2.0.1 or higher must be installed. Please check the instructions at https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#what-kinds-of-positional-embeddings-does-llm-foundry-support')
+        if self.attn_config['sliding_window_size'] != -1 and (not (self.attn_config['attn_impl'] == 'flash' and is_flash_v2_installed(v2_version='v2.3.0'))):
+            raise NotImplementedError('sliding window only implemented with flash attention v2.3.0 or higher.')
+        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
+            raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
+        if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
+            raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
+        if self.init_config.get('name', None) is None:
+            raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
+        if not (self.learned_pos_emb or self.attn_config['alibi'] or self.attn_config['rope']):
+            warnings.warn(f'Positional information not being provided to the model using either learned_pos_emb or alibi or rope.')
+        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            try:
+                import transformer_engine.pytorch as te
+                del te
+            except:
+                raise ImportError('TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. ' + 'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' + 'pip install flash-attn==1.0.6 --no-build-isolation \n' + 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156')
+        if self.ffn_config['ffn_type'] == 'mptgeglu':
+            raise ValueError('API CHANGE: `ffn_type=="mptgeglu"` changed to `ffn_type=="mptglu"`. ' + 'See [#829](https://github.com/mosaicml/llm-foundry/pull/829) for details.')
+        elif self.ffn_config['ffn_type'] in ['mptmlp', 'mptglu']:
+            self.ffn_config['fc_type'] = self.fc_type
+        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            self.ffn_config['bias'] = not self.no_bias
+            if 'ffn_act_fn' in self.ffn_config.keys():
+                raise ValueError(f'Transformer Engine block does not support custom activation functions.')
+        if not self.use_pad_tok_in_ffn:
+            try:
+                from flash_attn.bert_padding import unpad_input, pad_input
+            except:
+                raise ImportError('In order to set `use_pad_tok_in_ffn=False`, please install flash-attn==1.0.9 or flash-attn==2.3.6')

fc.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from torch import nn
+FC_CLASS_REGISTRY = {'torch': nn.Linear}
+try:
+    import transformer_engine.pytorch as te
+    FC_CLASS_REGISTRY['te'] = te.Linear
+except:
+    pass

ffn.py ADDED Viewed

	@@ -0,0 +1,96 @@

+"""MPT Blocks used for the MPT Model."""
+import logging
+from copy import deepcopy
+from functools import partial
+from typing import Any, Callable, Optional, Union
+import torch
+import torch.nn as nn
+from .fc import FC_CLASS_REGISTRY
+try:
+    import transformer_engine.pytorch as te
+except:
+    te = None
+log = logging.getLogger(__name__)
+_FFN_ACT_FN_DEFAULT = {'name': 'gelu', 'approximate': 'none'}
+def resolve_ffn_act_fn(config: Optional[dict]=None) -> Callable[[torch.Tensor], torch.Tensor]:
+    """Resolve the activation function for the feed-forward network.
+    Args:
+        config (Optional[dict]): The configuration dictionary for the activation function.
+            The dict config must specify the 'name' of a torch.nn.functional activation
+            function. All of other key values pairs are bound to the function as a partial.
+    Returns:
+        Callable[[torch.Tensor], torch.Tensor]: The activation function.
+    """
+    if config is None:
+        config = _FFN_ACT_FN_DEFAULT
+    config = deepcopy(config)
+    name = config.pop('name')
+    if not hasattr(torch.nn.functional, name):
+        raise ValueError(f'Unrecognised activation function name ({name}).')
+    act = getattr(torch.nn.functional, name)
+    return partial(act, **config)
+# Default activation callable, resolved once at import time; used as the `act_fn`
+# default in the MPTMLP / MPTGLU constructors below.
+_DEFAULT_ACT_FN = resolve_ffn_act_fn(_FFN_ACT_FN_DEFAULT)
+def resolve_ffn_hidden_size(d_model: int, expansion_ratio: Union[int, float], ffn_hidden_size: Optional[int]=None) -> int:
+    """Resolve the hidden size of the feed-forward network.
+    Args:
+        d_model (int): The dimension of the input and output of the feed-forward network.
+        expansion_ratio (Union[int, float]): The expansion ratio of the feed-forward network.
+        ffn_hidden_size (Optional[int]): The hidden size of the feed-forward network.
+    Returns:
+        int: The hidden size of the feed-forward network.
+    """
+    if ffn_hidden_size is not None:
+        log.info(f'`expansion_ratio` (={expansion_ratio}) ignored when `ffn_hidden_size` (={ffn_hidden_size}) is specified.')
+    else:
+        ffn_hidden_size = int(d_model * expansion_ratio)
+        if ffn_hidden_size != d_model * expansion_ratio:
+            raise ValueError(f'`d_model * expansion_ratio` must be an integer (d_model={d_model!r}; expansion_ratio={expansion_ratio!r}; d_model * expansion_ratio={d_model * expansion_ratio!r}).')
+    return ffn_hidden_size
+class MPTMLP(nn.Module):
+    def __init__(self, d_model: int, expansion_ratio: Union[int, float], fc_type: str='torch', ffn_hidden_size: Optional[int]=None, act_fn: Callable[[torch.Tensor], torch.Tensor]=_DEFAULT_ACT_FN, device: Optional[str]=None, bias: bool=True):
+        super().__init__()
+        ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size)
+        self.fc_kwargs: dict[str, Any] = {'bias': bias}
+        self.fc_kwargs['device'] = device
+        self.up_proj = FC_CLASS_REGISTRY[fc_type](d_model, ffn_hidden_size, **self.fc_kwargs)
+        self.act = act_fn
+        self.down_proj = FC_CLASS_REGISTRY[fc_type](ffn_hidden_size, d_model, **self.fc_kwargs)
+        self.down_proj._is_residual = True
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(self.act(self.up_proj(x)))
+class MPTGLU(MPTMLP):
+    def __init__(self, d_model: int, expansion_ratio: Union[int, float], fc_type: str='torch', ffn_hidden_size: Optional[int]=None, act_fn: Callable[[torch.Tensor], torch.Tensor]=_DEFAULT_ACT_FN, device: Optional[str]=None, bias: bool=True):
+        super().__init__(d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, ffn_hidden_size=ffn_hidden_size, act_fn=act_fn, device=device, bias=bias)
+        self.gate_proj = FC_CLASS_REGISTRY[fc_type](d_model, self.up_proj.out_features, **self.fc_kwargs)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
+# Maps the `ffn_type` config string to the module class that implements it.
+FFN_CLASS_REGISTRY = {'mptmlp': MPTMLP, 'mptglu': MPTGLU}
+if te is not None:
+    # Reached only when the optional TransformerEngine import succeeded.
+    # `_has_norm` is set on the TE class itself (affects every instance);
+    # presumably read by the transformer block to skip its own norm — TODO confirm.
+    te.LayerNormMLP._has_norm = True
+    FFN_CLASS_REGISTRY['te_ln_mlp'] = te.LayerNormMLP
+def build_ffn(d_model: int, expansion_ratio: Union[int, float], fc_type: str='torch', ffn_hidden_size: Optional[int]=None, ffn_act_fn: Optional[dict]=None, device: Optional[str]=None, bias: bool=True, **kwargs: Any) -> nn.Module:
+    ffn_type = kwargs.pop('ffn_type')
+    if ffn_type in ['mptmlp', 'mptglu']:
+        if len(kwargs) > 0:
+            raise ValueError(f'MPTMLP (or MPTGLU) got an unexpected keyword argument: {kwargs}')
+        return FFN_CLASS_REGISTRY[ffn_type](d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, act_fn=resolve_ffn_act_fn(ffn_act_fn), ffn_hidden_size=ffn_hidden_size, device=device, bias=bias)
+    elif ffn_type == 'te_ln_mlp':
+        assert te is not None
+        ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size)
+        if ffn_act_fn is not None:
+            raise ValueError(f'Transformer Engine block does not support custom activation functions.')
+        return te.LayerNormMLP(hidden_size=d_model, ffn_hidden_size=ffn_hidden_size, bias=bias, **kwargs)
+    raise ValueError(f'ffn_type={ffn_type!r} not recognized.')

flash_attn_triton.py ADDED Viewed

	@@ -0,0 +1,484 @@

+"""
+Copied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn/flash_attn_triton.py
+update imports to use 'triton_pre_mlir'
+*Experimental* implementation of FlashAttention in Triton.
+Tested with triton==2.0.0.dev20221202.
+Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions
+other than 64:
+https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
+We'll update this implementation with the new Triton backend once this is fixed.
+We use the FlashAttention implementation from Phil Tillet as a starting point.
+https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
+Changes:
+- Implement both causal and non-causal attention.
+- Implement both self-attention and cross-attention.
+- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
+- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
+- Support attention bias.
+- Speed up the forward pass a bit, and only store the LSE instead of m and l.
+- Make the backward for d=128 much faster by reducing register spilling.
+- Optionally parallelize the backward pass across seqlen_k, to deal with the case of
+small batch size * nheads.
+Caution:
+- This is an *experimental* implementation. The forward pass should be quite robust but
+I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
+- This implementation has only been tested on A100.
+- If you plan to use headdim other than 64 and 128, you should test for race conditions
+(due to the Triton compiler), as done in tests/test_flash_attn.py
+"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
+for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
+that there are none left for other head dimensions.
+Differences between this Triton version and the CUDA version:
+- Triton version doesn't support dropout.
+- Triton forward is generally faster than CUDA forward, while Triton backward is
+generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
+than CUDA forward + backward.
+- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
+- Triton version supports attention bias, while CUDA version doesn't.
+"""
+import math
+import torch
+import triton_pre_mlir as triton
+import triton_pre_mlir.language as tl
+# Forward attention kernel. Each program handles one block of BLOCK_M query rows
+# (program_id(0)) for one (batch, head) pair (program_id(1)), iterating over key/value
+# blocks of BLOCK_N rows while keeping a running log-sum-exp so the softmax is never
+# materialised over the full key length. Writes the attention output to Out and the
+# per-row log-sum-exp to Lse.
+# The EVEN_* flags are derived by the heuristics below and select unmasked fast paths.
+# The head-dim offset is added to pointers without a stride, i.e. the last dimension
+# of Q/K/V/Out is addressed as if its stride were 1.
+@triton.heuristics({'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0, 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0, 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM']})
+@triton.jit
+def _fwd_kernel(Q, K, V, Bias, Out, Lse, TMP, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_ob, stride_oh, stride_om, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
+    start_m = tl.program_id(0)
+    # program_id(1) is a flattened (batch, head) index.
+    off_hb = tl.program_id(1)
+    off_b = off_hb // nheads
+    off_h = off_hb % nheads
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_HEADDIM)
+    # Pointer grids for this program's query rows and for the first key/value block.
+    q_ptrs = Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :])
+    k_ptrs = K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :])
+    v_ptrs = V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :])
+    # 'vector' bias is indexed by key position only; 'matrix' bias by (query, key).
+    if BIAS_TYPE == 'vector':
+        b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
+    elif BIAS_TYPE == 'matrix':
+        b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + (offs_m[:, None] * stride_bm + offs_n[None, :])
+    t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
+    # Running state per query row: log-sum-exp, running max, and the output accumulator.
+    lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
+    acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
+    # Load the query block once; rows/columns outside the real tensor read as 0.
+    # The unmasked path is taken only when both EVEN_M and EVEN_N hold.
+    if EVEN_M & EVEN_N:
+        if EVEN_HEADDIM:
+            q = tl.load(q_ptrs)
+        else:
+            q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
+    elif EVEN_HEADDIM:
+        q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
+    else:
+        q = tl.load(q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
+    # With causal masking, keys past the last query row of this block are never visited.
+    end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
+    for start_n in range(0, end_n, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        if EVEN_N & EVEN_M:
+            if EVEN_HEADDIM:
+                k = tl.load(k_ptrs + start_n * stride_kn)
+            else:
+                k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0)
+        elif EVEN_HEADDIM:
+            k = tl.load(k_ptrs + start_n * stride_kn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
+        else:
+            k = tl.load(k_ptrs + start_n * stride_kn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
+        # Unscaled scores q @ k^T for this (query block, key block) tile.
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k, trans_b=True)
+        # Out-of-range keys get -inf so they contribute nothing to the softmax.
+        if not EVEN_N:
+            qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float('-inf'))
+        # Causal mask: a query row may only see keys at or before its own index.
+        if IS_CAUSAL:
+            qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float('-inf'))
+        if BIAS_TYPE != 'none':
+            if BIAS_TYPE == 'vector':
+                if EVEN_N:
+                    bias = tl.load(b_ptrs + start_n).to(tl.float32)
+                else:
+                    bias = tl.load(b_ptrs + start_n, mask=start_n + offs_n < seqlen_k, other=0.0).to(tl.float32)
+                bias = bias[None, :]
+            elif BIAS_TYPE == 'matrix':
+                if EVEN_M & EVEN_N:
+                    bias = tl.load(b_ptrs + start_n).to(tl.float32)
+                else:
+                    bias = tl.load(b_ptrs + start_n, mask=(offs_m[:, None] < seqlen_q) & ((start_n + offs_n)[None, :] < seqlen_k), other=0.0).to(tl.float32)
+            # Bias is added after scaling, so qk is already scaled on this path.
+            qk = qk * softmax_scale + bias
+            m_ij = tl.maximum(tl.max(qk, 1), lse_i)
+            p = tl.exp(qk - m_ij[:, None])
+        else:
+            # No bias: the scale is folded into the max and the exponent instead.
+            m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
+            p = tl.exp(qk * softmax_scale - m_ij[:, None])
+        l_ij = tl.sum(p, 1)
+        # Rescale the accumulator from the previous reference value m_i to the new m_ij.
+        acc_o_scale = tl.exp(m_i - m_ij)
+        # The scale is written to TMP and read straight back; presumably a compiler
+        # workaround inherited from the upstream implementation — TODO confirm.
+        tl.store(t_ptrs, acc_o_scale)
+        acc_o_scale = tl.load(t_ptrs)
+        acc_o = acc_o * acc_o_scale[:, None]
+        if EVEN_N & EVEN_M:
+            if EVEN_HEADDIM:
+                v = tl.load(v_ptrs + start_n * stride_vn)
+            else:
+                v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0)
+        elif EVEN_HEADDIM:
+            v = tl.load(v_ptrs + start_n * stride_vn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
+        else:
+            v = tl.load(v_ptrs + start_n * stride_vn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
+        # Probabilities are cast to V's dtype before the p @ v product.
+        p = p.to(v.dtype)
+        acc_o += tl.dot(p, v)
+        # Fold this block's row sums into the running log-sum-exp.
+        m_i = m_ij
+        l_i_new = tl.exp(lse_i - m_ij) + l_ij
+        lse_i = m_ij + tl.log(l_i_new)
+    # Final normalisation: acc_o is relative to m_i, so scale by exp(m_i - lse_i).
+    o_scale = tl.exp(m_i - lse_i)
+    tl.store(t_ptrs, o_scale)
+    o_scale = tl.load(t_ptrs)
+    acc_o = acc_o * o_scale[:, None]
+    # Offsets are recomputed here rather than reused from the top of the kernel.
+    start_m = tl.program_id(0)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    # Lse is indexed with the rounded query length and stored without a mask.
+    lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
+    tl.store(lse_ptrs, lse_i)
+    offs_d = tl.arange(0, BLOCK_HEADDIM)
+    out_ptrs = Out + off_b * stride_ob + off_h * stride_oh + (offs_m[:, None] * stride_om + offs_d[None, :])
+    # Store the output, masking rows/columns that fall outside the real tensor.
+    if EVEN_M:
+        if EVEN_HEADDIM:
+            tl.store(out_ptrs, acc_o)
+        else:
+            tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
+    elif EVEN_HEADDIM:
+        tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
+    else:
+        tl.store(out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
+# Backward pre-pass: for each query row, computes sum over the head dimension of
+# Out * DO (in float32) and stores it in Delta. One program handles BLOCK_M query
+# rows (program_id(0)) of one flattened (batch, head) pair (program_id(1)).
+@triton.jit
+def _bwd_preprocess_do_o_dot(Out, DO, Delta, stride_ob, stride_oh, stride_om, stride_dob, stride_doh, stride_dom, nheads, seqlen_q, seqlen_q_rounded, headdim, BLOCK_M: tl.constexpr, BLOCK_HEADDIM: tl.constexpr):
+    start_m = tl.program_id(0)
+    off_hb = tl.program_id(1)
+    off_b = off_hb // nheads
+    off_h = off_hb % nheads
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_d = tl.arange(0, BLOCK_HEADDIM)
+    # Rows past seqlen_q and columns past headdim are read as 0, so they add nothing.
+    o = tl.load(Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :], mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32)
+    do = tl.load(DO + off_b * stride_dob + off_h * stride_doh + offs_m[:, None] * stride_dom + offs_d[None, :], mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32)
+    delta = tl.sum(o * do, axis=1)
+    # Unmasked store: Delta is indexed with the rounded query length.
+    tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
+# Writes the accumulated dk / dv tiles to memory, choosing the store mask from the
+# EVEN_* flags: no mask when everything is aligned, otherwise mask key rows past
+# seqlen_k and/or head-dim columns past headdim.
+@triton.jit
+def _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr):
+    # The row mask is dropped only when both EVEN_N and EVEN_M hold.
+    if EVEN_N & EVEN_M:
+        if EVEN_HEADDIM:
+            tl.store(dv_ptrs, dv)
+            tl.store(dk_ptrs, dk)
+        else:
+            tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
+            tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
+    elif EVEN_HEADDIM:
+        tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
+        tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
+    else:
+        tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
+        tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
+# Backward work for ONE block of BLOCK_N key/value rows (index start_n). Loops over
+# the query blocks that can attend to it, recomputes the attention probabilities from
+# the stored log-sum-exp, accumulates dk and dv for this key block, and adds this
+# block's contribution to dq (plain load/add/store, or atomic add when ATOMIC_ADD).
+# Q/K/V/DO/DQ/DK/DV/LSE/D are used as given; no batch/head offset is applied here.
+# NOTE: `not EVEN_M & EVEN_HEADDIM` parses as `not (EVEN_M & EVEN_HEADDIM)`.
+@triton.jit
+def _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD: tl.constexpr, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
+    # With causal masking, start at the query block containing this key block's first
+    # row (rounded down to a multiple of BLOCK_M); earlier query rows cannot see it.
+    begin_m = 0 if not IS_CAUSAL else start_n * BLOCK_N // BLOCK_M * BLOCK_M
+    offs_qm = begin_m + tl.arange(0, BLOCK_M)
+    offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_d = tl.arange(0, BLOCK_HEADDIM)
+    q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
+    k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
+    v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
+    do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
+    dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
+    if BIAS_TYPE == 'vector':
+        b_ptrs = Bias + offs_n
+    elif BIAS_TYPE == 'matrix':
+        b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
+    dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
+    dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
+    # No query row attends to this key block: write the zero dk/dv tiles and stop.
+    if begin_m >= seqlen_q:
+        dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
+        dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
+        _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
+        return
+    # K and V for this key block are loaded once and reused for every query block.
+    if EVEN_N & EVEN_M:
+        if EVEN_HEADDIM:
+            k = tl.load(k_ptrs)
+            v = tl.load(v_ptrs)
+        else:
+            k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
+            v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
+    elif EVEN_HEADDIM:
+        k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
+        v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
+    else:
+        k = tl.load(k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
+        v = tl.load(v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
+    num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
+    for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
+        start_m = tl.multiple_of(start_m, BLOCK_M)
+        offs_m_curr = start_m + offs_m
+        if EVEN_M & EVEN_HEADDIM:
+            q = tl.load(q_ptrs)
+        elif EVEN_HEADDIM:
+            q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
+        else:
+            q = tl.load(q_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
+        # Recompute the unscaled scores for this tile and re-apply the forward masks.
+        qk = tl.dot(q, k, trans_b=True)
+        if not EVEN_N:
+            qk = tl.where(offs_n[None, :] < seqlen_k, qk, float('-inf'))
+        if IS_CAUSAL:
+            qk = tl.where(offs_m_curr[:, None] >= offs_n[None, :], qk, float('-inf'))
+        if BIAS_TYPE != 'none':
+            # The debug_barrier calls in this loop are presumably guards against the
+            # compiler race conditions mentioned in the module docstring — TODO confirm.
+            tl.debug_barrier()
+            if BIAS_TYPE == 'vector':
+                if EVEN_N:
+                    bias = tl.load(b_ptrs).to(tl.float32)
+                else:
+                    bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32)
+                bias = bias[None, :]
+            elif BIAS_TYPE == 'matrix':
+                if EVEN_M & EVEN_N:
+                    bias = tl.load(b_ptrs).to(tl.float32)
+                else:
+                    bias = tl.load(b_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k), other=0.0).to(tl.float32)
+            qk = qk * softmax_scale + bias
+        if not EVEN_M & EVEN_HEADDIM:
+            tl.debug_barrier()
+        # LSE is loaded without a mask.
+        lse_i = tl.load(LSE + offs_m_curr)
+        # Probabilities p = exp(scaled score - lse); qk is already scaled when a bias was added.
+        if BIAS_TYPE == 'none':
+            p = tl.exp(qk * softmax_scale - lse_i[:, None])
+        else:
+            p = tl.exp(qk - lse_i[:, None])
+        if EVEN_M & EVEN_HEADDIM:
+            do = tl.load(do_ptrs)
+        else:
+            do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
+        # dv += p^T @ do
+        dv += tl.dot(p.to(do.dtype), do, trans_a=True)
+        if not EVEN_M & EVEN_HEADDIM:
+            tl.debug_barrier()
+        # dp = do @ v^T
+        dp = tl.dot(do, v, trans_b=True)
+        if not EVEN_HEADDIM:
+            tl.debug_barrier()
+        # D is loaded without a mask, like LSE above.
+        Di = tl.load(D + offs_m_curr)
+        # ds = p * (dp - Di) * softmax_scale, cast to q's dtype for the matmuls below.
+        ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
+        # dk += ds^T @ q
+        dk += tl.dot(ds, q, trans_a=True)
+        if not EVEN_M & EVEN_HEADDIM:
+            tl.debug_barrier()
+        # dq receives ds @ k from every key block, so it is accumulated in memory.
+        if not ATOMIC_ADD:
+            # Non-atomic path: read the current dq tile, add, and write it back.
+            if EVEN_M & EVEN_HEADDIM:
+                dq = tl.load(dq_ptrs, eviction_policy='evict_last')
+                dq += tl.dot(ds, k)
+                tl.store(dq_ptrs, dq, eviction_policy='evict_last')
+            elif EVEN_HEADDIM:
+                dq = tl.load(dq_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0, eviction_policy='evict_last')
+                dq += tl.dot(ds, k)
+                tl.store(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q, eviction_policy='evict_last')
+            else:
+                dq = tl.load(dq_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0, eviction_policy='evict_last')
+                dq += tl.dot(ds, k)
+                tl.store(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), eviction_policy='evict_last')
+        else:
+            # Atomic path: used when several programs may update the same dq rows.
+            dq = tl.dot(ds, k)
+            if EVEN_M & EVEN_HEADDIM:
+                tl.atomic_add(dq_ptrs, dq)
+            elif EVEN_HEADDIM:
+                tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
+            else:
+                tl.atomic_add(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
+        # Advance every query-indexed pointer to the next block of BLOCK_M rows.
+        dq_ptrs += BLOCK_M * stride_dqm
+        q_ptrs += BLOCK_M * stride_qm
+        do_ptrs += BLOCK_M * stride_dom
+        if BIAS_TYPE == 'matrix':
+            b_ptrs += BLOCK_M * stride_bm
+    # All query blocks processed: write the finished dk / dv tiles for this key block.
+    dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
+    dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
+    _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
+def init_to_zero(name):
+    return lambda nargs: nargs[name].zero_()
+# Backward kernel entry point. The two autotune configs differ only in
+# SEQUENCE_PARALLEL; both zero the 'DQ' argument through a pre_hook before the
+# kernel runs, since dq is accumulated in memory by the column-block routine.
+# program_id(1) selects the flattened (batch, head) pair.
+@triton.autotune(configs=[triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ'))], key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM'])
+@triton.heuristics({'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0, 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0, 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM']})
+@triton.jit
+def _bwd_kernel(Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_dob, stride_doh, stride_dom, stride_dqb, stride_dqh, stride_dqm, stride_dkb, stride_dkh, stride_dkn, stride_dvb, stride_dvh, stride_dvn, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, SEQUENCE_PARALLEL: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
+    off_hb = tl.program_id(1)
+    off_b = off_hb // nheads
+    off_h = off_hb % nheads
+    # Shift every base pointer to this program's (batch, head) slice.
+    Q += off_b * stride_qb + off_h * stride_qh
+    K += off_b * stride_kb + off_h * stride_kh
+    V += off_b * stride_vb + off_h * stride_vh
+    DO += off_b * stride_dob + off_h * stride_doh
+    DQ += off_b * stride_dqb + off_h * stride_dqh
+    DK += off_b * stride_dkb + off_h * stride_dkh
+    DV += off_b * stride_dvb + off_h * stride_dvh
+    if BIAS_TYPE != 'none':
+        Bias += off_b * stride_bb + off_h * stride_bh
+    # D and LSE are laid out per (batch, head) with the rounded query length.
+    D += off_hb * seqlen_q_rounded
+    LSE += off_hb * seqlen_q_rounded
+    if not SEQUENCE_PARALLEL:
+        # One program walks all key blocks in turn and updates dq with plain stores.
+        num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
+        for start_n in range(0, num_block_n):
+            _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD=False, BIAS_TYPE=BIAS_TYPE, IS_CAUSAL=IS_CAUSAL, BLOCK_HEADDIM=BLOCK_HEADDIM, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N)
+    else:
+        # One program per key block (program_id(0)); dq is updated with atomic adds.
+        start_n = tl.program_id(0)
+        _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD=True, BIAS_TYPE=BIAS_TYPE, IS_CAUSAL=IS_CAUSAL, BLOCK_HEADDIM=BLOCK_HEADDIM, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N)
+def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
+    """Validate the inputs, allocate outputs and launch the forward attention kernel.
+    Args:
+        q: Query tensor of shape (batch, seqlen_q, nheads, d).
+        k: Key tensor of shape (batch, seqlen_k, nheads, d).
+        v: Value tensor of shape (batch, seqlen_k, nheads, d).
+        bias: Optional 4-D tensor whose last two dimensions are (1, seqlen_k) or
+            (seqlen_q, seqlen_k); it is expanded to (batch, nheads, seqlen_q, seqlen_k).
+        causal: Passed to the kernel as IS_CAUSAL.
+        softmax_scale: Score scale; falls back to 1 / sqrt(d) when None (or any other
+            falsy value, because of the `or`).
+    Returns:
+        Tuple (o, lse, softmax_scale): the output (same shape/dtype as q), the float32
+        log-sum-exp buffer of shape (batch, nheads, seqlen_q_rounded), and the scale used.
+    Raises:
+        AssertionError: Shape, dtype (fp16/bf16 only), head-dim (<= 128) or device checks fail.
+        RuntimeError: The bias has an unsupported shape.
+    """
+    batch, seqlen_q, nheads, d = q.shape
+    _, seqlen_k, _, _ = k.shape
+    assert k.shape == (batch, seqlen_k, nheads, d)
+    assert v.shape == (batch, seqlen_k, nheads, d)
+    assert d <= 128, 'FlashAttention only support head dimensions up to 128'
+    assert q.dtype == k.dtype == v.dtype, 'All tensors must have the same type'
+    assert q.dtype in [torch.float16, torch.bfloat16], 'Only support fp16 and bf16'
+    assert q.is_cuda and k.is_cuda and v.is_cuda
+    softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
+    has_bias = bias is not None
+    bias_type = 'none'
+    if has_bias:
+        assert bias.dtype in [q.dtype, torch.float]
+        assert bias.is_cuda
+        assert bias.dim() == 4
+        # The kernel adds key offsets to the bias pointer without a stride.
+        if bias.stride(-1) != 1:
+            bias = bias.contiguous()
+        if bias.shape[2:] == (1, seqlen_k):
+            bias_type = 'vector'
+        elif bias.shape[2:] == (seqlen_q, seqlen_k):
+            bias_type = 'matrix'
+        else:
+            raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
+        bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
+    bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
+    # Per-row buffers are padded to a multiple of 128 rows.
+    seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
+    lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
+    # Scratch buffer handed to the kernel as TMP.
+    tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
+    o = torch.empty_like(q)
+    # Head dim is padded up to a power of two, with a minimum of 16.
+    BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
+    BLOCK = 128
+    num_warps = 4 if d <= 64 else 8
+    # One program per (query block, batch * head).
+    grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
+    # Strides are passed in (batch, head, seq) order, i.e. dims 0, 2, 1 of the tensors.
+    # The argument order must match the _fwd_kernel signature exactly.
+    _fwd_kernel[grid](q, k, v, bias, o, lse, tmp, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, o.stride(0), o.stride(2), o.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, seqlen_q // 32, seqlen_k // 32, bias_type, causal, BLOCK_HEADDIM, BLOCK_M=BLOCK, BLOCK_N=BLOCK, num_warps=num_warps, num_stages=1)
+    return (o, lse, softmax_scale)
+def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None):
+    if do.stride(-1) != 1:
+        do = do.contiguous()
+    batch, seqlen_q, nheads, d = q.shape
+    _, seqlen_k, _, _ = k.shape
+    assert d <= 128
+    seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
+    assert lse.shape == (batch, nheads, seqlen_q_rounded)
+    assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
+    assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
+    softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
+    dq_accum = torch.empty_like(q, dtype=torch.float32)
+    delta = torch.empty_like(lse)
+    BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
+    grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
+    _bwd_preprocess_do_o_dot[grid](o, do, delta, o.stride(0), o.stride(2), o.stride(1), do.stride(0), do.stride(2), do.stride(1), nheads, seqlen_q, seqlen_q_rounded, d, BLOCK_M=128, BLOCK_HEADDIM=BLOCK_HEADDIM)
+    has_bias = bias is not None
+    bias_type = 'none'
+    if has_bias:
+        assert bias.dtype in [q.dtype, torch.float]
+        assert bias.is_cuda
+        assert bias.dim() == 4
+        assert bias.stride(-1) == 1
+        if bias.shape[2:] == (1, seqlen_k):
+            bias_type = 'vector'
+        elif bias.shape[2:] == (seqlen_q, seqlen_k):
+            bias_type = 'matrix'
+        else:
+            raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
+        bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
+    bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
+    grid = lambda META: (triton.cdiv(seqlen_k, META['BLOCK_N']) if META['SEQUENCE_PARALLEL'] else 1, batch * nheads)
+    _bwd_kernel[grid](q, k, v, bias, do, dq_accum, dk, dv, lse, delta, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, do.stride(0), do.stride(2), do.stride(1), dq_accum.stride(0), dq_accum.stride(2), dq_accum.stride(1), dk.stride(0), dk.stride(2), dk.stride(1), dv.stride(0), dv.stride(2), dv.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, seqlen_q // 32, seqlen_k // 32, bias_type, causal, BLOCK_HEADDIM)
+    dq.copy_(dq_accum)
+class FlashAttnQKVPackedFunc(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
+        """
+            qkv: (batch, seqlen, 3, nheads, headdim)
+            bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen).
+                For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
+                ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
+        """
+        if qkv.stride(-1) != 1:
+            qkv = qkv.contiguous()
+        o, lse, ctx.softmax_scale = _flash_attn_forward(qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], bias=bias, causal=causal, softmax_scale=softmax_scale)
+        ctx.save_for_backward(qkv, o, lse, bias)
+        ctx.causal = causal
+        return o
+    @staticmethod
+    def backward(ctx, do):
+        qkv, o, lse, bias = ctx.saved_tensors
+        assert not ctx.needs_input_grad[1], 'FlashAttention does not support bias gradient yet'
+        with torch.inference_mode():
+            dqkv = torch.empty_like(qkv)
+            _flash_attn_backward(do, qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], o, lse, dqkv[:, :, 0], dqkv[:, :, 1], dqkv[:, :, 2], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
+        return (dqkv, None, None, None)
+flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply
+class FlashAttnKVPackedFunc(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
+        """
+            q: (batch, seqlen_q, nheads, headdim)
+            kv: (batch, seqlen_k, 2, nheads, headdim)
+            bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
+                For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
+                ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
+        """
+        q, kv = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
+        o, lse, ctx.softmax_scale = _flash_attn_forward(q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale)
+        ctx.save_for_backward(q, kv, o, lse, bias)
+        ctx.causal = causal
+        return o
+    @staticmethod
+    def backward(ctx, do):
+        q, kv, o, lse, bias = ctx.saved_tensors
+        if len(ctx.needs_input_grad) >= 3:
+            assert not ctx.needs_input_grad[2], 'FlashAttention does not support bias gradient yet'
+        with torch.inference_mode():
+            dq = torch.empty_like(q)
+            dkv = torch.empty_like(kv)
+            _flash_attn_backward(do, q, kv[:, :, 0], kv[:, :, 1], o, lse, dq, dkv[:, :, 0], dkv[:, :, 1], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
+        return (dq, dkv, None, None, None)
+flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply
+class FlashAttnFunc(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
+        """
+            q: (batch_size, seqlen_q, nheads, headdim)
+            k, v: (batch_size, seqlen_k, nheads, headdim)
+            bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
+                For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
+                ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
+        """
+        q, k, v = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
+        o, lse, ctx.softmax_scale = _flash_attn_forward(q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale)
+        ctx.save_for_backward(q, k, v, o, lse, bias)
+        ctx.causal = causal
+        return o
+    @staticmethod
+    def backward(ctx, do):
+        q, k, v, o, lse, bias = ctx.saved_tensors
+        assert not ctx.needs_input_grad[3], 'FlashAttention does not support bias gradient yet'
+        with torch.inference_mode():
+            dq = torch.empty_like(q)
+            dk = torch.empty_like(k)
+            dv = torch.empty_like(v)
+            _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
+        return (dq, dk, dv, None, None, None)
+flash_attn_func = FlashAttnFunc.apply

generation_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_from_model_config": true,
+  "transformers_version": "4.37.0",
+  "use_cache": false
+}

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13e5889e2d0bd9fc4f86c918c07156f40200c866418aa548784312e707c3baa2
+size 135

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfcb6d87c86d5513b17fd861c590fd941e8ce829fe4fc012e3d2e43a4b2e3497
+size 135

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20d397660ace2312837d829ed480fe4ebe68fb97c50da9f5a27093185e30e2af
+size 135

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b83d079b601999c848b3f3bd369097a2e20d87662b01f3cc853b213961b2a98
+size 134

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,653 @@

+{
+  "metadata": {
+    "total_size": 15601489024
+  },
+  "weight_map": {
+    "transformer.blocks.0.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.0.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.0.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.0.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.0.norm_1.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.0.norm_2.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.1.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.1.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.1.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.1.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.1.norm_1.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.1.norm_2.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.10.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.10.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.10.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.10.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.10.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.10.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.11.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.11.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.11.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.11.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.11.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.11.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.12.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.12.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.12.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.12.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.12.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.12.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.13.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.13.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.13.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.13.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.13.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.13.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.14.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.14.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.14.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.14.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.14.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.14.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.15.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.15.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.15.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.15.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.15.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.15.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.16.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.16.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.16.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.16.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.16.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.16.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.17.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.17.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.17.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.17.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.17.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.17.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.18.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.18.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.18.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.18.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.18.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.18.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.19.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.19.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.19.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.19.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.19.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.19.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.2.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.2.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.2.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.2.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.2.norm_1.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.2.norm_2.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.20.attn.Wqkv.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.20.attn.out_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.20.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.20.ffn.up_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.20.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.20.norm_2.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.21.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.21.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.21.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.21.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.21.norm_1.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.21.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.22.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.22.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.22.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.22.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.22.norm_1.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.22.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.23.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.23.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.23.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.23.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.23.norm_1.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.23.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.24.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.24.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.24.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.24.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.24.norm_1.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.24.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.25.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.25.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.25.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.25.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.25.norm_1.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.25.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.26.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.26.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.26.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.26.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.26.norm_1.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.26.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.27.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.27.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.27.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.27.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.27.norm_1.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.27.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.28.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.28.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.28.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.28.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.28.norm_1.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.28.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.29.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.29.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.29.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.29.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.29.norm_1.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.29.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.3.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.3.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.3.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.3.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.3.norm_1.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.3.norm_2.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.30.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.30.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.30.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.30.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.30.norm_1.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.30.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.31.attn.Wqkv.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.31.attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.31.ffn.down_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.31.ffn.up_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.31.norm_1.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.31.norm_2.weight": "model-00003-of-00004.safetensors",
+    "transformer.blocks.4.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.4.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.4.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.4.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.4.norm_1.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.4.norm_2.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.5.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.5.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.5.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.5.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.5.norm_1.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.5.norm_2.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.6.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.6.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.6.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.6.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.6.norm_1.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.6.norm_2.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.7.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.7.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.7.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.7.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.7.norm_1.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.7.norm_2.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.8.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.8.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.8.ffn.down_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.8.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.8.norm_1.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.8.norm_2.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.9.attn.Wqkv.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.9.attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.9.ffn.down_proj.weight": "model-00002-of-00004.safetensors",
+    "transformer.blocks.9.ffn.up_proj.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.9.norm_1.weight": "model-00001-of-00004.safetensors",
+    "transformer.blocks.9.norm_2.weight": "model-00001-of-00004.safetensors",
+    "transformer.mm_projector.0.bias": "model-00004-of-00004.safetensors",
+    "transformer.mm_projector.0.weight": "model-00004-of-00004.safetensors",
+    "transformer.mm_projector.2.bias": "model-00004-of-00004.safetensors",
+    "transformer.mm_projector.2.weight": "model-00004-of-00004.safetensors",
+    "transformer.norm_f.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.attention.in_proj_bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.attention.in_proj_weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.attention.out_proj.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.attention.out_proj.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.layernorm.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.layernorm.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.head.probe": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00004-of-00004.safetensors",
+    "transformer.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00004-of-00004.safetensors",
+    "transformer.wte.weight": "model-00001-of-00004.safetensors"
+  }
+}

modeling_mpt.py ADDED Viewed

	@@ -0,0 +1,540 @@

+"""A simple, flexible implementation of a GPT model.
+Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
+"""
+from __future__ import annotations
+import math
+import warnings
+from typing import Any, Dict, List, Mapping, MutableMapping, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .attention import is_flash_v1_installed, is_flash_v2_installed
+from .norm import NORM_CLASS_REGISTRY
+if is_flash_v2_installed():
+    try:
+        from flash_attn import bert_padding
+        from flash_attn.layers.rotary import RotaryEmbedding as DAILRotaryEmbedding
+    except Exception as e:
+        raise e
+if is_flash_v1_installed():
+    try:
+        from flash_attn import bert_padding
+    except Exception as e:
+        raise e
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.models.llama.modeling_llama import LlamaDynamicNTKScalingRotaryEmbedding as HFDynamicNTKScalingRotaryEmbedding
+from transformers.models.llama.modeling_llama import LlamaLinearScalingRotaryEmbedding as HFLinearScalingRotaryEmbedding
+from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding as HFRotaryEmbedding
+from .attention import attn_bias_shape, build_attn_bias, gen_slopes
+from .blocks import MPTBlock
+from .custom_embedding import SharedEmbedding
+from .ffn import build_ffn as build_ffn
+from .configuration_mpt import MPTConfig
+from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
+from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
+from .meta_init_context import init_empty_weights
+from .param_init_fns import generic_param_init_fn_, MODEL_INIT_REGISTRY
+from .act_ckpt import pass_on_block_idx, build_act_ckpt_mod_to_blocks, check_mapping_blocks_overlap
+try:
+    from .flash_attn_triton import flash_attn_func as flash_attn_func
+except:
+    pass
+import logging
+log = logging.getLogger(__name__)
def gen_rotary_embedding(rope_head_dim: int, rope_impl: str, rope_theta: int, rope_dail_config: dict, rope_hf_config: dict, max_seq_len: int):
    """Build the rotary position embedding module selected by ``rope_impl``.

    Args:
        rope_head_dim (int): Dimension handed to the rotary embedding.
        rope_impl (str): ``'dail'`` selects ``DAILRotaryEmbedding``; ``'hf'`` selects one of the
            ``HF*RotaryEmbedding`` classes according to ``rope_hf_config['type']``.
        rope_theta (int): Forwarded to the embedding as ``base``.
        rope_dail_config (dict): Only read for ``'dail'``. Keys used: ``'type'``, ``'pos_idx_in_fp32'`` and,
            when ``'type' == 'xpos'``, ``'xpos_scale_base'``.
        rope_hf_config (dict): Only read for ``'hf'``. Keys used: ``'type'`` (``'no_scaling'``, ``'linear'`` or
            ``'dynamic'``) and, for the two scaling types, ``'factor'``.
        max_seq_len (int): Forwarded to the HF embeddings as ``max_position_embeddings``.
    Returns:
        The constructed rotary embedding module (always created with ``device='cpu'``).
    Raises:
        ValueError: If ``rope_impl`` is neither ``'dail'`` nor ``'hf'``, or if ``rope_impl`` is ``'hf'`` and
            ``rope_hf_config['type']`` is not one of the supported types.
    """
    if rope_impl == 'dail':
        # xpos scaling is only enabled when explicitly requested by the config.
        use_xpos = rope_dail_config['type'] == 'xpos'
        scale_base = rope_dail_config['xpos_scale_base'] if use_xpos else None
        return DAILRotaryEmbedding(dim=rope_head_dim, base=rope_theta, interleaved=False, scale_base=scale_base, pos_idx_in_fp32=rope_dail_config['pos_idx_in_fp32'], device='cpu')
    if rope_impl == 'hf':
        hf_type = rope_hf_config['type']
        if hf_type == 'no_scaling':
            return HFRotaryEmbedding(rope_head_dim, max_position_embeddings=max_seq_len, base=rope_theta, device='cpu')
        if hf_type == 'linear':
            return HFLinearScalingRotaryEmbedding(rope_head_dim, max_position_embeddings=max_seq_len, base=rope_theta, scaling_factor=rope_hf_config['factor'], device='cpu')
        if hf_type == 'dynamic':
            return HFDynamicNTKScalingRotaryEmbedding(rope_head_dim, max_position_embeddings=max_seq_len, base=rope_theta, scaling_factor=rope_hf_config['factor'], device='cpu')
        # Previously this case fell through to the rope_impl error below, which
        # pointed at the wrong argument. Same exception type, accurate message.
        raise ValueError(f"rope_hf_config['type'] needs to be one of no_scaling, linear or dynamic, got {hf_type!r}")
    raise ValueError('rope_impl needs to be either dail or hf')
def gen_attention_mask_in_length(sequence_id: Union[None, torch.Tensor], S: int, attn_uses_sequence_id: bool, attn_impl: str, attention_mask: Union[torch.Tensor, None]):
    """Generates the attention mask used for sequence masking in FA v2.
    Only supports sequence id based sparse attention for no attention masking or attention masking with right padding.
    In case of left padding:
        1. Training with left padding is not supported in MPT (see https://github.com/mosaicml/llm-foundry/blob/1eecd4cb8e734499f77f6a35f657b8b20c0adfcb/llmfoundry/models/mpt/modeling_mpt.py#L407).
        2. For generation with left padding, we only have a single sequence id per sample, so we don't need sequence id based sparse attention.
    Args:
        sequence_id (Union[None, torch.Tensor]): Tensor containing the sequence id for each token. Shape (batch_size, seq_len).
        S (int): Sequence length
        attn_uses_sequence_id (bool): Whether the attention uses sequence id based masking.
        attn_impl (str): Attention implementation. This function only creates attention_mask_in_length for flash attention.
        attention_mask (Union[torch.Tensor, None]): Attention mask tensor of shape (batch_size, seq_len). Non-boolean masks (e.g. 0/1 integers) are converted to bool before use.
    Returns:
        attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none. ``None`` is returned when ``sequence_id`` is None, ``attn_uses_sequence_id`` is False, or ``attn_impl`` is not ``'flash'``. For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
            ```
            [
            [2, 3, 0, 0, 0, 0],
            [3, 2, 0, 0, 0, 0],
            [6, 0, 0, 0, 0, 0]
            ]
            ```
        , which refers to the 3D-attention mask:
            ```
            [
            [
                [1, 0, 0, 0, 0, 0],
                [1, 1, 0, 0, 0, 0],
                [0, 0, 1, 0, 0, 0],
                [0, 0, 1, 1, 0, 0],
                [0, 0, 1, 1, 1, 0],
                [0, 0, 0, 0, 0, 1]
            ],
            [
                [1, 0, 0, 0, 0, 0],
                [1, 1, 0, 0, 0, 0],
                [1, 1, 1, 0, 0, 0],
                [0, 0, 0, 1, 0, 0],
                [0, 0, 0, 1, 1, 0],
                [0, 0, 0, 0, 0, 1]
            ],
            [
                [1, 0, 0, 0, 0, 0],
                [1, 1, 0, 0, 0, 0],
                [1, 1, 1, 0, 0, 0],
                [1, 1, 1, 1, 0, 0],
                [1, 1, 1, 1, 1, 0],
                [1, 1, 1, 1, 1, 1]
            ]
            ]
            ```.
            (The description above is taken verbatim from https://github.com/Dao-AILab/flash-attention/blob/9356a1c0389660d7e231ff3163c1ac17d9e3824a/flash_attn/bert_padding.py#L125 .)
    Raises:
        NotImplementedError: If ``attention_mask`` indicates left padding (some row's first position is masked out).
        ValueError: If ``S`` differs from the last dimension of ``sequence_id``.
    """
    attention_mask_in_length = None
    if sequence_id is not None and attn_uses_sequence_id and (attn_impl == 'flash'):
        if attention_mask is not None:
            # `~mask` is a bitwise NOT on integer tensors and masked_fill only
            # accepts boolean masks, so normalise the mask once up front.
            attention_mask = attention_mask.bool()
            # Every row must start with a real token, otherwise the batch is left padded.
            if attention_mask[:, 0].sum() != attention_mask.shape[0]:
                raise NotImplementedError('Left padding is not supported with flash attention when attn_uses_sequence_id is set to True.')
        if S != sequence_id.shape[-1]:
            raise ValueError(f'Sequence length ({S}) does not match length of sequences in sequence_id ({sequence_id.shape[-1]}).')
        if attention_mask is not None:
            # Park padded positions on id 0 so they cannot create extra one-hot classes.
            sequence_id = sequence_id.masked_fill(~attention_mask, 0)
        attention_mask_in_length = torch.nn.functional.one_hot(sequence_id)
        if attention_mask is not None:
            # Padded tokens must not be counted towards the length of sequence 0.
            attention_mask_in_length = attention_mask_in_length.masked_fill(~attention_mask.unsqueeze(-1), 0)
        # Summing over the token axis turns the one-hot rows into per-sequence token counts.
        attention_mask_in_length = attention_mask_in_length.sum(dim=1)
        # Right-pad the count vector with zeros up to the sequence length S.
        attention_mask_in_length = torch.nn.functional.pad(attention_mask_in_length, (0, S - attention_mask_in_length.shape[-1]), mode='constant', value=0)
    return attention_mask_in_length
def gen_flash_attn_padding_info(bsz: int, S: int, past_key_len: int, device: torch.device, attention_mask_in_length: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None):
    """Compute the unpadding indices and cumulative sequence lengths consumed by flash attention.

    Args:
        bsz (int): Batch size.
        S (int): Number of query positions in the current step.
        past_key_len (int): Number of cached key positions preceding the current ``S`` positions.
        device (torch.device): Device on which the placeholder tensors are created.
        attention_mask_in_length (Optional[torch.Tensor]): When given, it is used as both the query and the key
            mask and the concatenated-sequence unpadding helper is used.
        attention_mask (Optional[torch.Tensor]): Key padding mask used when ``attention_mask_in_length`` is None.
            If it is also None, an all-ones boolean mask of shape ``(bsz, past_key_len + S)`` is used.
    Returns:
        dict with keys ``indices_q``, ``indices_k``, ``indices_v``, ``cu_seqlens_q``, ``cu_seqlens_k``,
        ``max_seqlen_q`` and ``max_seqlen_k``. ``indices_v`` is the same tensor object as ``indices_k``.
    """
    if attention_mask_in_length is None:
        key_padding_mask = attention_mask
        if key_padding_mask is None:
            key_padding_mask = torch.ones((bsz, past_key_len + S), dtype=torch.bool, device=device)
        # Queries only cover the newest S positions of the key sequence.
        query_padding_mask = key_padding_mask[:, -S:]
        unpadding_function = bert_padding.unpad_input
    else:
        key_padding_mask = attention_mask_in_length
        query_padding_mask = attention_mask_in_length
        unpadding_function = bert_padding.unpad_input_for_concatenated_sequences
    # Only the index bookkeeping is needed, so dummy tensors of the right shape are unpadded.
    # `*_` tolerates unpadding helpers that return extra trailing values
    # (newer flash-attn releases are believed to append one to `unpad_input` - TODO confirm).
    _, indices_q, cu_seqlens_q, max_seqlen_q, *_ = unpadding_function(torch.empty(bsz, S, 1, device=device), query_padding_mask)
    _, indices_k, cu_seqlens_k, max_seqlen_k, *_ = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
    # Values share the key layout and mask, so their indices are identical to the
    # key indices; reuse them instead of running the same unpadding a third time.
    indices_v = indices_k
    return {
        'indices_q': indices_q,
        'indices_k': indices_k,
        'indices_v': indices_v,
        'cu_seqlens_q': cu_seqlens_q,
        'cu_seqlens_k': cu_seqlens_k,
        'max_seqlen_q': max_seqlen_q,
        'max_seqlen_k': max_seqlen_k,
    }
def apply_sequence_id(attn_bias: torch.Tensor, sequence_id: torch.LongTensor, max_seq_len: int) -> torch.Tensor:
    """Block attention between tokens that carry different sequence ids.

    Args:
        attn_bias: Bias tensor whose last two dimensions are cropped to the length of ``sequence_id``.
        sequence_id: Sequence id of every token; its last dimension is the sequence length.
        max_seq_len: Upper bound accepted for the sequence length.
    Returns:
        The cropped bias in which every (query, key) pair with differing sequence ids is set to the most
        negative finite value of the bias dtype.
    Raises:
        ValueError: If the sequence length of ``sequence_id`` exceeds ``max_seq_len``.
    """
    n_tokens = sequence_id.shape[-1]
    if n_tokens > max_seq_len:
        raise ValueError(f'sequence_id sequence length cannot exceed max_seq_len={max_seq_len}')
    cropped_bias = attn_bias[..., :n_tokens, :n_tokens]
    # Compare every query id against every key id via broadcasting.
    query_ids = sequence_id.view(-1, n_tokens, 1)
    key_ids = sequence_id.view(-1, 1, n_tokens)
    # The extra dimension inserted at position 1 broadcasts the mask against the bias.
    crosses_sequences = torch.ne(query_ids, key_ids).unsqueeze(1)
    most_negative = torch.finfo(cropped_bias.dtype).min
    return cropped_bias.masked_fill(crosses_sequences, most_negative)
class MPTPreTrainedModel(PreTrainedModel):
    """Common base class of the MPT models in this file; it only sets class-level attributes on top of ``PreTrainedModel``."""
    # Configuration class associated with these models.
    config_class = MPTConfig
    # NOTE(review): presumably the attribute name/prefix the Hugging Face loader uses for the base model - confirm against PreTrainedModel.
    base_model_prefix = 'model'
    # NOTE(review): presumably tells device-map placement to keep each MPTBlock on a single device - confirm.
    _no_split_modules = ['MPTBlock']
def _fsdp_wrap_fn(self: Union[MPTModel, MPTForCausalLM], module: nn.Module) -> bool:
    """Report whether ``module`` is an ``MPTBlock``.

    ``self`` is accepted but not used. Judging by the name this serves as an FSDP
    wrapping predicate - confirm against the caller.
    """
    is_transformer_block = isinstance(module, MPTBlock)
    return is_transformer_block
+class MPTModel(MPTPreTrainedModel):
+    def __init__(self, config: MPTConfig):
+        config._validate_config()
+        super().__init__(config)
+        self.attn_impl = config.attn_config['attn_impl']
+        self.prefix_lm = config.attn_config['prefix_lm']
+        self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
+        self.alibi = config.attn_config['alibi']
+        self.alibi_bias_max = config.attn_config['alibi_bias_max']
+        self.learned_pos_emb = config.learned_pos_emb
+        if config.init_device == 'mixed':
+            if dist.get_local_rank() == 0:
+                config.init_device = 'cpu'
+            else:
+                config.init_device = 'meta'
+        if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
+            norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
+            raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
+        norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
+        self.embedding_fraction = config.embedding_fraction
+        self.wte = SharedEmbedding(config.vocab_size, config.d_model, device=config.init_device)
+        if self.learned_pos_emb:
+            self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
+        self.emb_drop = nn.Dropout(config.emb_pdrop)
+        self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
+        for i, block in enumerate(self.blocks):
+            block.block_idx = i
+            block.max_block_idx = config.n_layers - 1
+            pass_on_block_idx(block)
+        self.norm_f = norm_class(config.d_model, device=config.init_device)
+        self.rope = config.attn_config['rope']
+        self.rope_impl = None
+        if self.rope:
+            self.rope_impl = config.attn_config['rope_impl']
+            self.rotary_embedding = gen_rotary_embedding(rope_head_dim=config.d_model // config.n_heads, rope_impl=self.rope_impl, rope_theta=config.attn_config['rope_theta'], rope_dail_config=config.attn_config['rope_dail_config'], rope_hf_config=config.attn_config['rope_hf_config'], max_seq_len=self.config.max_seq_len)
+        if config.init_device != 'meta':
+            log.info(f'We recommend using config.init_device="meta" with Composer + FSDP for faster initialization.')
+            self.apply(self.param_init_fn)
+        self.is_causal = not self.prefix_lm
+        self._attn_bias_initialized = False
+        self.attn_bias = None
+        self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, prefix_lm=self.prefix_lm, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id)
+        if config.no_bias:
+            for module in self.modules():
+                if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter):
+                    log.info(f'Removing bias from module={module!r}.')
+                    module.register_parameter('bias', None)
+                if hasattr(module, 'use_bias'):
+                    log.info(f'Setting use_bias=False for module={module!r}.')
+                    module.use_bias = False
+        log.debug(self)
+        log.debug(f"Using {self.config.init_config['name']} initialization.")
+    def get_input_embeddings(self) -> Union[SharedEmbedding, nn.Embedding]:
+        """Return the token embedding module stored in ``self.wte``."""
+        return self.wte
+    def set_input_embeddings(self, value: Union[SharedEmbedding, nn.Embedding]) -> None:
+        """Replace the token embedding module stored in ``self.wte`` with ``value``."""
+        self.wte = value
+    @torch.no_grad()
+    def _attn_bias(self, device: torch.device, dtype: torch.dtype, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None) -> Tuple[Optional[torch.Tensor], Optional[torch.ByteTensor]]:
+        """Return the additive attention bias for the current batch together with ``attention_mask``.
+        The base bias is built once, on the first call, and cached on ``self.attn_bias``. For ``attn_impl == 'flash'`` the cached bias is returned as-is; otherwise the prefix, sequence-id and padding adjustments below are applied on every call.
+        Args:
+            device: Device on which the bias is created and to which the cached bias is moved.
+            dtype: Dtype of the bias.
+            attention_mask: Optional mask; positions where it is False are filled with the minimum value of the bias dtype. It must support ``~`` (``forward`` converts it to bool before calling this method).
+            prefix_mask: Optional prefix mask; must be a tensor when ``self.prefix_lm`` is set.
+            sequence_id: Optional sequence ids, used only when ``self.attn_uses_sequence_id`` is set.
+        Returns:
+            Tuple ``(attn_bias, attention_mask)``. ``attn_bias`` can be ``None`` when no bias was built and no ``attention_mask`` was given.
+        Raises:
+            ValueError: If ``prefix_mask`` is given and its shape differs from the shape of ``attention_mask``.
+        """
+        # Lazy one-time construction: the device/dtype are only known at call time.
+        if not self._attn_bias_initialized:
+            if self.attn_bias_shape:
+                self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
+                self.attn_bias = build_attn_bias(self.attn_impl, self.attn_bias, self.config.n_heads, self.config.max_seq_len, causal=self.is_causal, alibi=self.alibi, alibi_bias_max=self.alibi_bias_max)
+            self._attn_bias_initialized = True
+        # The flash path skips all of the per-batch adjustments below.
+        if self.attn_impl == 'flash':
+            return (self.attn_bias, attention_mask)
+        # Keep the cached tensor on the requested device/dtype; the converted tensor replaces the cache.
+        if self.attn_bias is not None:
+            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
+        attn_bias = self.attn_bias
+        if self.prefix_lm:
+            assert isinstance(attn_bias, torch.Tensor)
+            assert isinstance(prefix_mask, torch.Tensor)
+            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
+        if self.attn_uses_sequence_id and sequence_id is not None:
+            assert isinstance(attn_bias, torch.Tensor)
+            attn_bias = apply_sequence_id(attn_bias, sequence_id, self.config.max_seq_len)
+        if attention_mask is not None:
+            s_k = attention_mask.shape[-1]
+            if attn_bias is None:
+                # No structural bias exists: start from zeros that broadcast against the mask.
+                attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
+            else:
+                # Keep only the last ``s_k`` entries along the final dimension.
+                _s_k = max(0, attn_bias.size(-1) - s_k)
+                attn_bias = attn_bias[:, :, :, _s_k:]
+            if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
+                raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
+            # Masked-out positions get the most negative representable value of the bias dtype.
+            min_val = torch.finfo(attn_bias.dtype).min
+            attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
+        return (attn_bias, attention_mask)
+    def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor) -> torch.Tensor:
+        s_k, s_q = attn_bias.shape[-2:]
+        if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
+            raise ValueError('attn_bias does not match the expected shape. ' + f'The last two dimensions should both be {self.config.max_length} ' + f'but are {s_k} and {s_q}.')
+        seq_len = prefix_mask.shape[-1]
+        if seq_len > self.config.max_seq_len:
+            raise ValueError(f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
+        attn_bias = attn_bias[..., :seq_len, :seq_len]
+        causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
+        prefix = prefix_mask.view(-1, 1, 1, seq_len)
+        cannot_attend = ~torch.logical_or(causal, prefix.bool())
+        min_val = torch.finfo(attn_bias.dtype).min
+        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
+        return attn_bias
+    def forward(self, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.Tensor]=None) -> BaseModelOutputWithPast:
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if attention_mask is not None:
+            attention_mask = attention_mask.bool()
+        if prefix_mask is not None:
+            prefix_mask = prefix_mask.bool()
+        if not return_dict:
+            raise NotImplementedError('return_dict False is not implemented yet for MPT')
+        if output_attentions:
+            if self.attn_impl != 'torch':
+                raise NotImplementedError('output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`.')
+        if self.training and attention_mask is not None and (attention_mask[:, 0].sum() != attention_mask.shape[0]):
+            raise NotImplementedError('MPT does not support training with left padding.')
+        if self.prefix_lm and prefix_mask is None:
+            raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
+        if self.training:
+            if self.attn_uses_sequence_id and sequence_id is None:
+                raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
+            elif self.attn_uses_sequence_id is False and sequence_id is not None:
+                warnings.warn('MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' + 'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.')
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError('You cannot specify both input_ids and inputs_embeds.')
+        elif input_ids is not None:
+            bsz = input_ids.size(0)
+            S = input_ids.size(1)
+            x = self.wte(input_ids)
+            input_device = input_ids.device
+        elif inputs_embeds is not None:
+            bsz = inputs_embeds.size(0)
+            S = inputs_embeds.size(1)
+            x = inputs_embeds
+            input_device = inputs_embeds.device
+        else:
+            raise ValueError('You must specify input_ids or inputs_embeds')
+        #assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
+        rotary_emb_w_meta_info = None
+        past_position = 0
+        if past_key_values is not None:
+            if len(past_key_values) != self.config.n_layers:
+                raise ValueError(f'past_key_values must provide a past_key_value for each attention ' + f'layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).')
+            past_position = past_key_values[0][0].size(1)
+            if self.attn_impl == 'torch':
+                past_position = past_key_values[0][0].size(3)
+        if self.learned_pos_emb or self.rope:
+            if self.learned_pos_emb and S + past_position > self.config.max_seq_len:
+                raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length ' + f'{S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.')
+            if self.learned_pos_emb or (self.rope and self.rope_impl == 'hf'):
+                pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_device).unsqueeze(0)
+                if attention_mask is not None:
+                    pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
+                if self.learned_pos_emb:
+                    x = x + self.wpe(pos)
+                elif self.rope and self.rope_impl == 'hf':
+                    rotary_emb_w_meta_info = {'impl': self.rope_impl, 'rotary_emb': self.rotary_embedding, 'offset_info': pos, 'seq_len': S + past_position}
+            elif self.rope and self.rope_impl == 'dail':
+                rotary_emb_w_meta_info = {'impl': self.rope_impl, 'rotary_emb': self.rotary_embedding, 'offset_info': past_position, 'seq_len': S + past_position}
+        if self.embedding_fraction == 1:
+            x = self.emb_drop(x)
+        else:
+            x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
+            assert isinstance(self.emb_drop, nn.Module)
+            x = self.emb_drop(x_shrunk)
+        attn_bias, attention_mask = self._attn_bias(device=x.device, dtype=torch.float32, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
+        attention_mask_in_length = gen_attention_mask_in_length(sequence_id=sequence_id, S=S, attn_uses_sequence_id=self.attn_uses_sequence_id, attn_impl=self.attn_impl, attention_mask=attention_mask)
+        alibi_slopes = None
+        if self.alibi and self.attn_impl == 'flash':
+            alibi_slopes = gen_slopes(n_heads=self.config.n_heads, alibi_bias_max=self.alibi_bias_max, device=x.device, return_1d=True)
+        presents = () if use_cache else None
+        if use_cache and past_key_values is None:
+            past_key_values = [() for _ in range(self.config.n_layers)]
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        flash_attn_padding_info = {}
+        if self.attn_impl == 'flash':
+            flash_attn_padding_info = gen_flash_attn_padding_info(bsz, S, past_position, x.device, attention_mask_in_length, attention_mask)
+        for b_idx, block in enumerate(self.blocks):
+            if output_hidden_states:
+                assert all_hidden_states is not None
+                all_hidden_states = all_hidden_states + (x,)
+            past_key_value = past_key_values[b_idx] if past_key_values is not None else None
+            x, attn_weights, present = block(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
+            if presents is not None:
+                presents += (present,)
+            if output_attentions:
+                assert all_self_attns is not None
+                all_self_attns = all_self_attns + (attn_weights,)
+        x = self.norm_f(x)
+        if output_hidden_states:
+            assert all_hidden_states is not None
+            all_hidden_states = all_hidden_states + (x,)
+        return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attns)
+    def param_init_fn(self, module: nn.Module) -> None:
+        init_fn_name = self.config.init_config['name']
+        MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
+    def fsdp_wrap_fn(self, module: nn.Module) -> bool:
+        """Return the result of ``_fsdp_wrap_fn(self, module)`` for the given ``module``."""
+        return _fsdp_wrap_fn(self, module)
+    def activation_checkpointing_fn(self, module: nn.Module) -> bool:
+        """Return True only when ``module`` is an ``MPTBlock`` instance."""
+        return isinstance(module, MPTBlock)
+class MPTForCausalLM(MPTPreTrainedModel):
+    def __init__(self, config: MPTConfig):
+        super().__init__(config)
+        log.info(f'Instantiating an MPTForCausalLM model from {__file__}')
+        self.transformer: MPTModel = MPTModel(config)
+        self.lm_head = None
+        if not config.tie_word_embeddings:
+            self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False, device=config.init_device)
+            self.lm_head._fsdp_wrap = True
+        for child in self.transformer.children():
+            if isinstance(child, torch.nn.ModuleList):
+                continue
+            if isinstance(child, torch.nn.Module):
+                child._fsdp_wrap = True
+        self.logit_scale = None
+        if config.logit_scale is not None:
+            logit_scale = config.logit_scale
+            if isinstance(logit_scale, str):
+                if logit_scale == 'inv_sqrt_d_model':
+                    logit_scale = 1 / math.sqrt(config.d_model)
+                else:
+                    raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
+            self.logit_scale = logit_scale
+    def get_input_embeddings(self) -> Union[SharedEmbedding, nn.Embedding]:
+        return self.transformer.get_input_embeddings()
+    def set_input_embeddings(self, value: Union[SharedEmbedding, nn.Embedding]) -> None:
+        self.transformer.set_input_embeddings(value)
+    def get_output_embeddings(self) -> Union[SharedEmbedding, nn.Embedding, nn.Linear]:
+        if self.lm_head is not None:
+            return self.lm_head
+        return self.transformer.get_input_embeddings()
+    def set_output_embeddings(self, new_embeddings: Union[SharedEmbedding, nn.Embedding, nn.Linear]) -> None:
+        if self.lm_head is not None:
+            self.lm_head = new_embeddings
+        else:
+            if not isinstance(new_embeddings, (SharedEmbedding, nn.Embedding)):
+                raise ValueError('new_embeddings must be an instance of SharedEmbedding ' + f'or nn.Embedding, but got {type(new_embeddings)}.')
+            warnings.warn('Using `set_output_embeddings` to set the embedding layer of ' + 'MPTForCausalLM with tied weights. Given weights are tied, ' + 'using `set_input_embeddings` is recommended over using ' + '`set_output_embeddings`.')
+            self.transformer.set_input_embeddings(new_embeddings)
+    def tie_weights(self) -> None:
+        self.lm_head = None
+    def set_decoder(self, decoder: MPTModel) -> None:
+        self.transformer = decoder
+    def get_decoder(self) -> MPTModel:
+        return self.transformer
+    def forward(self, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.FloatTensor]=None) -> CausalLMOutputWithPast:
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, inputs_embeds=inputs_embeds)
+        if self.lm_head is not None:
+            logits = self.lm_head(outputs.last_hidden_state)
+        else:
+            out = outputs.last_hidden_state
+            out = out.to(self.transformer.wte.weight.device)
+            logits = self.transformer.wte(out, True)
+        if self.logit_scale is not None:
+            if self.logit_scale == 0:
+                warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
+            logits *= self.logit_scale
+        loss = None
+        if labels is not None:
+            _labels = torch.roll(labels, shifts=-1)
+            _labels[:, -1] = -100
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.to(logits.device).view(-1))
+        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+    def param_init_fn(self, module: nn.Module) -> None:
+        init_fn_name = self.config.init_config['name']
+        MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
+    def fsdp_wrap_fn(self, module: nn.Module) -> bool:
+        return _fsdp_wrap_fn(self, module)
+    def activation_checkpointing_fn(self, module: nn.Module) -> bool:
+        """The MPT activation checkpointing (act ckpt) function.
+        When `activation_checkpointing` in fsdp_config is set to true, this function will be called on all the modules in the FSDP wrapped model and determine whether a given module should be activation checkpointed. It checks the checkpointing target (`activation_checkpointing_target` in `model`) which can be specified as below:
+            1. null (or no such field): The whole MPTBlock will be activation checkpointed on all layers
+            2. a list of modules to act ckpt on all layers, e.g.,
+                activation_checkpointing_target:
+                    - grouped_query_attention
+                    - mptmlp
+            3. a dictionary of module name with target_blocks, e.g.,
+                activation_checkpointing_target:
+                    {
+                            "mptblock": target_blocks_1,
+                            "grouped_query_attention": target_blocks_2
+                    }
+                target_blocks (target_blocks_1, target_blocks_2 above) can be:
+                - a single integer n: the first n transformer block will be activation checkpointed
+                - a string of first-n, middle-m, last-k, range-i-j: the first n, the middle m,  the last k, or the range [i, j) layers will be activation checkpointed. E.g, 'first-2, last-2' means the first 2 and last 2 transformer blocks will be activation checkpointed
+                    middle-m is range [start, end) where ``start = max(max_block_idx // 2 - m // 2, 0), end = min(start + m, max_block_idx + 1)``
+                - a list of integers corresponds to the list of transformer block ids, e.g., [2] means the second transformer block will be activation checkpointed. [2, 3] means the second and third transformer blocks will be activation checkpointed
+                - a list of mixed integers and strings of first-n, middle-m, last-k, range-i-j
+            An example in yaml config file:
+                fsdp_config:
+                    activation_checkpointing: true
+                model:
+                    activation_checkpointing_target:
+                        {
+                            "mptblock": 'first-5',
+                            "grouped_query_attention": 'last-35'
+                        }
+        """
+        if not hasattr(module, 'block_idx'):
+            log.debug(f'{module.__class__.__name__} cannot be activation checkpointed. Only transformer block or its submodules are eligible for activation checkpointing.')
+            return False
+        act_ckpt_target = getattr(self.config, 'activation_checkpointing_target', None)
+        act_ckpt_mod_to_blocks = build_act_ckpt_mod_to_blocks(act_ckpt_target, MPTBlock, module.max_block_idx)
+        check_mapping_blocks_overlap(act_ckpt_mod_to_blocks, module.max_block_idx)
+        for k in act_ckpt_mod_to_blocks.keys():
+            if isinstance(module, k):
+                blocks = act_ckpt_mod_to_blocks[k]
+                return True if blocks == -1 else module.block_idx in blocks
+        return False
+    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]]=None, inputs_embeds: Optional[torch.Tensor]=None, **kwargs: Any) -> Dict[str, Any]:
+        attention_mask = kwargs['attention_mask'].bool()
+        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
+            raise NotImplementedError('MPT does not support generation with right padding.')
+        if self.transformer.attn_uses_sequence_id and self.training:
+            sequence_id = torch.zeros_like(input_ids[:1])
+        else:
+            sequence_id = None
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+        if self.transformer.prefix_lm:
+            prefix_mask = torch.ones_like(attention_mask)
+            if kwargs.get('use_cache') == False:
+                raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
+        else:
+            prefix_mask = None
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {'inputs_embeds': inputs_embeds}
+        else:
+            model_inputs = {'input_ids': input_ids}
+        model_inputs.update({'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True)})
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values: List[Tuple[torch.Tensor, torch.Tensor]], beam_idx: torch.LongTensor) -> List[Tuple[torch.Tensor, ...]]:
+        """Used by HuggingFace generate when using beam search with kv-caching.
+        See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
+        for an example in transformers.
+        """
+        reordered_past = []
+        for layer_past in past_key_values:
+            reordered_past += [tuple((past_state.index_select(0, beam_idx) for past_state in layer_past))]
+        return reordered_past

norm.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from typing import Dict, List, Optional, Type, Union
+import torch
+def _cast_if_autocast_enabled(tensor: torch.Tensor) -> torch.Tensor:
+    if torch.is_autocast_enabled():
+        if tensor.device.type == 'cuda':
+            dtype = torch.get_autocast_gpu_dtype()
+        elif tensor.device.type == 'cpu':
+            dtype = torch.get_autocast_cpu_dtype()
+        else:
+            raise NotImplementedError()
+        return tensor.to(dtype=dtype)
+    return tensor
+class LPLayerNorm(torch.nn.LayerNorm):
+    def __init__(self, normalized_shape: Union[int, List[int], torch.Size], eps: float=1e-05, elementwise_affine: bool=True, device: Optional[torch.device]=None, dtype: Optional[torch.dtype]=None):
+        super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        module_device = x.device
+        downcast_x = _cast_if_autocast_enabled(x)
+        downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
+        downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
+        with torch.autocast(enabled=False, device_type=module_device.type):
+            return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)
+def rms_norm(x: torch.Tensor, weight: Optional[torch.Tensor]=None, eps: float=1e-05) -> torch.Tensor:
+    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+    if weight is not None:
+        return output * weight
+    return output
+class RMSNorm(torch.nn.Module):
+    def __init__(self, normalized_shape: Union[int, List[int], torch.Size], eps: float=1e-05, weight: bool=True, dtype: Optional[torch.dtype]=None, device: Optional[torch.device]=None):
+        super().__init__()
+        self.eps = eps
+        if weight:
+            self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device))
+        else:
+            self.register_parameter('weight', None)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype)
+class LPRMSNorm(RMSNorm):
+    def __init__(self, normalized_shape: Union[int, List[int], torch.Size], eps: float=1e-05, weight: bool=True, dtype: Optional[torch.dtype]=None, device: Optional[torch.device]=None):
+        super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        downcast_x = _cast_if_autocast_enabled(x)
+        downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
+        with torch.autocast(enabled=False, device_type=x.device.type):
+            return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
+NORM_CLASS_REGISTRY: Dict[str, Type[torch.nn.Module]] = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "additional_special_tokens": [
+    "<|SYSTEM|>",
+    "<|USER|>",
+    "<|RESPONSE|>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,1758 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70000": {
+      "content": "<unused0>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70001": {
+      "content": "<unused1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70002": {
+      "content": "<unused2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70003": {
+      "content": "<unused3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70004": {
+      "content": "<unused4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70005": {
+      "content": "<unused5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70006": {
+      "content": "<unused6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70007": {
+      "content": "<unused7>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70008": {
+      "content": "<unused8>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70009": {
+      "content": "<unused9>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70010": {
+      "content": "<unused10>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70011": {
+      "content": "<unused11>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70012": {
+      "content": "<unused12>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70013": {
+      "content": "<unused13>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70014": {
+      "content": "<unused14>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70015": {
+      "content": "<unused15>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70016": {
+      "content": "<unused16>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70017": {
+      "content": "<unused17>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70018": {
+      "content": "<unused18>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70019": {
+      "content": "<unused19>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70020": {
+      "content": "<unused20>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70021": {
+      "content": "<unused21>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70022": {
+      "content": "<unused22>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70023": {
+      "content": "<unused23>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70024": {
+      "content": "<unused24>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70025": {
+      "content": "<unused25>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70026": {
+      "content": "<unused26>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70027": {
+      "content": "<unused27>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70028": {
+      "content": "<unused28>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70029": {
+      "content": "<unused29>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70030": {
+      "content": "<unused30>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70031": {
+      "content": "<unused31>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70032": {
+      "content": "<unused32>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70033": {
+      "content": "<unused33>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70034": {
+      "content": "<unused34>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70035": {
+      "content": "<unused35>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70036": {
+      "content": "<unused36>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70037": {
+      "content": "<unused37>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70038": {
+      "content": "<unused38>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70039": {
+      "content": "<unused39>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70040": {
+      "content": "<unused40>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70041": {
+      "content": "<unused41>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70042": {
+      "content": "<unused42>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70043": {
+      "content": "<unused43>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70044": {
+      "content": "<unused44>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70045": {
+      "content": "<unused45>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70046": {
+      "content": "<unused46>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70047": {
+      "content": "<unused47>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70048": {
+      "content": "<unused48>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70049": {
+      "content": "<unused49>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70050": {
+      "content": "<unused50>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70051": {
+      "content": "<unused51>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70052": {
+      "content": "<unused52>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70053": {
+      "content": "<unused53>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70054": {
+      "content": "<unused54>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70055": {
+      "content": "<unused55>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70056": {
+      "content": "<unused56>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70057": {
+      "content": "<unused57>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70058": {
+      "content": "<unused58>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70059": {
+      "content": "<unused59>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70060": {
+      "content": "<unused60>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70061": {
+      "content": "<unused61>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70062": {
+      "content": "<unused62>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70063": {
+      "content": "<unused63>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70064": {
+      "content": "<unused64>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70065": {
+      "content": "<unused65>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70066": {
+      "content": "<unused66>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70067": {
+      "content": "<unused67>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70068": {
+      "content": "<unused68>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70069": {
+      "content": "<unused69>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70070": {
+      "content": "<unused70>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70071": {
+      "content": "<unused71>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70072": {
+      "content": "<unused72>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70073": {
+      "content": "<unused73>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70074": {
+      "content": "<unused74>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70075": {
+      "content": "<unused75>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70076": {
+      "content": "<unused76>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70077": {
+      "content": "<unused77>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70078": {
+      "content": "<unused78>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70079": {
+      "content": "<unused79>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70080": {
+      "content": "<unused80>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70081": {
+      "content": "<unused81>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70082": {
+      "content": "<unused82>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70083": {
+      "content": "<unused83>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70084": {
+      "content": "<unused84>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70085": {
+      "content": "<unused85>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70086": {
+      "content": "<unused86>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70087": {
+      "content": "<unused87>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70088": {
+      "content": "<unused88>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70089": {
+      "content": "<unused89>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70090": {
+      "content": "<unused90>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70091": {
+      "content": "<unused91>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70092": {
+      "content": "<unused92>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70093": {
+      "content": "<unused93>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70094": {
+      "content": "<unused94>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70095": {
+      "content": "<unused95>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70096": {
+      "content": "<unused96>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70097": {
+      "content": "<unused97>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70098": {
+      "content": "<unused98>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70099": {
+      "content": "<unused99>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70100": {
+      "content": "<unused100>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70101": {
+      "content": "<unused101>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70102": {
+      "content": "<unused102>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70103": {
+      "content": "<unused103>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70104": {
+      "content": "<unused104>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70105": {
+      "content": "<unused105>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70106": {
+      "content": "<unused106>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70107": {
+      "content": "<unused107>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70108": {
+      "content": "<unused108>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70109": {
+      "content": "<unused109>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70110": {
+      "content": "<unused110>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70111": {
+      "content": "<unused111>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70112": {
+      "content": "<unused112>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70113": {
+      "content": "<unused113>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70114": {
+      "content": "<unused114>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70115": {
+      "content": "<unused115>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70116": {
+      "content": "<unused116>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70117": {
+      "content": "<unused117>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70118": {
+      "content": "<unused118>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70119": {
+      "content": "<unused119>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70120": {
+      "content": "<unused120>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70121": {
+      "content": "<unused121>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70122": {
+      "content": "<unused122>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70123": {
+      "content": "<unused123>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70124": {
+      "content": "<unused124>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70125": {
+      "content": "<unused125>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70126": {
+      "content": "<unused126>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70127": {
+      "content": "<unused127>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70128": {
+      "content": "<unused128>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70129": {
+      "content": "<unused129>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70130": {
+      "content": "<unused130>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70131": {
+      "content": "<unused131>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70132": {
+      "content": "<unused132>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70133": {
+      "content": "<unused133>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70134": {
+      "content": "<unused134>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70135": {
+      "content": "<unused135>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70136": {
+      "content": "<unused136>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70137": {
+      "content": "<unused137>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70138": {
+      "content": "<unused138>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70139": {
+      "content": "<unused139>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70140": {
+      "content": "<unused140>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70141": {
+      "content": "<unused141>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70142": {
+      "content": "<unused142>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70143": {
+      "content": "<unused143>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70144": {
+      "content": "<unused144>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70145": {
+      "content": "<unused145>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70146": {
+      "content": "<unused146>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70147": {
+      "content": "<unused147>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70148": {
+      "content": "<unused148>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70149": {
+      "content": "<unused149>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70150": {
+      "content": "<unused150>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70151": {
+      "content": "<unused151>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70152": {
+      "content": "<unused152>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70153": {
+      "content": "<unused153>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70154": {
+      "content": "<unused154>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70155": {
+      "content": "<unused155>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70156": {
+      "content": "<unused156>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70157": {
+      "content": "<unused157>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70158": {
+      "content": "<unused158>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70159": {
+      "content": "<unused159>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70160": {
+      "content": "<unused160>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70161": {
+      "content": "<unused161>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70162": {
+      "content": "<unused162>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70163": {
+      "content": "<unused163>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70164": {
+      "content": "<unused164>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70165": {
+      "content": "<unused165>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70166": {
+      "content": "<unused166>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70167": {
+      "content": "<unused167>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70168": {
+      "content": "<unused168>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70169": {
+      "content": "<unused169>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70170": {
+      "content": "<unused170>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70171": {
+      "content": "<unused171>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70172": {
+      "content": "<unused172>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70173": {
+      "content": "<unused173>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70174": {
+      "content": "<unused174>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70175": {
+      "content": "<unused175>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70176": {
+      "content": "<unused176>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70177": {
+      "content": "<unused177>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70178": {
+      "content": "<unused178>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70179": {
+      "content": "<unused179>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70180": {
+      "content": "<unused180>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70181": {
+      "content": "<unused181>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70182": {
+      "content": "<unused182>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70183": {
+      "content": "<unused183>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70184": {
+      "content": "<unused184>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70185": {
+      "content": "<unused185>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70186": {
+      "content": "<unused186>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70187": {
+      "content": "<unused187>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70188": {
+      "content": "<unused188>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70189": {
+      "content": "<unused189>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70190": {
+      "content": "<unused190>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70191": {
+      "content": "<unused191>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70192": {
+      "content": "<unused192>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70193": {
+      "content": "<unused193>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70194": {
+      "content": "<unused194>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70195": {
+      "content": "<unused195>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70196": {
+      "content": "<unused196>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70197": {
+      "content": "<unused197>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70198": {
+      "content": "<unused198>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70199": {
+      "content": "<unused199>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70200": {
+      "content": "<unused200>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70201": {
+      "content": "<unused201>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70202": {
+      "content": "<unused202>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70203": {
+      "content": "<unused203>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70204": {
+      "content": "<unused204>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70205": {
+      "content": "<unused205>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70206": {
+      "content": "<unused206>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70207": {
+      "content": "<unused207>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70208": {
+      "content": "<unused208>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70209": {
+      "content": "<|SYSTEM|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70210": {
+      "content": "<|USER|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70211": {
+      "content": "<|RESPONSE|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|SYSTEM|>",
+    "<|USER|>",
+    "<|RESPONSE|>"
+  ],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 4096,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<unk>"
+}

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb5a6bc0ff399e149a474126d4a0dc7a10affb4a6a9bb06cb2ac98e5b92022a5
+size 129

warnings.py ADDED Viewed

	@@ -0,0 +1,22 @@

class VersionedDeprecationWarning(DeprecationWarning):
    """A deprecation warning whose text also names the removal version.

    The text handed to the base ``DeprecationWarning`` is ``message``
    followed by ``' It will be removed in version {remove_version}.'``.

    Attributes:
        message (str): The deprecation message describing why the feature is deprecated.
        remove_version (str): The version in which the feature will be removed.

    Example:
        >>> warning = VersionedDeprecationWarning(
        ...     "Function XYZ is deprecated.",
        ...     remove_version="2.0.0",
        ... )
        >>> str(warning)
        'Function XYZ is deprecated. It will be removed in version 2.0.0.'
        >>> warning.remove_version
        '2.0.0'
    """

    def __init__(self, message: str, remove_version: str) -> None:
        """Build the combined warning text and record both inputs.

        Args:
            message: Why the feature is deprecated.
            remove_version: The version in which the feature will be removed.
        """
        super().__init__(message + f' It will be removed in version {remove_version}.')
        # Keep the raw inputs so the attributes documented on the class
        # really exist and callers can inspect them programmatically.
        self.message = message
        self.remove_version = remove_version