import os
from typing import Dict, List, Optional, Tuple, Union

import torch
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from transformers import AutoModel, AutoTokenizer

os.environ["TOKENIZERS_PARALLELISM"] = "false"

DEVICE = "cuda"
DEVICE_ID = "0"
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE


def torch_gc():
    # Release cached GPU memory after generation.
    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()


def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    # transformer.word_embeddings occupies 1 layer,
    # transformer.final_layernorm and lm_head together occupy 1 layer,
    # and transformer.layers occupies 28 layers,
    # so 30 layers in total are distributed across num_gpus cards.
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # bugfix: on Linux, torch.embedding may be called with weight and input on
    # different devices, raising a RuntimeError.
    # On Windows, model.device is set to transformer.word_embeddings.device;
    # on Linux, model.device is set to lm_head.device.
    # When chat or stream_chat is called, input_ids is moved to model.device;
    # if transformer.word_embeddings.device differs from model.device, a
    # RuntimeError follows. Therefore transformer.word_embeddings,
    # transformer.final_layernorm, and lm_head are all placed on the first card.
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': 0, 'lm_head': 0}

    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map
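

# Illustration only (not part of the original module): with num_gpus=2,
# per_gpu_layers is 15, so the embeddings, final layernorm, lm_head, and
# transformer.layers.0 through transformer.layers.12 land on GPU 0, while
# transformer.layers.13 through transformer.layers.27 land on GPU 1:
#
#   auto_configure_device_map(2)
#   # {'transformer.word_embeddings': 0, 'transformer.final_layernorm': 0,
#   #  'lm_head': 0,
#   #  'transformer.layers.0': 0, ..., 'transformer.layers.12': 0,
#   #  'transformer.layers.13': 1, ..., 'transformer.layers.27': 1}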


class ChatLLM(LLM):
    max_token: int = 10000
    temperature: float = 0.1
    top_p: float = 0.9
    history: List = []
    tokenizer: object = None
    model: object = None

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        return "ChatLLM"

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None) -> str:
        if self.model == 'Minimax':
            import requests

            group_id = os.getenv('group_id')
            api_key = os.getenv('api_key')
            url = f'https://api.minimax.chat/v1/text/chatcompletion?GroupId={group_id}'
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            }
            request_body = {
                "model": "abab5-chat",
                "tokens_to_generate": 512,
                'messages': []
            }
            # Replay the conversation history as alternating USER/BOT messages.
            for h_input, h_reply in self.history:
                request_body['messages'].append({
                    "sender_type": "USER",
                    "text": h_input
                })
                request_body['messages'].append({"sender_type": "BOT", "text": h_reply})
            request_body['messages'].append({"sender_type": "USER", "text": prompt})
            resp = requests.post(url, headers=headers, json=request_body)
            response = resp.json()['reply']
            # Append this turn's AI reply to messages.
            request_body['messages'].append({"sender_type": "BOT", "text": response})
            self.history.append((prompt, response))
        else:
            response, _ = self.model.chat(
                self.tokenizer,
                prompt,
                history=self.history,
                max_length=self.max_token,
                temperature=self.temperature,
            )
            torch_gc()
            if stop is not None:
                response = enforce_stop_tokens(response, stop)
            self.history = self.history + [[None, response]]
        return response

    def load_model(self,
                   model_name_or_path: str = "THUDM/chatglm-6b-int4",
                   llm_device=DEVICE,
                   device_map: Optional[Dict[str, int]] = None,
                   **kwargs):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
        if torch.cuda.is_available() and llm_device.lower().startswith("cuda"):
            # Choose single- or multi-GPU deployment based on the number of
            # GPUs available on the current machine.
            num_gpus = torch.cuda.device_count()
            if num_gpus < 2 and device_map is None:
                self.model = (
                    AutoModel.from_pretrained(
                        model_name_or_path,
                        trust_remote_code=True,
                        **kwargs)
                    .half()
                    .cuda()
                )
            else:
                from accelerate import dispatch_model
                model = AutoModel.from_pretrained(
                    model_name_or_path, trust_remote_code=True, **kwargs).half()
                # A custom device_map may be passed in to control how the
                # model is sharded across cards.
                if device_map is None:
                    device_map = auto_configure_device_map(num_gpus)
                self.model = dispatch_model(model, device_map=device_map)
        else:
            # No usable GPU: load in float32 on the requested device (CPU).
            self.model = (
                AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True)
                .float()
                .to(llm_device)
            )
        self.model = self.model.eval()
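

# Minimal usage sketch (illustrative, not part of the original module). It
# assumes the THUDM/chatglm-6b-int4 checkpoint is reachable via the Hugging
# Face Hub and that the legacy langchain LLM call interface used above
# (llm(prompt) -> str) is available:
if __name__ == "__main__":
    llm = ChatLLM()
    llm.load_model(model_name_or_path="THUDM/chatglm-6b-int4")
    print(llm("Briefly introduce yourself."))  # dispatches to ChatLLM._call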