import torch
from session import logger, log_sys_info
from transformers import AutoTokenizer, AutoModel

chatglm = 'THUDM/chatglm-6b'
chatglm_rev = '4de8efe'
int8_model = 'KumaTea/twitter-int8'
int8_model_rev = '1136001'

# Alternative: clone the model repo manually instead of letting
# transformers download it.
# import subprocess
# result = subprocess.run(
#     ['git', 'clone', 'https://huggingface.co/KumaTea/twitter-int8', 'model'],
#     capture_output=True, text=True
# )
# print(result.stdout)

# Force CPU execution by monkey-patching torch.cuda:
# device = torch.device('cpu')
# torch.cuda.current_device = lambda: device

log_sys_info()

# We have to use full precision (float32), as some token ids are > 65535
# and fall outside the representable range of fp16.
model = AutoModel.from_pretrained(
    int8_model,
    trust_remote_code=True,
    revision=int8_model_rev
).float()  # .to(device)
tokenizer = AutoTokenizer.from_pretrained(
    chatglm,
    trust_remote_code=True,
    revision=chatglm_rev
)

# dump a log to ensure everything works well
# print(model.peft_config)

model.eval()
# print(model)

torch.set_default_tensor_type(torch.FloatTensor)

logger.info('[SYS] Model loaded')
log_sys_info()
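
# Optional smoke test (a minimal sketch, kept disabled like the other
# diagnostics above): ChatGLM's remote modeling code exposes a
# chat(tokenizer, query, history=...) helper returning (response, history).
# Uncomment to confirm the full-precision CPU model actually generates text
# after loading; note that CPU inference is slow, so this is best left off
# outside of debugging.
# response, history = model.chat(tokenizer, 'Hello!', history=[])
# logger.info(f'[SYS] Smoke test response: {response}')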