Spaces:
Build error
Build error
| # import packages | |
| import os | |
| from tqdm import tqdm | |
| import warnings | |
| import json | |
| import torch.nn.functional as F | |
| import torch | |
| import gc | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| from datetime import datetime | |
| import argparse | |
| import mamba_ssm | |
| import rwkv | |
| RWKV4_TOKENIZER_FILE = "./support/20B_tokenizer.json" | |
def load_list_from_json(file_path):
    """Read a JSON file and return its deserialized payload.

    :param file_path: path of the JSON file to read (expected to contain
        a list of strings).
    :return: the object decoded from the file.
    """
    with open(file_path, 'r', encoding='utf-8') as fp:
        return json.load(fp)
def calculate_log_sum(logits, target_token_ids):
    """Sum of negative log-probabilities of each next token.

    Standard causal-LM shift: position t's logits predict token t+1, so the
    last logits row and the first target token are dropped.

    :param logits: tensor of shape (seq_len, vocab_size).
    :param target_token_ids: 1-D tensor of seq_len token ids.
    :return: total negative log-likelihood as a Python float.
    """
    preds = logits[:-1, :]
    labels = target_token_ids[1:]
    # gather picks each label's log-prob; negate to get per-token NLL
    nll = -F.log_softmax(preds, dim=-1).gather(1, labels.unsqueeze(1)).squeeze()
    return torch.sum(nll, dim=-1).item()
def print_model_parameters_in_billions(model):
    """Print the model's total parameter count, expressed in billions."""
    n_params = 0
    for p in model.parameters():
        n_params += p.numel()
    print(f"Model parameters: {n_params / 1e9:.3f} billion")
def make_log(data_dict, folder_path):
    """Save `data_dict` as a timestamped JSON file inside `folder_path`.

    The folder is created on demand. Failures are reported to stdout rather
    than raised (best-effort logging).

    :param data_dict: JSON-serializable dictionary to persist.
    :param folder_path: directory to write the log file into.
    """
    if not os.path.exists(folder_path):
        try:
            # exist_ok=True closes the race window between the exists()
            # check above and the actual creation (e.g. concurrent runs)
            os.makedirs(folder_path, exist_ok=True)
            print(f"Directory created at {folder_path}")
        except Exception as e:
            print(f"Error creating directory: {e}")
            return
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    file_name = f"{timestamp}.json"
    file_path = os.path.join(folder_path, file_name)
    try:
        # explicit utf-8 to match load_list_from_json and be platform-independent
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(data_dict, file, indent=4)
        print(f"Dictionary saved successfully to {file_path}")
    except Exception as e:
        print(f"Error saving dictionary: {e}")
def load_rwkv(path):
    """Load an RWKV checkpoint with the world-vocab tokenizer on CUDA fp16.

    :param path: path to the RWKV model weights.
    :return: (model, tokenizer) tuple.
    """
    # enable the JIT and CUDA kernels before importing the rwkv package
    os.environ['RWKV_JIT_ON'] = '1'
    os.environ["RWKV_CUDA_ON"] = '1'
    from rwkv.model import RWKV
    from rwkv.utils import PIPELINE
    model = RWKV(model=path, strategy='cuda fp16')
    pipeline = PIPELINE(model, r"rwkv_vocab_v20230424")
    return model, pipeline.tokenizer
def load_rwkv4pile(path):
    """Load an RWKV-4 (Pile) checkpoint with its JSON tokenizer on CUDA fp16.

    :param path: path to the RWKV-4 model weights.
    :return: (model, tokenizer) tuple; the tokenizer is the one referenced by
        RWKV4_TOKENIZER_FILE.
    """
    # enable the JIT and CUDA kernels before importing the rwkv package
    os.environ['RWKV_JIT_ON'] = '1'
    os.environ["RWKV_CUDA_ON"] = '1'
    from rwkv.model import RWKV
    from rwkv.utils import PIPELINE
    model = RWKV(model=path, strategy='cuda fp16')
    pipeline = PIPELINE(model, RWKV4_TOKENIZER_FILE)
    return model, pipeline.tokenizer
def load_hf_model(path, cache_path):
    """Load a Hugging Face causal LM and its tokenizer onto CUDA.

    :param path: model name or local path for AutoTokenizer/AutoModel.
    :param cache_path: optional HF cache directory; None uses the default.
    :return: (model, tokenizer) tuple, model in eval mode.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained(path)
    kwargs = {"device_map": "cuda", "trust_remote_code": True}
    if cache_path is not None:
        kwargs["cache_dir"] = cache_path
    hf_model = AutoModelForCausalLM.from_pretrained(path, **kwargs).eval()
    print_model_parameters_in_billions(hf_model)
    return hf_model, hf_tokenizer
def load_mamba(path):
    """Load a Mamba checkpoint (fp16 on CUDA) with the GPT-NeoX-20B tokenizer.

    :param path: model name or local path for MambaLMHeadModel.
    :return: (model, tokenizer) tuple.
    """
    from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
    tok = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    lm = MambaLMHeadModel.from_pretrained(path, device="cuda", dtype=torch.float16)
    # downstream eval code reads model.device; MambaLMHeadModel does not set it
    lm.device = torch.device('cuda')
    print_model_parameters_in_billions(lm)
    return lm, tok
def eval_rwkv(model, tokenizer, texts, chunk_size, v4pile=False):
    """Tokenize each text and run one truncated forward pass through an RWKV model.

    NOTE(review): the per-sample loss accumulation below is commented out, so
    this currently returns artifacts of the LAST sample in `texts` only:
    (logits, logits, token chunk, tokenizer) — the logits tensor is duplicated
    in the tuple. If `texts` is empty this raises NameError (`logit` is never
    bound) — presumably callers always pass at least one text; verify.

    :param model: loaded RWKV model.
    :param tokenizer: tokenizer from the RWKV pipeline.
    :param texts: list of raw text samples.
    :param chunk_size: max number of tokens fed to the model per sample.
    :param v4pile: True for the RWKV-4 Pile tokenizer, whose encode() returns
        an object carrying `.ids` instead of a plain list of ids.
    """
    rwkv_test_data = []            # unused while the accumulation code is commented out
    rwkv_token_length_list = []    # unused while the accumulation code is commented out
    for idx, sample in tqdm(enumerate(texts), total=len(texts)):
        with torch.no_grad():
            if v4pile:
                input_seq = tokenizer.encode(sample).ids # v4
            else:
                input_seq = tokenizer.encode(sample)
            input_length = len(input_seq)   # leftover from the disabled chunked loop
            neg_log_prob_temp = 0           # leftover from the disabled accumulation
            # for begin in range(0, input_length, chunk_size):
            # only the first chunk_size tokens are evaluated (chunked loop disabled)
            input_chunk = input_seq[:chunk_size]
            logit = model.forward(input_chunk, None, full_output=True)[0]
            # a single-token input yields a 1-D logits tensor; restore the seq dim
            if len(input_chunk) == 1:
                logit = logit.unsqueeze(0)
            # log_sum = calculate_log_sum(logit, torch.tensor(input_chunk).cuda())
            # neg_log_prob_temp += log_sum
            # rwkv_token_length_list.append(input_length)
            # rwkv_test_data.append(neg_log_prob_temp)
    # data_dict = {
    #     'neg_log_prob_sum': sum(rwkv_test_data) / len(rwkv_test_data),
    #     'avg tokens': sum(rwkv_token_length_list) / len(rwkv_token_length_list),
    # }
    # print(f'log probability sum: {sum(rwkv_test_data) / len(rwkv_test_data):.2f}')
    # print(f'avg tokens: {sum(rwkv_token_length_list) / len(rwkv_token_length_list):.0f}')
    return logit,logit,input_chunk,tokenizer
def eval_hf_model(model, tokenizer, texts, chunk_size):
    """Tokenize each text and run one truncated forward pass through a HF causal LM.

    NOTE(review): as in eval_rwkv, the loss accumulation is commented out;
    the function returns (logits, token chunk, tokenizer) for the LAST sample
    only, and raises NameError if `texts` is empty — verify callers.

    :param model: Hugging Face causal LM (already on its target device).
    :param tokenizer: matching Hugging Face tokenizer.
    :param texts: list of raw text samples.
    :param chunk_size: max number of tokens fed to the model per sample.
    """
    data = []                # unused while the accumulation code is commented out
    token_length_list = []   # unused while the accumulation code is commented out
    for idx, sample in tqdm(enumerate(texts), total=len(texts)):
        with torch.no_grad():
            inputs = tokenizer(sample, return_tensors='pt')
            inputs = inputs.to(model.device)
            seq_length = inputs['input_ids'].shape[-1]  # leftover from the disabled chunked loop
            neg_log_prob_temp = 0                       # leftover from the disabled accumulation
            # for begin in range(0, seq_length, chunk_size):
            # only the first chunk_size tokens are evaluated (chunked loop disabled)
            input_chunk = inputs['input_ids'][:, :chunk_size]
            # drop the batch dimension: resulting shape is (seq_len, vocab_size)
            logit = model.forward(input_ids=input_chunk).logits[0, :, :]
            # log_sum = calculate_log_sum(logit, input_chunk.squeeze(0))
            # neg_log_prob_temp += log_sum
            # token_length_list.append(seq_length)
            # data.append(neg_log_prob_temp)
    # data_dict = {
    #     'neg_log_prob_sum': sum(data) / len(data),
    #     'avg tokens': sum(token_length_list) / len(token_length_list),
    # }
    # print(f'log probability sum: {sum(data) / len(data):.2f}')
    # print(f'avg tokens: {sum(token_length_list) / len(token_length_list):.0f}')
    return logit,input_chunk,tokenizer
| # if __name__ == '__main__': | |
| # parser = argparse.ArgumentParser() | |
| # parser.add_argument('--model', type=str, required=True, help='model name or path') | |
| # parser.add_argument('--model_type', choices=['hf', 'rwkv', 'mamba', 'rwkv4pile'], required=True, help='model type') | |
| # parser.add_argument('--data', type=str, required=True, help='data path (json file)') | |
| # parser.add_argument('--log_path', type=str, default='./logs/', help='log file path') | |
| # parser.add_argument('--model_cache', type=str, help='hugging face model cache') | |
| # parser.add_argument('--chunk_size', type=int, default=1024, help='chunk size') | |
def run_get_loss(args):
    """Load the dataset and the requested model, then run the matching evaluator.

    :param args: namespace with .data, .model, .model_type
        ('hf' | 'rwkv' | 'mamba' | 'rwkv4pile'), .model_cache, .chunk_size.
    :return: whatever the selected eval_* function returns.
    :raises NotImplementedError: for an unrecognized model_type.
    """
    # load data
    texts = load_list_from_json(args.data)
    print(f'data size: {len(texts)}')
    # load model via a dispatch table keyed on model_type
    loaders = {
        'hf': lambda: load_hf_model(args.model, args.model_cache),
        'rwkv': lambda: load_rwkv(args.model),
        'mamba': lambda: load_mamba(args.model),
        'rwkv4pile': lambda: load_rwkv4pile(args.model),
    }
    if args.model_type not in loaders:
        raise NotImplementedError
    model, tokenizer = loaders[args.model_type]()
    # eval — HF and Mamba share the transformers-style interface
    if args.model_type in ('hf', 'mamba'):
        return eval_hf_model(model=model, tokenizer=tokenizer, texts=texts,
                             chunk_size=args.chunk_size)
    if args.model_type == 'rwkv':
        return eval_rwkv(model=model, tokenizer=tokenizer, texts=texts,
                         chunk_size=args.chunk_size)
    if args.model_type == 'rwkv4pile':
        return eval_rwkv(model=model, tokenizer=tokenizer, texts=texts,
                         chunk_size=args.chunk_size, v4pile=True)
    raise NotImplementedError
from types import SimpleNamespace

if __name__ == '__main__':
    args = SimpleNamespace(
        model='microsoft/phi-2',
        texts=['Hello FreshBench !'],
        model_type='hf',
        data='data.json',
        model_cache=None,
        chunk_size=1024,
    )
    # BUG FIX: args was built but run_get_loss was never invoked, so running
    # the script as a program did nothing at all.
    run_get_loss(args)
| # def run_get_loss(input_string, model_type): | |
| # # load data | |
| # texts = [input_string] | |
| # print(f'data size: {len(texts)}') | |
| # # load model | |
| # if model_type == 'hf': | |
| # model, tokenizer = load_hf_model(args.model, args.model_cache)# tokenzier path, model path | |
| # elif model_type == 'rwkv': | |
| # model, tokenizer = load_rwkv(args.model) | |
| # elif model_type == 'mamba': | |
| # model, tokenizer = load_mamba(args.model) | |
| # elif model_type == 'rwkv4pile': | |
| # model, tokenizer = load_rwkv4pile(args.model) | |
| # else: | |
| # raise NotImplementedError | |
| # # eval | |
| # if model_type in ['hf', 'mamba']: | |
| # results = eval_hf_model(model=model, tokenizer=tokenizer, texts=texts, chunk_size=args.chunk_size) | |
| # elif model_type == 'rwkv': | |
| # results = eval_rwkv(model=model, tokenizer=tokenizer, texts=texts, chunk_size=args.chunk_size) | |
| # elif model_type == 'rwkv4pile': | |
| # results = eval_rwkv(model=model, tokenizer=tokenizer, texts=texts, chunk_size=args.chunk_size, v4pile=True) | |
| # else: | |
| # raise NotImplementedError | |
| # results['model_name_or_path'] = args.model | |
| # results['data_path'] = args.data | |
| # results['chunk_size'] = args.chunk_size | |
| # make_log(results, args.log_path) | |
| # print(json.dumps(results, indent=4, ensure_ascii=False)) |