"""Generate QwQ-32B-Preview answers for one shard of a JSON dataset.

The dataset is a JSON object keyed by 1-based sample ids stored as strings
("1", "2", ...); each sample is an object with a ``content`` field that is
used as the prompt.  Shard ``K`` (``--index K``) covers sample ids
``K * gap + 1`` through ``(K + 1) * gap``.  The generated text is stored
under ``answer_QwQ`` on each processed sample, and the whole dataset (not
just the shard) is written to a per-shard output file.
"""

import argparse
import contextlib
import json
import os
import pprint

DATA_PATH = "/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/merged_data.json"
# Alternative input: "DataSet/train_samples_all_tuning.json"

SAVE_PATH_PREFIX = '/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/merged_data_QwQ_'
# Alternative (unsharded) output:
# '/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/merged_data_QwQ.json'

SYSTEM_PROMPT = "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."

model_name = "Model/QwQ-32B-Preview"
gap = 1000        # number of samples per shard
SAVE_EVERY = 20   # checkpoint whenever the 0-based sample index is a multiple of this

# Populated by _load_model(); kept at module level because chat_QwQ reads them.
model = None
tokenizer = None


def read_json(file_path):
    """Return the parsed JSON document stored at *file_path* (read as UTF-8)."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def write_json(file_path, data):
    """Serialize *data* to *file_path* as UTF-8 JSON (indent 4, non-ASCII kept).

    The document is first written to ``<file_path>.tmp`` and then moved over
    the destination with ``os.replace``, so an interruption or serialization
    error never leaves a truncated file where a previous checkpoint was.
    Any exception is re-raised after the temporary file is removed.
    """
    tmp_path = os.fspath(file_path) + ".tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        os.replace(tmp_path, file_path)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error wins.
        with contextlib.suppress(OSError):
            os.remove(tmp_path)
        raise


def _load_model():
    """Load the causal LM and its tokenizer into the module-level globals."""
    global model, tokenizer
    # Imported lazily so argument errors and --help do not pay for transformers.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)


def chat_QwQ(prompt, max_new_tokens=512):
    """Return the model's reply to *prompt* as a decoded string.

    Args:
        prompt: User message text; sent together with the fixed system prompt.
        max_new_tokens: Upper bound on generated tokens (default 512, the
            value this script has always used).

    The model and tokenizer are loaded on first use if they are not already.
    """
    if model is None or tokenizer is None:
        _load_model()

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    # Drop the first len(input_ids) tokens of every output sequence so that
    # only the newly generated continuation is decoded.
    completion_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]


def _parse_args(argv=None):
    """Parse command-line arguments and reject a negative shard index."""
    parser = argparse.ArgumentParser(description="Process a dataset with specific index range.")
    parser.add_argument(
        "--index",
        type=int,
        required=True,
        help=f"Zero-based shard index; shard K covers sample ids K*{gap}+1 through (K+1)*{gap}.",
    )
    args = parser.parse_args(argv)
    if args.index < 0:
        parser.error("--index must be non-negative")
    return args


def main(argv=None):
    """Answer every sample of the requested shard and save the results."""
    # Validate the command line first: loading the data and model is slow.
    args = _parse_args(argv)
    index = args.index
    save_path = SAVE_PATH_PREFIX + str(index) + '.json'

    from tqdm import tqdm

    data = read_json(DATA_PATH)
    _load_model()

    shard = range(index * gap, (index + 1) * gap)
    missing = sum(1 for i in shard if str(i + 1) not in data)
    if missing:
        # Typically the last shard, which may extend past the end of the dataset.
        print(f"{missing} of {gap} sample ids in shard {index} are not in the dataset; skipping them.")

    try:
        for i in tqdm(shard):
            key = str(i + 1)  # dataset keys are 1-based strings
            if key not in data:
                continue

            prompt = data[key]['content']
            answer = chat_QwQ(prompt)
            data[key]['answer_QwQ'] = answer

            pprint.pprint(prompt)
            pprint.pprint(answer)

            if i % SAVE_EVERY == 0:
                write_json(save_path, data)
    finally:
        # Runs on success and on failure, so answers generated since the
        # last periodic checkpoint are not lost if the loop is interrupted.
        write_json(save_path, data)


if __name__ == "__main__":
    main()