"""Filter a JSON-lines dataset, keeping records with short instructions.

Reads one JSON object per line from ``INPUT_PATH`` and copies to
``OUTPUT_PATH`` every line whose ``"instruction"`` value splits into at most
``MAX_WORDS`` whitespace-separated tokens. Kept lines are written verbatim
(they are not re-serialized).

Raises:
    json.JSONDecodeError: if a non-blank line is not valid JSON.
    KeyError: if a record has no ``"instruction"`` key.
"""
import json

from tqdm import tqdm

INPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_deduplicated.json"
OUTPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_length.json"

# Upper bound (inclusive) on the number of whitespace-separated tokens.
MAX_WORDS = 500

# Context managers guarantee both files are flushed and closed even if a
# malformed record raises part-way through the run.
with open(INPUT_PATH, "r", encoding="utf-8") as src, \
        open(OUTPUT_PATH, "w", encoding="utf-8") as dst:
    # Stream the input instead of loading every line into memory; tqdm then
    # reports a running count rather than a percentage.
    for line in tqdm(src):
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue
        record = json.loads(line)
        if len(record["instruction"].split()) <= MAX_WORDS:
            dst.write(line)