codescripts / length.py
f541119578's picture
Upload folder using huggingface_hub
fdf190d verified
raw
history blame contribute delete
331 Bytes
import json
from tqdm import tqdm
# Input: one JSON object per line, each carrying an "instruction" string.
INPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_deduplicated.json"
# Output: the subset of input lines whose instruction is short enough.
OUTPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_length.json"
# Records with more whitespace-separated tokens than this are dropped.
MAX_INSTRUCTION_WORDS = 500


def instruction_word_count(line: str) -> int:
    """Return the number of whitespace-separated tokens in a record's instruction.

    Args:
        line: One JSON-encoded object containing an "instruction" string.

    Returns:
        ``len(instruction.split())`` for that record.

    Raises:
        json.JSONDecodeError: If ``line`` is not valid JSON.
        KeyError: If the decoded object has no "instruction" key.
    """
    record = json.loads(line)
    return len(record["instruction"].split())


def filter_lines(lines, max_words: int = MAX_INSTRUCTION_WORDS):
    """Yield the lines whose instruction has at most ``max_words`` tokens.

    Args:
        lines: Iterable of JSON-lines strings (for example an open text file).
        max_words: Inclusive upper bound on the instruction's token count.

    Yields:
        Each qualifying line exactly as received, including its line ending,
        so the output stays byte-compatible with the input records.
    """
    for line in lines:
        # Whitespace-only lines carry no record; skip them rather than
        # letting json.loads abort the whole run.
        if not line.strip():
            continue
        if instruction_word_count(line) <= max_words:
            yield line


def main() -> None:
    """Copy the short-instruction records from INPUT_PATH to OUTPUT_PATH."""
    # The input is opened read-only and both handles are closed (and the
    # output flushed) by the context managers, even if a record fails to parse.
    with open(INPUT_PATH, "r", encoding="utf-8") as source:
        with open(OUTPUT_PATH, "w", encoding="utf-8") as sink:
            # Stream the file instead of materialising every line in memory;
            # tqdm then reports a running count without a total.
            sink.writelines(filter_lines(tqdm(source)))


if __name__ == "__main__":
    main()