codescripts / filter.py
f541119578's picture
Upload folder using huggingface_hub
fdf190d verified
import os
import json
from tqdm import tqdm
# Directory holding the raw generation dumps (one JSON object per line).
DATA_DIR = "/home/aiscuser/fhw/data"
# Destination for the filtered records, also one JSON object per line.
OUTPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_filtered.json"
# Only input files whose name contains this tag are processed.
SOURCE_TAG = "QwQ-32B-Preview"

# Instructions with fewer whitespace-separated words than this are kept as-is.
MAX_SHORT_WORDS = 200
# Number of leading lines scanned for task keywords.
TOP_LINES = 5
# Characters trimmed from both ends of an instruction.
_WHITESPACE = " \n\t"

# Substrings marking a line as the user's request when no '?' is present.
_REQUEST_SUBSTRINGS = ("How can", "how can", "Can you", "can you")
_REQUEST_PREFIXES = ("Please", "please")

# Substrings marking one of the first lines as the task statement.
# ("Implementing"/"implementing" are omitted: any line containing them
# already contains "Implement"/"implement".)
_TASK_KEYWORDS = (
    "Provide", "provide",
    "Ensure", "ensure",
    "Write", "write",
    "Implement", "implement",
    "Create", "create",
    "Explain",
    "You are",
    "Given",
    "Writing", "writing",
    "Creating", "creating",
    "Design", "design",
    "Consider", "consider",
    "Calculate", "calculate",
)

# Markers that introduce answer text following the instruction; tried in order.
_ANSWER_MARKERS = ("\n\nSure", "\n\nCertainly", "\n\nHere", "\n\nNow")


def _is_request_line(line):
    """Return True if *line* looks like a direct request ("How can", "Please", ...)."""
    return (
        any(marker in line for marker in _REQUEST_SUBSTRINGS)
        or line.startswith(_REQUEST_PREFIXES)
    )


def _is_task_line(line):
    """Return True if *line* contains one of the task keywords."""
    return any(keyword in line for keyword in _TASK_KEYWORDS)


def extract_instruction(text):
    """Trim trailing non-instruction text from a raw instruction string.

    The rules are tried in order; the first one that applies decides the result:

    1. Fewer than ``MAX_SHORT_WORDS`` words: return the cleaned text unchanged.
    2. The text contains ``'?'``: keep everything up to the last ``'?'``.
    3. Scanning lines from the end, the first line that looks like a request
       ("How can", "Can you", or starting with "Please"): keep everything up
       to and including that line.
    4. Among the first ``TOP_LINES`` lines, the first one containing a task
       keyword: keep everything up to and including that line.
    5. An answer marker ("\\n\\nSure", "\\n\\nCertainly", "\\n\\nHere",
       "\\n\\nNow", tried in that order): keep everything before it.

    Args:
        text: The raw ``instruction`` field of a record.

    Returns:
        The trimmed instruction, or ``None`` when no rule applies and the
        record should be dropped.
    """
    # Drop a leading "1." list marker. str.strip('1.') would instead remove
    # any run of '1'/'.' characters from BOTH ends (eating trailing periods
    # and mangling prefixes such as "10.").
    if text.startswith("1."):
        text = text[len("1."):]
    # One strip over the whole character set; chaining single-character
    # strips leaves interleaved whitespace such as "foo \n" -> "foo ".
    text = text.strip(_WHITESPACE)

    if len(text.split()) < MAX_SHORT_WORDS:
        return text

    last_question = text.rfind("?")
    if last_question != -1:
        return text[:last_question + 1].strip(_WHITESPACE)

    lines = text.split("\n")

    # Walk backwards so the LAST request-like line wins.
    for end in range(len(lines), 0, -1):
        if _is_request_line(lines[end - 1]):
            return "\n".join(lines[:end]).strip(_WHITESPACE)

    for index, line in enumerate(lines[:TOP_LINES]):
        if _is_task_line(line):
            return "\n".join(lines[:index + 1]).strip(_WHITESPACE)

    for marker in _ANSWER_MARKERS:
        cut = text.find(marker)
        if cut != -1:
            return text[:cut].strip(_WHITESPACE)

    return None


def main(data_dir=DATA_DIR, output_path=OUTPUT_PATH):
    """Filter every tagged dump in *data_dir* into a single JSONL file.

    Each line of every file whose name contains ``SOURCE_TAG`` is parsed as
    JSON; its ``"instruction"`` field is replaced by the result of
    ``extract_instruction`` and the record is written to *output_path*.
    Records for which ``extract_instruction`` returns ``None`` are skipped.

    Args:
        data_dir: Directory to scan for input files.
        output_path: File to (over)write with the filtered records.
    """
    # List the inputs before creating the output, as the output lives in the
    # same directory by default.
    names = os.listdir(data_dir)
    with open(output_path, "w", encoding="utf-8") as fw:
        for name in names:
            if SOURCE_TAG not in name:
                continue
            with open(os.path.join(data_dir, name), "r", encoding="utf-8") as f:
                lines = f.readlines()
            for line in tqdm(lines):
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                d = json.loads(line)
                cleaned = extract_instruction(d["instruction"])
                if cleaned is None:
                    continue
                d["instruction"] = cleaned
                fw.write(json.dumps(d) + "\n")


if __name__ == "__main__":
    main()