|
import os |
|
import json |
|
from tqdm import tqdm |
|
# Directory holding the raw generation dumps, and the file the filtered
# records are written to (one JSON object per line).
DATA_DIR = "/home/aiscuser/fhw/data"
OUTPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_filtered.json"

# Only input files whose name contains this tag are processed.
MODEL_TAG = "QwQ-32B-Preview"

# Characters trimmed from both ends of an instruction.
_EDGE_WHITESPACE = " \n\t"

# A line containing one of these phrases, or starting with one of the
# prefixes, is treated as the last line of the actual request.
_REQUEST_PHRASES = ("How can", "how can", "Can you", "can you")
_REQUEST_PREFIXES = ("Please", "please")

# Task verbs looked for in the first few lines of an instruction.
# ("Implementing"/"implementing" need no entry of their own: they contain
# "Implement"/"implement".  The casing asymmetry of "Explain", "You are" and
# "Given" is kept as in the original rule set.)
_LEAD_KEYWORDS = (
    "Provide", "provide",
    "Ensure", "ensure",
    "Write", "write",
    "Implement", "implement",
    "Create", "create",
    "Explain",
    "You are",
    "Given",
    "Writing", "writing",
    "Creating", "creating",
    "Design", "design",
    "Consider", "consider",
    "Calculate", "calculate",
)
_LEAD_LINES = 5

# Paragraph openers that signal the text has drifted into an answer.  They
# are tried in this priority order; the first marker that occurs anywhere in
# the text wins, even if a later marker occurs earlier in the text.
_ANSWER_MARKERS = ("\n\nSure", "\n\nCertainly", "\n\nHere", "\n\nNow")


def _drop_leading_enumerator(text):
    """Remove a leading ``1.`` list marker, leaving decimals like ``1.5`` alone."""
    # str.strip('1.') would treat '1.' as a *set of characters* and also eat
    # trailing periods and digits, so the prefix is removed explicitly.
    if text.startswith("1.") and not text[2:3].isdigit():
        return text[2:]
    return text


def clean_instruction(text, max_words=200):
    """Return the instruction part of *text*, or ``None`` to drop the record.

    The text is trimmed and a leading ``1.`` marker is removed.  Texts shorter
    than *max_words* whitespace-separated words are returned as they are.
    Longer texts are truncated by the first rule that applies:

    1. cut right after the last ``?``;
    2. cut after the last line that contains a request phrase
       ("How can", "Can you", ...) or starts with "Please"/"please";
    3. cut after the first of the leading ``_LEAD_LINES`` lines that contains
       a task keyword ("Write", "Implement", "Given", ...);
    4. cut before the first answer marker found ("\\n\\nSure", ...).

    If no rule applies, ``None`` is returned.
    """
    text = text.strip(_EDGE_WHITESPACE)
    text = _drop_leading_enumerator(text).strip(_EDGE_WHITESPACE)

    if len(text.split()) < max_words:
        return text

    # Rule 1: everything after the final question mark is discarded.  The
    # text is already trimmed at the front and now ends in '?', so no further
    # stripping is required.
    last_question = text.rfind("?")
    if last_question != -1:
        return text[: last_question + 1]

    rows = text.split("\n")

    # Rule 2: walk backwards so the *last* request line is the cut point.
    for idx in range(len(rows) - 1, -1, -1):
        row = rows[idx]
        if row.startswith(_REQUEST_PREFIXES) or any(
            phrase in row for phrase in _REQUEST_PHRASES
        ):
            return "\n".join(rows[: idx + 1]).strip(_EDGE_WHITESPACE)

    # Rule 3: only the opening lines are inspected; the first hit wins.
    for idx, row in enumerate(rows[:_LEAD_LINES]):
        if any(keyword in row for keyword in _LEAD_KEYWORDS):
            return "\n".join(rows[: idx + 1]).strip(_EDGE_WHITESPACE)

    # Rule 4: cut where the text starts to look like an answer.
    for marker in _ANSWER_MARKERS:
        cut = text.find(marker)
        if cut != -1:
            return text[:cut].strip(_EDGE_WHITESPACE)

    return None


def main():
    """Filter every tagged JSONL file in ``DATA_DIR`` into ``OUTPUT_PATH``.

    Each input line is parsed as a JSON object with an ``"instruction"`` key;
    the instruction is passed through :func:`clean_instruction` and the record
    is written out unless it was rejected.
    """
    # List the directory before the output file is created, as the original
    # script did.
    names = os.listdir(DATA_DIR)

    # Context managers guarantee both handles are closed (and the output is
    # flushed) even if a malformed line raises part-way through.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        for name in names:
            if MODEL_TAG not in name:
                continue

            # Inputs are only read, so plain "r" is enough.  The lines are
            # materialised so tqdm can display a total.
            with open(f"{DATA_DIR}/{name}", "r", encoding="utf-8") as src:
                rows = src.readlines()

            for row in tqdm(rows):
                if not row.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                record = json.loads(row)
                cleaned = clean_instruction(record["instruction"])
                if cleaned is None:
                    continue
                record["instruction"] = cleaned
                out.write(json.dumps(record) + "\n")


if __name__ == "__main__":
    main()
|
|