# File size: 4,001 Bytes
# fdf190d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import json
from tqdm import tqdm
names = os.listdir("/home/aiscuser/fhw/data")
fw = open(f"/home/aiscuser/fhw/data/qwq_python_filtered.json",'w+')
#print(len(filenames))
for name in names:
    if "QwQ-32B-Preview" not in name:
        continue
    f = open(f"/home/aiscuser/fhw/data/{name}",'r+')
    lines = f.readlines()
    for line in tqdm(lines):
        d = json.loads(line)
        d["instruction"] = d["instruction"].strip('1.').strip(' ').strip('\n').strip('\t')
        if len(d["instruction"].split())<200:
            fw.write(json.dumps(d)+'\n')
            #print(d["instruction"])
            #print("########################################################################")
            continue
        else:
            end = d["instruction"].rfind('?')
            if end != -1:
                d["instruction"] = d["instruction"][:end+1].strip(' ').strip('\n').strip('\t')
            if d["instruction"][-1]=='?':
                fw.write(json.dumps(d)+'\n')
                #print(d["instruction"])
                continue
            else:
                sign = 0
                ts = d["instruction"].split('\n')
                ts_len = len(ts)
                for i in range(ts_len):
                    if ts[ts_len-1-i].find("How can")!=-1 or ts[ts_len-1-i].find("how can")!=-1 or ts[ts_len-1-i].find("Can you")!=-1 or ts[ts_len-1-i].find("can you")!=-1 or ts[ts_len-1-i].startswith("Please") or ts[ts_len-1-i].startswith("please"):
                        #print(filename)
                        sign = 1
                        d["instruction"] = '\n'.join(ts[0:ts_len-i]).strip(' ').strip('\n')
                        fw.write(json.dumps(d)+'\n')
                        #print(d["instruction"])
                        break
                if sign == 1:
                    continue
                else:
                    top_num = ts_len if ts_len < 5 else 5
                    for i in range(top_num):
                        if ts[i].find("Provide")!=-1 or ts[i].find("provide")!=-1 or ts[i].find("Ensure")!=-1 or ts[i].find("ensure")!=-1 or ts[i].find("Write")!=-1 or ts[i].find("write")!=-1 or ts[i].find("Implement")!=-1 or ts[i].find("implement")!=-1 or ts[i].find("Create")!=-1 or ts[i].find("create")!=-1 or ts[i].find("Explain")!=-1 or ts[i].find("You are")!=-1 or ts[i].find("Given")!=-1 or ts[i].find("Implementing")!=-1 or ts[i].find("implementing")!=-1 or ts[i].find("Writing")!=-1 or ts[i].find("writing")!=-1 or ts[i].find("Creating")!=-1 or ts[i].find("creating")!=-1 or ts[i].find("Design")!=-1 or ts[i].find("design")!=-1 or ts[i].find("Consider")!=-1 or ts[i].find("consider")!=-1 or ts[i].find("Provide")!=-1 or ts[i].find("provide")!=-1 or ts[i].find("Calculate")!=-1 or ts[i].find("calculate")!=-1:
                            sign = 1
                            #print(filename)
                            d["instruction"] = '\n'.join(ts[0:i+1]).strip(' ').strip('\n')
                            #print(d["instruction"])
                            fw.write(json.dumps(d)+'\n')
                            break
                    if sign == 1:
                        continue
                    else:
                        end = d["instruction"].find('\n\nSure')
                        if end == -1:
                            end = d["instruction"].find('\n\nCertainly')
                        if end == -1:
                            end = d["instruction"].find('\n\nHere')
                        if end == -1:
                            end = d["instruction"].find('\n\nNow')
                        if end == -1:
                            #print(d["instruction"])
                            #print("########################################################################")
                            continue
                        else:
                            d["instruction"] = d["instruction"][:end].strip(' ').strip('\n').strip('\t')
                            fw.write(json.dumps(d)+'\n')