# codescripts / filterans.py
# f541119578's picture
# Upload folder using huggingface_hub
# fdf190d verified
from transformers import AutoTokenizer
import argparse
import json
from tqdm import tqdm
import os
# Command-line interface: a single optional --judgemodel string. Its value is
# used below to pick the input files and to name the output file.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--judgemodel",
    help="评判模型",  # help text is Chinese for "judge model"
    type=str,
)
args = parser.parse_args()
# Gather the raw lines of every answer file produced for the selected judge
# model. After this block:
#   all_lines[i]  -> list of raw text lines read from the i-th matching file
#   modelnames[i] -> model name parsed from that same file's name
data_dir = "/home/aiscuser/fhw/data"
# Sorted so the file order (and therefore the order of keys in the output
# written later) is reproducible; os.listdir() returns entries in arbitrary order.
names = sorted(os.listdir(data_dir))
all_lines = []
modelnames = []
for name in names:
    # Only files whose name contains "<judgemodel>_answerby_" are relevant.
    if f"{args.judgemodel}_answerby_" not in name:
        continue
    print(name)
    # Read-only access is sufficient, and the context manager guarantees the
    # handle is closed even if reading fails.
    with open(os.path.join(data_dir, name), "r") as f:
        lines = f.readlines()
    # The model name is the last "_"-separated piece of the file name once
    # ".json" has been removed.
    # NOTE(review): a model name that itself contains "_" would be cut short
    # here - confirm against the actual file naming.
    modelname = name.replace(".json", "").split("_")[-1]
    print(modelname)
    modelnames.append(modelname)
    all_lines.append(lines)
# Row-by-row length filter over the collected answer files.
#
# For every row index t, and for every model i, the line all_lines[i][t] is
# parsed as JSON and its "response" string is split on single spaces. The row
# index t is kept for model i when the number of pieces lies within
# [min_words, max_words] (inclusive). The result is written as one JSON object
# mapping each model name to its list of kept row indices.
#
# Works for any number of matching answer files (previously exactly five were
# required: fewer raised IndexError, extras were silently ignored).
min_words = 3
max_words = 1800

if not all_lines:
    # Fail with an explicit message instead of an IndexError or an empty output file.
    raise SystemExit(
        f"No files containing '{args.judgemodel}_answerby_' were found; nothing to filter."
    )

# good[i] accumulates the accepted row indices for modelnames[i].
good = [[] for _ in all_lines]

# zip() stops at the shortest file, so rows beyond that length are not
# examined; the same bound is passed to tqdm so the progress bar has a total.
row_count = min(len(lines) for lines in all_lines)
for t, rows in enumerate(tqdm(zip(*all_lines), total=row_count)):
    for kept, row in zip(good, rows):
        word_count = len(json.loads(row)["response"].split(" "))
        if min_words <= word_count <= max_words:
            kept.append(t)

# Write-only mode is enough; the context manager flushes and closes the file.
with open(f"/home/aiscuser/fhw/data/{args.judgemodel}_filtered_by_answer.json", "w") as fw:
    fw.write(json.dumps(dict(zip(modelnames, good))) + "\n")