diff --git "a/evals/core_9mcqa/task-003-arc_challenge-predictions.jsonl" "b/evals/core_9mcqa/task-003-arc_challenge-predictions.jsonl" new file mode 100644--- /dev/null +++ "b/evals/core_9mcqa/task-003-arc_challenge-predictions.jsonl" @@ -0,0 +1,1172 @@ +{"doc_id": 0, "native_id": "Mercury_7175875", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 21.675500869750977, "incorrect_loss_raw": 22.847471237182617, "correct_loss_per_char": 0.6020972463819716, "incorrect_loss_per_char": 0.6342792909153383, "correct_loss_per_token": 3.0965001242501393, "incorrect_loss_per_token": 3.440660098242381, "correct_loss_uncond": -16.221860885620117, "incorrect_loss_uncond": -12.72264544169108}, "model_output": [{"sum_logits": -22.26869010925293, "num_tokens": 6, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -32.98790740966797, "logits_per_token": -3.711448351542155, "logits_per_char": -0.6748087911894827, "num_chars": 33}, {"sum_logits": -25.626956939697266, "num_tokens": 7, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -39.39399719238281, "logits_per_token": -3.660993848528181, "logits_per_char": -0.7118599149915907, "num_chars": 36}, {"sum_logits": -21.675500869750977, "num_tokens": 7, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -37.897361755371094, "logits_per_token": -3.0965001242501393, "logits_per_char": -0.6020972463819716, "num_chars": 36}, {"sum_logits": -20.646766662597656, "num_tokens": 7, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -34.32844543457031, "logits_per_token": -2.949538094656808, "logits_per_char": -0.5161691665649414, "num_chars": 40}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1, "native_id": "Mercury_SC_409171", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.54775619506836, "incorrect_loss_raw": 18.50825532277425, "correct_loss_per_char": 0.5361295239678745, "incorrect_loss_per_char": 0.5610290369219283, "correct_loss_per_token": 3.1095512390136717, "incorrect_loss_per_token": 3.7016510645548504, "correct_loss_uncond": -14.725950241088867, "incorrect_loss_uncond": -13.495473543802897}, "model_output": [{"sum_logits": -15.206170082092285, "num_tokens": 5, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -28.927488327026367, "logits_per_token": -3.041234016418457, "logits_per_char": -0.49052161555136403, "num_chars": 31}, {"sum_logits": -15.54775619506836, "num_tokens": 5, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -30.273706436157227, "logits_per_token": -3.1095512390136717, "logits_per_char": -0.5361295239678745, "num_chars": 29}, {"sum_logits": -23.44974136352539, "num_tokens": 5, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -37.631317138671875, "logits_per_token": -4.689948272705078, "logits_per_char": -0.7105982231371331, "num_chars": 33}, {"sum_logits": -16.868854522705078, "num_tokens": 5, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -29.452381134033203, "logits_per_token": -3.373770904541016, "logits_per_char": -0.48196727207728796, "num_chars": 35}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 2, "native_id": "Mercury_SC_408547", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.100311279296875, "incorrect_loss_raw": 18.38078212738037, "correct_loss_per_char": 0.40638960168716753, "incorrect_loss_per_char": 0.4175431373565608, "correct_loss_per_token": 1.9100311279296875, "incorrect_loss_per_token": 2.2975977659225464, "correct_loss_uncond": -13.046951293945312, "incorrect_loss_uncond": -12.274614651997885}, "model_output": [{"sum_logits": -24.058277130126953, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -35.52030944824219, "logits_per_token": -3.007284641265869, "logits_per_char": -0.5118782368112118, "num_chars": 47}, {"sum_logits": -11.147503852844238, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -26.63791275024414, "logits_per_token": -1.3934379816055298, "logits_per_char": -0.24233704027922257, "num_chars": 46}, {"sum_logits": -19.100311279296875, "num_tokens": 10, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -32.14726257324219, "logits_per_token": -1.9100311279296875, "logits_per_char": -0.40638960168716753, "num_chars": 47}, {"sum_logits": -19.936565399169922, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -29.807968139648438, "logits_per_token": -2.4920706748962402, "logits_per_char": -0.498414134979248, "num_chars": 40}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 3, "native_id": "Mercury_407327", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 27.381301879882812, "incorrect_loss_raw": 18.201900800069172, "correct_loss_per_char": 0.7020846635867388, "incorrect_loss_per_char": 0.7779346356568513, "correct_loss_per_token": 3.4226627349853516, "incorrect_loss_per_token": 5.173665205637614, "correct_loss_uncond": -6.011161804199219, "incorrect_loss_uncond": -4.516165733337402}, "model_output": [{"sum_logits": -11.127070426940918, "num_tokens": 2, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -13.94599723815918, "logits_per_token": -5.563535213470459, "logits_per_char": -0.9272558689117432, "num_chars": 12}, {"sum_logits": -16.266130447387695, "num_tokens": 3, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -20.19182586669922, "logits_per_token": -5.422043482462565, "logits_per_char": -0.6506452178955078, "num_chars": 25}, {"sum_logits": -27.212501525878906, "num_tokens": 6, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -34.01637649536133, "logits_per_token": -4.535416920979817, "logits_per_char": -0.7559028201633029, "num_chars": 36}, {"sum_logits": -27.381301879882812, "num_tokens": 8, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -33.39246368408203, "logits_per_token": -3.4226627349853516, "logits_per_char": -0.7020846635867388, "num_chars": 39}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 4, "native_id": "MCAS_2006_9_44", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 36.430904388427734, "incorrect_loss_raw": 22.740942001342773, "correct_loss_per_char": 0.5692328810691833, "incorrect_loss_per_char": 0.4781587246045897, "correct_loss_per_token": 2.8023772606482873, "incorrect_loss_per_token": 2.4919654796649886, "correct_loss_uncond": -13.40383529663086, "incorrect_loss_uncond": -15.13626797993978}, "model_output": [{"sum_logits": -24.521060943603516, "num_tokens": 7, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -34.99763107299805, "logits_per_token": -3.503008706229074, "logits_per_char": -0.6811405817667643, "num_chars": 36}, {"sum_logits": -22.468271255493164, "num_tokens": 11, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -38.38671875, "logits_per_token": -2.042570114135742, "logits_per_char": -0.3808181568727655, "num_chars": 59}, {"sum_logits": -21.23349380493164, "num_tokens": 11, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -40.24728012084961, "logits_per_token": -1.9303176186301492, "logits_per_char": -0.3725174351742393, "num_chars": 57}, {"sum_logits": -36.430904388427734, "num_tokens": 13, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -49.834739685058594, "logits_per_token": -2.8023772606482873, "logits_per_char": -0.5692328810691833, "num_chars": 64}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 5, "native_id": "Mercury_7270393", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 28.44793128967285, "incorrect_loss_raw": 24.281387329101562, "correct_loss_per_char": 0.8367038614609662, "incorrect_loss_per_char": 0.706387661916732, "correct_loss_per_token": 4.063990184238979, "incorrect_loss_per_token": 3.2347721099853515, "correct_loss_uncond": -10.676725387573242, "incorrect_loss_uncond": -7.868589401245117}, "model_output": [{"sum_logits": -32.05989074707031, "num_tokens": 10, "num_tokens_all": 246, "is_greedy": false, "sum_logits_uncond": -42.13330841064453, "logits_per_token": -3.2059890747070314, "logits_per_char": -0.8664835337046031, "num_chars": 37}, {"sum_logits": -28.44793128967285, "num_tokens": 7, "num_tokens_all": 243, "is_greedy": false, "sum_logits_uncond": -39.124656677246094, "logits_per_token": -4.063990184238979, "logits_per_char": -0.8367038614609662, "num_chars": 34}, {"sum_logits": -22.113693237304688, "num_tokens": 8, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -28.856277465820312, "logits_per_token": -2.764211654663086, "logits_per_char": -0.6504027422736672, "num_chars": 34}, {"sum_logits": -18.670578002929688, "num_tokens": 5, "num_tokens_all": 241, "is_greedy": false, "sum_logits_uncond": -25.460344314575195, "logits_per_token": -3.7341156005859375, "logits_per_char": -0.6022767097719254, "num_chars": 31}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 6, "native_id": "MCAS_2014_5_7", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.29804801940918, "incorrect_loss_raw": 25.117041269938152, "correct_loss_per_char": 0.3803222860608782, "incorrect_loss_per_char": 0.5607781419352097, "correct_loss_per_token": 1.9361861835826526, "incorrect_loss_per_token": 3.083836912592768, "correct_loss_uncond": -18.28087043762207, "incorrect_loss_uncond": -19.448087056477863}, "model_output": [{"sum_logits": -36.14460372924805, "num_tokens": 7, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -48.95124816894531, "logits_per_token": -5.163514818464007, "logits_per_char": -0.9511737823486328, "num_chars": 38}, {"sum_logits": -15.363826751708984, "num_tokens": 8, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -30.957386016845703, "logits_per_token": -1.920478343963623, "logits_per_char": -0.3200797239939372, "num_chars": 48}, {"sum_logits": -21.29804801940918, "num_tokens": 11, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -39.57891845703125, "logits_per_token": -1.9361861835826526, "logits_per_char": -0.3803222860608782, "num_chars": 56}, {"sum_logits": -23.842693328857422, "num_tokens": 11, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -53.78675079345703, "logits_per_token": -2.167517575350675, "logits_per_char": -0.411080919463059, "num_chars": 58}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 7, "native_id": "Mercury_7086660", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 17.156278610229492, "incorrect_loss_raw": 18.971633911132812, "correct_loss_per_char": 0.3989832234937091, "incorrect_loss_per_char": 0.5485534632720911, "correct_loss_per_token": 1.906253178914388, "incorrect_loss_per_token": 2.3982848843569475, "correct_loss_uncond": -13.62110710144043, "incorrect_loss_uncond": -13.389230092366537}, "model_output": [{"sum_logits": -16.726577758789062, "num_tokens": 9, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -36.126338958740234, "logits_per_token": -1.8585086398654513, "logits_per_char": -0.42888660919971955, "num_chars": 39}, {"sum_logits": -17.517108917236328, "num_tokens": 7, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -30.159934997558594, "logits_per_token": -2.5024441310337613, "logits_per_char": -0.6040382385253906, "num_chars": 29}, {"sum_logits": -17.156278610229492, "num_tokens": 9, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -30.777385711669922, "logits_per_token": -1.906253178914388, "logits_per_char": -0.3989832234937091, "num_chars": 43}, {"sum_logits": -22.671215057373047, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -30.79631805419922, "logits_per_token": -2.833901882171631, "logits_per_char": -0.6127355420911634, "num_chars": 37}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 8, "native_id": "Mercury_7168805", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.165802001953125, "incorrect_loss_raw": 21.805208206176758, "correct_loss_per_char": 0.49257337782118055, "incorrect_loss_per_char": 0.4876647301143615, "correct_loss_per_token": 2.4628668891059027, "incorrect_loss_per_token": 2.38666136605399, "correct_loss_uncond": -14.046512603759766, "incorrect_loss_uncond": -14.652607599894205}, "model_output": [{"sum_logits": -14.429838180541992, "num_tokens": 7, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -31.257564544677734, "logits_per_token": -2.061405454363142, "logits_per_char": -0.37973258369847346, "num_chars": 38}, {"sum_logits": -22.165802001953125, "num_tokens": 9, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -36.21231460571289, "logits_per_token": -2.4628668891059027, "logits_per_char": -0.49257337782118055, "num_chars": 45}, {"sum_logits": -23.557863235473633, "num_tokens": 10, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -38.726863861083984, "logits_per_token": -2.3557863235473633, "logits_per_char": -0.5235080718994141, "num_chars": 45}, {"sum_logits": -27.42792320251465, "num_tokens": 10, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -39.38901901245117, "logits_per_token": -2.742792320251465, "logits_per_char": -0.559753534745197, "num_chars": 49}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 9, "native_id": "MCAS_2003_8_11", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 6.572566509246826, "incorrect_loss_raw": 9.953400293986002, "correct_loss_per_char": 0.7302851676940918, "incorrect_loss_per_char": 0.8379044019629442, "correct_loss_per_token": 3.286283254623413, "incorrect_loss_per_token": 4.976700146993001, "correct_loss_uncond": -8.19692850112915, "incorrect_loss_uncond": -4.681597391764323}, "model_output": [{"sum_logits": -6.572566509246826, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -14.769495010375977, "logits_per_token": -3.286283254623413, "logits_per_char": -0.7302851676940918, "num_chars": 9}, {"sum_logits": -8.773872375488281, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -13.735586166381836, "logits_per_token": -4.386936187744141, "logits_per_char": -0.6749132596529447, "num_chars": 13}, {"sum_logits": -10.314349174499512, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -14.973108291625977, "logits_per_token": -5.157174587249756, "logits_per_char": -0.8595290978749593, "num_chars": 12}, {"sum_logits": -10.771979331970215, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -15.196298599243164, "logits_per_token": -5.385989665985107, "logits_per_char": -0.9792708483609286, "num_chars": 11}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 10, "native_id": "Mercury_7250058", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.662233352661133, "incorrect_loss_raw": 11.665335019429525, "correct_loss_per_char": 0.6860137266271255, "incorrect_loss_per_char": 0.7183767867244147, "correct_loss_per_token": 5.831116676330566, "incorrect_loss_per_token": 5.832667509714763, "correct_loss_uncond": -8.685529708862305, "incorrect_loss_uncond": -8.561281522115072}, "model_output": [{"sum_logits": -12.309082984924316, "num_tokens": 2, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -19.973230361938477, "logits_per_token": -6.154541492462158, "logits_per_char": -0.8206055323282878, "num_chars": 15}, {"sum_logits": -11.662233352661133, "num_tokens": 2, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -20.347763061523438, "logits_per_token": -5.831116676330566, "logits_per_char": -0.6860137266271255, "num_chars": 17}, {"sum_logits": -9.263240814208984, "num_tokens": 2, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -18.912891387939453, "logits_per_token": -4.631620407104492, "logits_per_char": -0.5448965184828815, "num_chars": 17}, {"sum_logits": -13.423681259155273, "num_tokens": 2, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -21.79372787475586, "logits_per_token": -6.711840629577637, "logits_per_char": -0.789628309362075, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 11, "native_id": "Mercury_7012740", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 18.490421295166016, "incorrect_loss_raw": 22.416207631429035, "correct_loss_per_char": 0.43000979756200036, "incorrect_loss_per_char": 0.6149354679951096, "correct_loss_per_token": 2.311302661895752, "incorrect_loss_per_token": 3.2115652720133467, "correct_loss_uncond": -18.607295989990234, "incorrect_loss_uncond": -13.495679219563803}, "model_output": [{"sum_logits": -18.490421295166016, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -37.09771728515625, "logits_per_token": -2.311302661895752, "logits_per_char": -0.43000979756200036, "num_chars": 43}, {"sum_logits": -21.764923095703125, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -33.529335021972656, "logits_per_token": -3.6274871826171875, "logits_per_char": -0.6595431241122159, "num_chars": 33}, {"sum_logits": -26.48169708251953, "num_tokens": 12, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -43.581356048583984, "logits_per_token": -2.206808090209961, "logits_per_char": -0.4814854015003551, "num_chars": 55}, {"sum_logits": -19.002002716064453, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -30.624969482421875, "logits_per_token": -3.8004005432128904, "logits_per_char": -0.7037778783727575, "num_chars": 27}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 12, "native_id": "Mercury_LBS10610", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.604381561279297, "incorrect_loss_raw": 8.540558815002441, "correct_loss_per_char": 1.600730260213216, "incorrect_loss_per_char": 1.4234264691670735, "correct_loss_per_token": 3.201460520426432, "incorrect_loss_per_token": 2.846852938334147, "correct_loss_uncond": -7.1560516357421875, "incorrect_loss_uncond": -9.331439018249512}, "model_output": [{"sum_logits": -7.692511081695557, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -17.43976402282715, "logits_per_token": -2.5641703605651855, "logits_per_char": -1.2820851802825928, "num_chars": 6}, {"sum_logits": -7.58407735824585, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -18.457822799682617, "logits_per_token": -2.5280257860819497, "logits_per_char": -1.2640128930409749, "num_chars": 6}, {"sum_logits": -9.604381561279297, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -16.760433197021484, "logits_per_token": -3.201460520426432, "logits_per_char": -1.600730260213216, "num_chars": 6}, {"sum_logits": -10.345088005065918, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -17.718406677246094, "logits_per_token": -3.448362668355306, "logits_per_char": -1.724181334177653, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 13, "native_id": "Mercury_SC_407400", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 20.784500122070312, "incorrect_loss_raw": 17.363550186157227, "correct_loss_per_char": 0.6298333370324337, "incorrect_loss_per_char": 0.5247084856911776, "correct_loss_per_token": 2.969214303152902, "incorrect_loss_per_token": 2.917042891184489, "correct_loss_uncond": -11.608802795410156, "incorrect_loss_uncond": -11.591493606567383}, "model_output": [{"sum_logits": -15.793785095214844, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -22.56316375732422, "logits_per_token": -3.948446273803711, "logits_per_char": -0.6074532728928786, "num_chars": 26}, {"sum_logits": -14.872156143188477, "num_tokens": 7, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -24.325695037841797, "logits_per_token": -2.124593734741211, "logits_per_char": -0.4797469723609186, "num_chars": 31}, {"sum_logits": -20.784500122070312, "num_tokens": 7, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -32.39330291748047, "logits_per_token": -2.969214303152902, "logits_per_char": -0.6298333370324337, "num_chars": 33}, {"sum_logits": -21.42470932006836, "num_tokens": 8, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -39.97627258300781, "logits_per_token": -2.678088665008545, "logits_per_char": -0.4869252118197354, "num_chars": 44}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 14, "native_id": "Mercury_7212993", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 1.380738615989685, "incorrect_loss_raw": 2.687575022379557, "correct_loss_per_char": 0.11506155133247375, "incorrect_loss_per_char": 0.3652788837750753, "correct_loss_per_token": 0.6903693079948425, "incorrect_loss_per_token": 2.4955522219340005, "correct_loss_uncond": -12.698784470558167, "incorrect_loss_uncond": -9.807417551676432}, "model_output": [{"sum_logits": -3.72489595413208, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -12.461433410644531, "logits_per_token": -3.72489595413208, "logits_per_char": -0.6208159923553467, "num_chars": 6}, {"sum_logits": -3.185692310333252, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -12.76995849609375, "logits_per_token": -3.185692310333252, "logits_per_char": -0.3982115387916565, "num_chars": 8}, {"sum_logits": -1.380738615989685, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.079523086547852, "logits_per_token": -0.6903693079948425, "logits_per_char": -0.11506155133247375, "num_chars": 12}, {"sum_logits": -1.1521368026733398, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": true, "sum_logits_uncond": -12.253585815429688, "logits_per_token": -0.5760684013366699, "logits_per_char": -0.07680912017822265, "num_chars": 15}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 15, "native_id": "Mercury_SC_413240", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.430915355682373, "incorrect_loss_raw": 7.507252057393392, "correct_loss_per_char": 0.13505085309346518, "incorrect_loss_per_char": 0.37905878937035276, "correct_loss_per_token": 0.48618307113647463, "incorrect_loss_per_token": 1.5014504114786782, "correct_loss_uncond": -13.704027652740479, "incorrect_loss_uncond": -10.881925264994303}, "model_output": [{"sum_logits": -2.430915355682373, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": true, "sum_logits_uncond": -16.13494300842285, "logits_per_token": -0.48618307113647463, "logits_per_char": -0.13505085309346518, "num_chars": 18}, {"sum_logits": -4.213652610778809, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -16.406259536743164, "logits_per_token": -0.8427305221557617, "logits_per_char": -0.2217711900409899, "num_chars": 19}, {"sum_logits": -7.998207092285156, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -16.119609832763672, "logits_per_token": -1.5996414184570313, "logits_per_char": -0.39991035461425783, "num_chars": 20}, {"sum_logits": -10.309896469116211, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -22.64166259765625, "logits_per_token": -2.061979293823242, "logits_per_char": -0.5154948234558105, "num_chars": 20}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 16, "native_id": "Mercury_7186358", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.25282096862793, "incorrect_loss_raw": 30.776709874471027, "correct_loss_per_char": 0.5183614870397056, "incorrect_loss_per_char": 0.690482022061528, "correct_loss_per_token": 3.036117281232561, "incorrect_loss_per_token": 4.163792080349393, "correct_loss_uncond": -13.639963150024414, "incorrect_loss_uncond": -15.477938969930014}, "model_output": [{"sum_logits": -29.392803192138672, "num_tokens": 6, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -48.2061882019043, "logits_per_token": -4.898800532023112, "logits_per_char": -0.8397943769182478, "num_chars": 35}, {"sum_logits": -18.8854923248291, "num_tokens": 7, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -34.08778762817383, "logits_per_token": -2.697927474975586, "logits_per_char": -0.4721373081207275, "num_chars": 40}, {"sum_logits": -21.25282096862793, "num_tokens": 7, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -34.892784118652344, "logits_per_token": -3.036117281232561, "logits_per_char": -0.5183614870397056, "num_chars": 41}, {"sum_logits": -44.05183410644531, "num_tokens": 9, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -56.469970703125, "logits_per_token": -4.8946482340494795, "logits_per_char": -0.7595143811456089, "num_chars": 58}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 17, "native_id": "Mercury_7166425", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.418184280395508, "incorrect_loss_raw": 10.464567184448242, "correct_loss_per_char": 0.4340910116831462, "incorrect_loss_per_char": 0.3917804407694983, "correct_loss_per_token": 1.7363640467325847, "incorrect_loss_per_token": 1.7440945307413738, "correct_loss_uncond": -22.599096298217773, "incorrect_loss_uncond": -22.56385103861491}, "model_output": [{"sum_logits": -9.095132827758789, "num_tokens": 6, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -32.325218200683594, "logits_per_token": -1.5158554712931316, "logits_per_char": -0.3789638678232829, "num_chars": 24}, {"sum_logits": -10.418184280395508, "num_tokens": 6, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -33.01728057861328, "logits_per_token": -1.7363640467325847, "logits_per_char": -0.4340910116831462, "num_chars": 24}, {"sum_logits": -11.482054710388184, "num_tokens": 6, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -33.45322799682617, "logits_per_token": -1.9136757850646973, "logits_per_char": -0.4100733825138637, "num_chars": 28}, {"sum_logits": -10.816514015197754, "num_tokens": 6, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -33.30680847167969, "logits_per_token": -1.8027523358662922, "logits_per_char": -0.38630407197134836, "num_chars": 28}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 18, "native_id": "MDSA_2007_8_3", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.266042709350586, "incorrect_loss_raw": 17.025399525960285, "correct_loss_per_char": 0.5372365502750173, "incorrect_loss_per_char": 0.4798987409444213, "correct_loss_per_token": 3.044340451558431, "incorrect_loss_per_token": 2.7221085684640065, "correct_loss_uncond": -12.155656814575195, "incorrect_loss_uncond": -15.856905619303385}, "model_output": [{"sum_logits": -18.266042709350586, "num_tokens": 6, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -30.42169952392578, "logits_per_token": -3.044340451558431, "logits_per_char": -0.5372365502750173, "num_chars": 34}, {"sum_logits": -19.48279571533203, "num_tokens": 6, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -31.716773986816406, "logits_per_token": -3.2471326192220054, "logits_per_char": -0.5730234033921185, "num_chars": 34}, {"sum_logits": -17.045692443847656, "num_tokens": 6, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -30.941539764404297, "logits_per_token": -2.840948740641276, "logits_per_char": -0.473491456773546, "num_chars": 36}, {"sum_logits": -14.547710418701172, "num_tokens": 7, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -35.98860168457031, "logits_per_token": -2.0782443455287387, "logits_per_char": -0.39318136266759923, "num_chars": 37}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 19, "native_id": "Mercury_7094290", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.77505874633789, "incorrect_loss_raw": 14.76684284210205, "correct_loss_per_char": 0.5472243980125144, "incorrect_loss_per_char": 0.5165293483143915, "correct_loss_per_token": 4.92501958211263, "incorrect_loss_per_token": 3.012579600016276, "correct_loss_uncond": -7.922435760498047, "incorrect_loss_uncond": -9.961590131123861}, "model_output": [{"sum_logits": -14.809100151062012, "num_tokens": 6, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -24.733131408691406, "logits_per_token": -2.4681833585103354, "logits_per_char": -0.5484851907800745, "num_chars": 27}, {"sum_logits": -19.851808547973633, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -28.172744750976562, "logits_per_token": -4.962952136993408, "logits_per_char": -0.7089931624276298, "num_chars": 28}, {"sum_logits": -14.77505874633789, "num_tokens": 3, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -22.697494506835938, "logits_per_token": -4.92501958211263, "logits_per_char": -0.5472243980125144, "num_chars": 27}, {"sum_logits": -9.639619827270508, "num_tokens": 6, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -21.279422760009766, "logits_per_token": -1.6066033045450847, "logits_per_char": -0.2921096917354699, "num_chars": 33}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 20, "native_id": "Mercury_7186568", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.125415802001953, "incorrect_loss_raw": 18.43203576405843, "correct_loss_per_char": 0.48216265723818824, "incorrect_loss_per_char": 0.5906040927963039, "correct_loss_per_token": 3.375138600667318, "incorrect_loss_per_token": 4.491412291451106, "correct_loss_uncond": -15.918401718139648, "incorrect_loss_uncond": -8.445340156555176}, "model_output": [{"sum_logits": -14.344935417175293, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -21.81698989868164, "logits_per_token": -7.1724677085876465, "logits_per_char": -0.7969408565097384, "num_chars": 18}, {"sum_logits": -10.125415802001953, "num_tokens": 3, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -26.0438175201416, "logits_per_token": -3.375138600667318, "logits_per_char": -0.48216265723818824, "num_chars": 21}, {"sum_logits": -18.967273712158203, "num_tokens": 6, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -31.489683151245117, "logits_per_token": -3.1612122853597007, "logits_per_char": -0.48634035159380007, "num_chars": 39}, {"sum_logits": -21.983898162841797, "num_tokens": 7, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -27.325454711914062, "logits_per_token": -3.140556880405971, "logits_per_char": -0.48853107028537324, "num_chars": 45}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 21, "native_id": "Mercury_402216", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 20.662303924560547, "incorrect_loss_raw": 22.1361821492513, "correct_loss_per_char": 0.5903515407017299, "incorrect_loss_per_char": 1.1327749472398025, "correct_loss_per_token": 0.8609293301900228, "incorrect_loss_per_token": 1.7332507533509958, "correct_loss_uncond": -27.911823272705078, "incorrect_loss_uncond": -21.684703826904297}, "model_output": [{"sum_logits": -18.106033325195312, "num_tokens": 11, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -38.03190994262695, "logits_per_token": -1.6460030295632102, "logits_per_char": -0.9053016662597656, "num_chars": 20}, {"sum_logits": -20.662303924560547, "num_tokens": 24, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -48.574127197265625, "logits_per_token": -0.8609293301900228, "logits_per_char": -0.5903515407017299, "num_chars": 35}, {"sum_logits": -26.425743103027344, "num_tokens": 11, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -44.57533264160156, "logits_per_token": -2.402340282093395, "logits_per_char": -1.651608943939209, "num_chars": 16}, {"sum_logits": -21.87677001953125, "num_tokens": 19, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -48.85541534423828, "logits_per_token": -1.1514089483963816, "logits_per_char": -0.8414142315204327, "num_chars": 26}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 22, "native_id": "Mercury_404894", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 24.679384231567383, "incorrect_loss_raw": 17.652037302652996, "correct_loss_per_char": 0.42550662468219624, "incorrect_loss_per_char": 0.5190481315415654, "correct_loss_per_token": 2.243580384687944, "incorrect_loss_per_token": 2.5007654401991104, "correct_loss_uncond": -10.926214218139648, "incorrect_loss_uncond": -8.714946111043295}, "model_output": [{"sum_logits": -24.679384231567383, "num_tokens": 11, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -35.60559844970703, "logits_per_token": -2.243580384687944, "logits_per_char": -0.42550662468219624, "num_chars": 58}, {"sum_logits": -16.96483612060547, "num_tokens": 6, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -27.765949249267578, "logits_per_token": -2.827472686767578, "logits_per_char": -0.5140859430486505, "num_chars": 33}, {"sum_logits": -19.8558349609375, "num_tokens": 10, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -26.914886474609375, "logits_per_token": -1.98558349609375, "logits_per_char": -0.4224645736369681, "num_chars": 47}, {"sum_logits": -16.135440826416016, "num_tokens": 6, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -24.420114517211914, "logits_per_token": -2.6892401377360025, "logits_per_char": -0.6205938779390775, "num_chars": 26}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 23, "native_id": "MCAS_2002_8_11", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.826766490936279, "incorrect_loss_raw": 6.557829221089681, "correct_loss_per_char": 0.9711277484893799, "incorrect_loss_per_char": 1.0929715368482802, "correct_loss_per_token": 1.4566916227340698, "incorrect_loss_per_token": 1.6394573052724202, "correct_loss_uncond": -12.814518451690674, "incorrect_loss_uncond": -12.445319175720215}, "model_output": [{"sum_logits": -7.661252021789551, "num_tokens": 4, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -19.784141540527344, "logits_per_token": -1.9153130054473877, "logits_per_char": -1.276875336964925, "num_chars": 6}, {"sum_logits": -5.399744987487793, "num_tokens": 4, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -18.331754684448242, "logits_per_token": -1.3499362468719482, "logits_per_char": -0.8999574979146322, "num_chars": 6}, {"sum_logits": -5.826766490936279, "num_tokens": 4, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -18.641284942626953, "logits_per_token": -1.4566916227340698, "logits_per_char": -0.9711277484893799, "num_chars": 6}, {"sum_logits": -6.612490653991699, "num_tokens": 4, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -18.8935489654541, "logits_per_token": -1.6531226634979248, "logits_per_char": -1.1020817756652832, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 24, "native_id": "Mercury_SC_405086", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.702328681945801, "incorrect_loss_raw": 7.180191993713379, "correct_loss_per_char": 0.6717612402779716, "incorrect_loss_per_char": 1.266669499874115, "correct_loss_per_token": 2.3511643409729004, "incorrect_loss_per_token": 3.5900959968566895, "correct_loss_uncond": -7.571314811706543, "incorrect_loss_uncond": -7.862897872924805}, "model_output": [{"sum_logits": -11.377274513244629, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -14.733966827392578, "logits_per_token": -5.6886372566223145, "logits_per_char": -2.2754549026489257, "num_chars": 5}, {"sum_logits": -4.702328681945801, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -12.273643493652344, "logits_per_token": -2.3511643409729004, "logits_per_char": -0.6717612402779716, "num_chars": 7}, {"sum_logits": -6.099381923675537, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -15.521187782287598, "logits_per_token": -3.0496909618377686, "logits_per_char": -1.0165636539459229, "num_chars": 6}, {"sum_logits": -4.063919544219971, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -14.874114990234375, "logits_per_token": -2.0319597721099854, "logits_per_char": -0.5079899430274963, "num_chars": 8}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 25, "native_id": "Mercury_SC_408324", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.037607192993164, "incorrect_loss_raw": 10.470067342122396, "correct_loss_per_char": 0.47566353647332443, "incorrect_loss_per_char": 0.679415442006372, "correct_loss_per_token": 1.807521438598633, "incorrect_loss_per_token": 2.9064940134684245, "correct_loss_uncond": -15.99212646484375, "incorrect_loss_uncond": -12.003995259602865}, "model_output": [{"sum_logits": -9.913777351379395, "num_tokens": 3, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -22.510234832763672, "logits_per_token": -3.3045924504597983, "logits_per_char": -0.7081269536699567, "num_chars": 14}, {"sum_logits": -8.367034912109375, "num_tokens": 3, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -19.3746337890625, "logits_per_token": -2.7890116373697915, "logits_per_char": -0.5578023274739583, "num_chars": 15}, {"sum_logits": -13.129389762878418, "num_tokens": 5, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -25.53731918334961, "logits_per_token": -2.6258779525756837, "logits_per_char": -0.772317044875201, "num_chars": 17}, {"sum_logits": -9.037607192993164, "num_tokens": 5, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -25.029733657836914, "logits_per_token": -1.807521438598633, "logits_per_char": -0.47566353647332443, "num_chars": 19}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 26, "native_id": "Mercury_7218820", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 33.76005554199219, "incorrect_loss_raw": 34.01576487223307, "correct_loss_per_char": 0.5534435334752817, "incorrect_loss_per_char": 0.535436984903212, "correct_loss_per_token": 3.3760055541992187, "incorrect_loss_per_token": 2.8472231214008636, "correct_loss_uncond": -11.186756134033203, "incorrect_loss_uncond": -10.846627553304037}, "model_output": [{"sum_logits": -29.079212188720703, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -37.519439697265625, "logits_per_token": -2.9079212188720702, "logits_per_char": -0.5385039294207538, "num_chars": 54}, {"sum_logits": -33.76005554199219, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -44.94681167602539, "logits_per_token": -3.3760055541992187, "logits_per_char": -0.5534435334752817, "num_chars": 61}, {"sum_logits": -37.54173278808594, "num_tokens": 14, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -47.07902908325195, "logits_per_token": -2.6815523420061385, "logits_per_char": -0.5142703121655607, "num_chars": 73}, {"sum_logits": -35.42634963989258, "num_tokens": 12, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -49.98870849609375, "logits_per_token": -2.9521958033243814, "logits_per_char": -0.5535367131233215, "num_chars": 64}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 27, "native_id": "Mercury_412202", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.181865692138672, "incorrect_loss_raw": 10.408934593200684, "correct_loss_per_char": 0.3855815755909887, "incorrect_loss_per_char": 0.35892877907588566, "correct_loss_per_token": 1.397733211517334, "incorrect_loss_per_token": 1.3011168241500854, "correct_loss_uncond": -15.689550399780273, "incorrect_loss_uncond": -16.217089653015137}, "model_output": [{"sum_logits": -8.388486862182617, "num_tokens": 8, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -22.36258888244629, "logits_per_token": -1.0485608577728271, "logits_per_char": -0.28925816766146956, "num_chars": 29}, {"sum_logits": -11.181865692138672, "num_tokens": 8, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -26.871416091918945, "logits_per_token": -1.397733211517334, "logits_per_char": -0.3855815755909887, "num_chars": 29}, {"sum_logits": -11.995607376098633, "num_tokens": 8, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -28.508983612060547, "logits_per_token": -1.499450922012329, "logits_per_char": -0.41364163365857354, "num_chars": 29}, {"sum_logits": -10.8427095413208, "num_tokens": 8, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -29.006500244140625, "logits_per_token": -1.3553386926651, "logits_per_char": -0.37388653590761384, "num_chars": 29}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 28, "native_id": "Mercury_SC_409139", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 10.895698547363281, "incorrect_loss_raw": 13.231450716654459, "correct_loss_per_char": 0.4358279418945312, "incorrect_loss_per_char": 0.5222918142044425, "correct_loss_per_token": 2.1791397094726563, "incorrect_loss_per_token": 2.5174174202813044, "correct_loss_uncond": -19.06521987915039, "incorrect_loss_uncond": -16.873562177022297}, "model_output": [{"sum_logits": -15.115436553955078, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -29.614559173583984, "logits_per_token": -3.0230873107910154, "logits_per_char": -0.6571928936502208, "num_chars": 23}, {"sum_logits": -11.59854507446289, "num_tokens": 6, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -31.029870986938477, "logits_per_token": -1.9330908457438152, "logits_per_char": -0.4460978874793419, "num_chars": 26}, {"sum_logits": -10.895698547363281, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -29.960918426513672, "logits_per_token": -2.1791397094726563, "logits_per_char": -0.4358279418945312, "num_chars": 25}, {"sum_logits": -12.98037052154541, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -29.670608520507812, "logits_per_token": -2.596074104309082, "logits_per_char": -0.46358466148376465, "num_chars": 28}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 29, "native_id": "Mercury_400687", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 12.818939208984375, "incorrect_loss_raw": 16.569152514139812, "correct_loss_per_char": 0.7540552475873161, "incorrect_loss_per_char": 1.0122167061876368, "correct_loss_per_token": 2.563787841796875, "incorrect_loss_per_token": 4.142288128534953, "correct_loss_uncond": -12.503829956054688, "incorrect_loss_uncond": -5.861043612162272}, "model_output": [{"sum_logits": -15.963772773742676, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -20.841400146484375, "logits_per_token": -3.990943193435669, "logits_per_char": -0.9977357983589172, "num_chars": 16}, {"sum_logits": -12.818939208984375, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -25.322769165039062, "logits_per_token": -2.563787841796875, "logits_per_char": -0.7540552475873161, "num_chars": 17}, {"sum_logits": -14.783864974975586, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -21.612083435058594, "logits_per_token": -3.6959662437438965, "logits_per_char": -0.9855909983317057, "num_chars": 15}, {"sum_logits": -18.959819793701172, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -24.83710479736328, "logits_per_token": -4.739954948425293, "logits_per_char": -1.0533233218722873, "num_chars": 18}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 30, "native_id": "Mercury_7171605", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 15.886160850524902, "incorrect_loss_raw": 14.425444920857748, "correct_loss_per_char": 0.29973888397216797, "incorrect_loss_per_char": 0.5361914388678483, "correct_loss_per_token": 1.765128983391656, "incorrect_loss_per_token": 2.872709902506026, "correct_loss_uncond": -17.47287082672119, "incorrect_loss_uncond": -15.727174123128256}, "model_output": [{"sum_logits": -19.56128692626953, "num_tokens": 4, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -35.21821975708008, "logits_per_token": -4.890321731567383, "logits_per_char": -0.8504907359247622, "num_chars": 23}, {"sum_logits": -9.437399864196777, "num_tokens": 7, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -28.462644577026367, "logits_per_token": -1.3481999805995397, "logits_per_char": -0.3254275815240268, "num_chars": 29}, {"sum_logits": -14.277647972106934, "num_tokens": 6, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -26.776992797851562, "logits_per_token": -2.3796079953511557, "logits_per_char": -0.43265599915475556, "num_chars": 33}, {"sum_logits": -15.886160850524902, "num_tokens": 9, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -33.359031677246094, "logits_per_token": -1.765128983391656, "logits_per_char": -0.29973888397216797, "num_chars": 53}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 31, "native_id": "Mercury_7210245", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 8.890512466430664, "incorrect_loss_raw": 9.534676869710287, "correct_loss_per_char": 0.2694094686797171, "incorrect_loss_per_char": 0.38734556007599324, "correct_loss_per_token": 1.2700732094900948, "incorrect_loss_per_token": 2.3836692174275718, "correct_loss_uncond": -25.28636360168457, "incorrect_loss_uncond": -16.42104975382487}, "model_output": [{"sum_logits": -9.065113067626953, "num_tokens": 4, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -25.89482307434082, "logits_per_token": -2.2662782669067383, "logits_per_char": -0.41205059398304333, "num_chars": 22}, {"sum_logits": -10.655082702636719, "num_tokens": 4, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -24.862037658691406, "logits_per_token": -2.6637706756591797, "logits_per_char": -0.39463269269024887, "num_chars": 27}, {"sum_logits": -8.890512466430664, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -34.176876068115234, "logits_per_token": -1.2700732094900948, "logits_per_char": -0.2694094686797171, "num_chars": 33}, {"sum_logits": -8.883834838867188, "num_tokens": 4, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -27.110319137573242, "logits_per_token": -2.220958709716797, "logits_per_char": -0.3553533935546875, "num_chars": 25}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 32, "native_id": "AKDE&ED_2008_4_25", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 13.803559303283691, "incorrect_loss_raw": 16.431479771931965, "correct_loss_per_char": 0.5112429371586552, "incorrect_loss_per_char": 0.8326516757430421, "correct_loss_per_token": 1.9719370433262415, "incorrect_loss_per_token": 2.9350797562372115, "correct_loss_uncond": -4.826424598693848, "incorrect_loss_uncond": -4.5597349802653}, "model_output": [{"sum_logits": -13.803559303283691, "num_tokens": 7, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -18.62998390197754, "logits_per_token": -1.9719370433262415, "logits_per_char": -0.5112429371586552, "num_chars": 27}, {"sum_logits": -18.43885040283203, "num_tokens": 7, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -23.65256690979004, "logits_per_token": -2.6341214861188615, "logits_per_char": -0.6829203852900753, "num_chars": 27}, {"sum_logits": -15.284749031066895, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -20.092971801757812, "logits_per_token": -3.056949806213379, "logits_per_char": -0.8991028841804055, "num_chars": 17}, {"sum_logits": -15.570839881896973, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -19.228105545043945, "logits_per_token": -3.1141679763793944, "logits_per_char": -0.9159317577586454, "num_chars": 17}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 33, "native_id": "AKDE&ED_2008_4_19", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.292997360229492, "incorrect_loss_raw": 21.883809407552082, "correct_loss_per_char": 0.39097051059498505, "incorrect_loss_per_char": 0.6560216181987041, "correct_loss_per_token": 1.8989996228899275, "incorrect_loss_per_token": 2.88007981688888, "correct_loss_uncond": -13.552177429199219, "incorrect_loss_uncond": -6.150561650594075}, "model_output": [{"sum_logits": -18.16305923461914, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -24.954259872436523, "logits_per_token": -3.02717653910319, "logits_per_char": -0.6486806869506836, "num_chars": 28}, {"sum_logits": -23.254791259765625, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -25.932884216308594, "logits_per_token": -2.583865695529514, "logits_per_char": -0.664422607421875, "num_chars": 35}, {"sum_logits": -13.292997360229492, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -26.84517478942871, "logits_per_token": -1.8989996228899275, "logits_per_char": -0.39097051059498505, "num_chars": 34}, {"sum_logits": -24.233577728271484, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -33.21596908569336, "logits_per_token": -3.0291972160339355, "logits_per_char": -0.6549615602235537, "num_chars": 37}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 34, "native_id": "Mercury_SC_400402", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.1313276290893555, "incorrect_loss_raw": 8.68324089050293, "correct_loss_per_char": 0.7664159536361694, "incorrect_loss_per_char": 0.8907752071108138, "correct_loss_per_token": 3.0656638145446777, "incorrect_loss_per_token": 3.788485897911919, "correct_loss_uncond": -9.408588409423828, "incorrect_loss_uncond": -7.475508371988933}, "model_output": [{"sum_logits": -6.1313276290893555, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.539916038513184, "logits_per_token": -3.0656638145446777, "logits_per_char": -0.7664159536361694, "num_chars": 8}, {"sum_logits": -9.956421852111816, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -15.760405540466309, "logits_per_token": -3.318807284037272, "logits_per_char": -1.244552731513977, "num_chars": 8}, {"sum_logits": -9.738799095153809, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.517102241516113, "logits_per_token": -4.869399547576904, "logits_per_char": -0.9738799095153808, "num_chars": 10}, {"sum_logits": -6.354501724243164, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -17.198740005493164, "logits_per_token": -3.177250862121582, "logits_per_char": -0.45389298030308317, "num_chars": 14}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 35, "native_id": "Mercury_7234308", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.72646141052246, "incorrect_loss_raw": 33.578704833984375, "correct_loss_per_char": 0.7006454002566453, "incorrect_loss_per_char": 0.5833880667850889, "correct_loss_per_token": 3.5908076763153076, "incorrect_loss_per_token": 2.971563360426161, "correct_loss_uncond": -7.977685928344727, "incorrect_loss_uncond": -17.602723439534504}, "model_output": [{"sum_logits": -28.72646141052246, "num_tokens": 8, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -36.70414733886719, "logits_per_token": -3.5908076763153076, "logits_per_char": -0.7006454002566453, "num_chars": 41}, {"sum_logits": -30.36141586303711, "num_tokens": 12, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -45.71535873413086, "logits_per_token": -2.530117988586426, "logits_per_char": -0.5234726872937433, "num_chars": 58}, {"sum_logits": -31.20083236694336, "num_tokens": 10, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -48.111083984375, "logits_per_token": -3.120083236694336, "logits_per_char": -0.6240166473388672, "num_chars": 50}, {"sum_logits": -39.173866271972656, "num_tokens": 12, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -59.71784210205078, "logits_per_token": -3.264488855997721, "logits_per_char": -0.6026748657226563, "num_chars": 65}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 36, "native_id": "ACTAAP_2014_5_8", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 29.100618362426758, "incorrect_loss_raw": 30.062929153442383, "correct_loss_per_char": 0.5196538993290493, "incorrect_loss_per_char": 0.5280783484811401, "correct_loss_per_token": 2.2385091048020582, "incorrect_loss_per_token": 2.446331545732066, "correct_loss_uncond": -16.35645866394043, "incorrect_loss_uncond": -17.436588923136394}, "model_output": [{"sum_logits": -27.704988479614258, "num_tokens": 11, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -45.212825775146484, "logits_per_token": -2.518635316328569, "logits_per_char": -0.5432350682277306, "num_chars": 51}, {"sum_logits": -29.100618362426758, "num_tokens": 13, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -45.45707702636719, "logits_per_token": -2.2385091048020582, "logits_per_char": -0.5196538993290493, "num_chars": 56}, {"sum_logits": -30.007389068603516, "num_tokens": 12, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -46.876651763916016, "logits_per_token": -2.5006157557169595, "logits_per_char": -0.5085998147220935, "num_chars": 59}, {"sum_logits": -32.476409912109375, "num_tokens": 14, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -50.40907669067383, "logits_per_token": -2.3197435651506697, "logits_per_char": -0.5324001624935963, "num_chars": 61}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 37, "native_id": "Mercury_400407", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.957138061523438, "incorrect_loss_raw": 19.55874760945638, "correct_loss_per_char": 0.5599301966225229, "incorrect_loss_per_char": 0.40597120636176776, "correct_loss_per_token": 2.5507931179470487, "incorrect_loss_per_token": 1.9635180771952925, "correct_loss_uncond": -9.458057403564453, "incorrect_loss_uncond": -10.920551300048828}, "model_output": [{"sum_logits": -20.532955169677734, "num_tokens": 9, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -32.65473175048828, "logits_per_token": -2.281439463297526, "logits_per_char": -0.5264860299917368, "num_chars": 39}, {"sum_logits": -22.957138061523438, "num_tokens": 9, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -32.41519546508789, "logits_per_token": -2.5507931179470487, "logits_per_char": -0.5599301966225229, "num_chars": 41}, {"sum_logits": -15.569747924804688, "num_tokens": 10, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -26.560359954833984, "logits_per_token": -1.5569747924804687, "logits_per_char": -0.2883286652741609, "num_chars": 54}, {"sum_logits": -22.57353973388672, "num_tokens": 11, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -32.22280502319336, "logits_per_token": -2.0521399758078833, "logits_per_char": -0.4030989238194057, "num_chars": 56}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 38, "native_id": "Mercury_7116288", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.42192554473877, "incorrect_loss_raw": 19.04940414428711, "correct_loss_per_char": 0.27391684785181164, "incorrect_loss_per_char": 0.4356876774053338, "correct_loss_per_token": 1.6777406930923462, "incorrect_loss_per_token": 2.564115197317941, "correct_loss_uncond": -24.105830192565918, "incorrect_loss_uncond": -13.837158838907877}, "model_output": [{"sum_logits": -16.629730224609375, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -26.51915168762207, "logits_per_token": -2.7716217041015625, "logits_per_char": -0.4494521682326858, "num_chars": 37}, {"sum_logits": -20.273765563964844, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -36.12240219116211, "logits_per_token": -2.896252223423549, "logits_per_char": -0.48270870390392484, "num_chars": 42}, {"sum_logits": -13.42192554473877, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -37.52775573730469, "logits_per_token": -1.6777406930923462, "logits_per_char": -0.27391684785181164, "num_chars": 49}, {"sum_logits": -20.24471664428711, "num_tokens": 10, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -36.01813507080078, "logits_per_token": -2.024471664428711, "logits_per_char": -0.3749021600793909, "num_chars": 54}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 39, "native_id": "MCAS_2004_9_15-v1", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.292884826660156, "incorrect_loss_raw": 20.117848714192707, "correct_loss_per_char": 0.2294772510797205, "incorrect_loss_per_char": 0.2537621734759001, "correct_loss_per_token": 1.253298832820012, "incorrect_loss_per_token": 1.292099222526942, "correct_loss_uncond": -22.276912689208984, "incorrect_loss_uncond": -21.08947499593099}, "model_output": [{"sum_logits": -15.790380477905273, "num_tokens": 14, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -37.32750701904297, "logits_per_token": -1.1278843198503767, "logits_per_char": -0.23221147761625402, "num_chars": 68}, {"sum_logits": -16.292884826660156, "num_tokens": 13, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -38.56979751586914, "logits_per_token": -1.253298832820012, "logits_per_char": -0.2294772510797205, "num_chars": 71}, {"sum_logits": -16.198959350585938, "num_tokens": 15, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -37.43162536621094, "logits_per_token": -1.0799306233723958, "logits_per_char": -0.21037609546215502, "num_chars": 77}, {"sum_logits": -28.364206314086914, "num_tokens": 17, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -48.86283874511719, "logits_per_token": -1.6684827243580538, "logits_per_char": -0.31869894734929116, "num_chars": 89}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 40, "native_id": "NYSEDREGENTS_2015_4_26", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.000960350036621, "incorrect_loss_raw": 14.14650853474935, "correct_loss_per_char": 0.3333653450012207, "incorrect_loss_per_char": 0.5967986172451561, "correct_loss_per_token": 2.5002400875091553, "incorrect_loss_per_token": 3.5724089940389, "correct_loss_uncond": -17.107409477233887, "incorrect_loss_uncond": -12.342610677083334}, "model_output": [{"sum_logits": -13.51992416381836, "num_tokens": 4, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -28.71661949157715, "logits_per_token": -3.37998104095459, "logits_per_char": -0.519997083223783, "num_chars": 26}, {"sum_logits": -17.2696590423584, "num_tokens": 5, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -26.530597686767578, "logits_per_token": -3.4539318084716797, "logits_per_char": -0.7849845019253817, "num_chars": 22}, {"sum_logits": -10.000960350036621, "num_tokens": 4, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -27.108369827270508, "logits_per_token": -2.5002400875091553, "logits_per_char": -0.3333653450012207, "num_chars": 30}, {"sum_logits": -11.649942398071289, "num_tokens": 3, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -24.22014045715332, "logits_per_token": -3.8833141326904297, "logits_per_char": -0.4854142665863037, "num_chars": 24}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 41, "native_id": "Mercury_SC_401620", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.329099655151367, "incorrect_loss_raw": 15.776727358500162, "correct_loss_per_char": 0.7080687284469604, "incorrect_loss_per_char": 0.5680126809057855, "correct_loss_per_token": 2.832274913787842, "incorrect_loss_per_token": 2.5111435733774985, "correct_loss_uncond": -11.977792739868164, "incorrect_loss_uncond": -11.149867057800293}, "model_output": [{"sum_logits": -11.329099655151367, "num_tokens": 4, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -23.30689239501953, "logits_per_token": -2.832274913787842, "logits_per_char": -0.7080687284469604, "num_chars": 16}, {"sum_logits": -20.06793975830078, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -30.339982986450195, "logits_per_token": -4.013587951660156, "logits_per_char": -0.9556161789667039, "num_chars": 21}, {"sum_logits": -15.457199096679688, "num_tokens": 7, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -26.413612365722656, "logits_per_token": -2.2081712995256697, "logits_per_char": -0.42936664157443577, "num_chars": 36}, {"sum_logits": -11.80504322052002, "num_tokens": 9, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -24.026187896728516, "logits_per_token": -1.3116714689466689, "logits_per_char": -0.31905522217621674, "num_chars": 37}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 42, "native_id": "Mercury_400877", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.634039402008057, "incorrect_loss_raw": 3.8907403548558555, "correct_loss_per_char": 1.5446798006693523, "incorrect_loss_per_char": 1.5812623037232292, "correct_loss_per_token": 4.634039402008057, "incorrect_loss_per_token": 3.8907403548558555, "correct_loss_uncond": -3.1566224098205566, "incorrect_loss_uncond": -3.260266343752543}, "model_output": [{"sum_logits": -1.5695956945419312, "num_tokens": 1, "num_tokens_all": 189, "is_greedy": true, "sum_logits_uncond": -6.421551704406738, "logits_per_token": -1.5695956945419312, "logits_per_char": -0.7847978472709656, "num_chars": 2}, {"sum_logits": -3.5486836433410645, "num_tokens": 1, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -6.373539924621582, "logits_per_token": -3.5486836433410645, "logits_per_char": -1.7743418216705322, "num_chars": 2}, {"sum_logits": -4.634039402008057, "num_tokens": 1, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -7.790661811828613, "logits_per_token": -4.634039402008057, "logits_per_char": -1.5446798006693523, "num_chars": 3}, {"sum_logits": -6.55394172668457, "num_tokens": 1, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -8.657928466796875, "logits_per_token": -6.55394172668457, "logits_per_char": -2.18464724222819, "num_chars": 3}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 43, "native_id": "Mercury_7174213", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.970107078552246, "incorrect_loss_raw": 10.288594404856363, "correct_loss_per_char": 0.35489126841227214, "incorrect_loss_per_char": 0.3386118569844769, "correct_loss_per_token": 2.2814438683646068, "incorrect_loss_per_token": 1.859883424970839, "correct_loss_uncond": -14.375046730041504, "incorrect_loss_uncond": -12.16971222559611}, "model_output": [{"sum_logits": -5.916347503662109, "num_tokens": 5, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -18.740760803222656, "logits_per_token": -1.1832695007324219, "logits_per_char": -0.2275518270639273, "num_chars": 26}, {"sum_logits": -7.14424467086792, "num_tokens": 5, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -20.721893310546875, "logits_per_token": -1.428848934173584, "logits_per_char": -0.2646016544765896, "num_chars": 27}, {"sum_logits": -17.805191040039062, "num_tokens": 6, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -27.91226577758789, "logits_per_token": -2.9675318400065103, "logits_per_char": -0.5236820894129136, "num_chars": 34}, {"sum_logits": -15.970107078552246, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -30.34515380859375, "logits_per_token": -2.2814438683646068, "logits_per_char": -0.35489126841227214, "num_chars": 45}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 44, "native_id": "NYSEDREGENTS_2008_8_34", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 12.947420120239258, "incorrect_loss_raw": 16.310715357462566, "correct_loss_per_char": 0.22714772140770628, "incorrect_loss_per_char": 0.31090265030626335, "correct_loss_per_token": 1.4386022355821397, "incorrect_loss_per_token": 1.978846894370185, "correct_loss_uncond": -14.548910140991211, "incorrect_loss_uncond": -12.447850545247396}, "model_output": [{"sum_logits": -12.958385467529297, "num_tokens": 9, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -26.881223678588867, "logits_per_token": -1.4398206075032551, "logits_per_char": -0.22734009592156662, "num_chars": 57}, {"sum_logits": -12.947420120239258, "num_tokens": 9, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -27.49633026123047, "logits_per_token": -1.4386022355821397, "logits_per_char": -0.22714772140770628, "num_chars": 57}, {"sum_logits": -18.83346939086914, "num_tokens": 8, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -30.363243103027344, "logits_per_token": -2.3541836738586426, "logits_per_char": -0.36928371354645373, "num_chars": 51}, {"sum_logits": -17.140291213989258, "num_tokens": 8, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -29.031230926513672, "logits_per_token": -2.1425364017486572, "logits_per_char": -0.33608414145076976, "num_chars": 51}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 45, "native_id": "Mercury_7212398", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.012997627258301, "incorrect_loss_raw": 9.374249140421549, "correct_loss_per_char": 1.1688329378763835, "incorrect_loss_per_char": 1.1275712929519952, "correct_loss_per_token": 3.5064988136291504, "incorrect_loss_per_token": 3.718783802456326, "correct_loss_uncond": -7.852927207946777, "incorrect_loss_uncond": -7.056926091512044}, "model_output": [{"sum_logits": -10.69261360168457, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -14.0299072265625, "logits_per_token": -5.346306800842285, "logits_per_char": -2.138522720336914, "num_chars": 5}, {"sum_logits": -7.012997627258301, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -14.865924835205078, "logits_per_token": -3.5064988136291504, "logits_per_char": -1.1688329378763835, "num_chars": 6}, {"sum_logits": -5.315879821777344, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -16.713462829589844, "logits_per_token": -1.771959940592448, "logits_per_char": -0.5315879821777344, "num_chars": 10}, {"sum_logits": -12.114253997802734, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -18.550155639648438, "logits_per_token": -4.038084665934245, "logits_per_char": -0.7126031763413373, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 46, "native_id": "Mercury_SC_401290", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.8594274520874023, "incorrect_loss_raw": 7.067714850107829, "correct_loss_per_char": 0.3859427452087402, "incorrect_loss_per_char": 1.083895742893219, "correct_loss_per_token": 3.8594274520874023, "incorrect_loss_per_token": 7.067714850107829, "correct_loss_uncond": -9.212464332580566, "incorrect_loss_uncond": -6.432722568511963}, "model_output": [{"sum_logits": -5.776609420776367, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -14.38298225402832, "logits_per_token": -5.776609420776367, "logits_per_char": -0.9627682367960612, "num_chars": 6}, {"sum_logits": -6.772084712982178, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -14.185483932495117, "logits_per_token": -6.772084712982178, "logits_per_char": -0.8465105891227722, "num_chars": 8}, {"sum_logits": -3.8594274520874023, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -13.071891784667969, "logits_per_token": -3.8594274520874023, "logits_per_char": -0.3859427452087402, "num_chars": 10}, {"sum_logits": -8.654450416564941, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -11.932846069335938, "logits_per_token": -8.654450416564941, "logits_per_char": -1.4424084027608235, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 47, "native_id": "Mercury_SC_402120", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.8608622550964355, "incorrect_loss_raw": 6.488386392593384, "correct_loss_per_char": 0.48608622550964353, "incorrect_loss_per_char": 0.6968223783704969, "correct_loss_per_token": 2.4304311275482178, "incorrect_loss_per_token": 4.487926046053569, "correct_loss_uncond": -12.130299091339111, "incorrect_loss_uncond": -9.01822034517924}, "model_output": [{"sum_logits": -7.46239709854126, "num_tokens": 1, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -13.03304672241211, "logits_per_token": -7.46239709854126, "logits_per_char": -0.9327996373176575, "num_chars": 8}, {"sum_logits": -9.446239471435547, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -18.008636474609375, "logits_per_token": -4.723119735717773, "logits_per_char": -0.9446239471435547, "num_chars": 10}, {"sum_logits": -4.8608622550964355, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -16.991161346435547, "logits_per_token": -2.4304311275482178, "logits_per_char": -0.48608622550964353, "num_chars": 10}, {"sum_logits": -2.5565226078033447, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -15.478137016296387, "logits_per_token": -1.2782613039016724, "logits_per_char": -0.21304355065027872, "num_chars": 12}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 48, "native_id": "Mercury_184975", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.349700927734375, "incorrect_loss_raw": 12.966076215108236, "correct_loss_per_char": 0.6977136785333807, "incorrect_loss_per_char": 0.5828386509057247, "correct_loss_per_token": 5.116566975911458, "incorrect_loss_per_token": 3.7492773903740777, "correct_loss_uncond": -8.766019821166992, "incorrect_loss_uncond": -10.675995190938314}, "model_output": [{"sum_logits": -16.101211547851562, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -26.53679656982422, "logits_per_token": -5.3670705159505205, "logits_per_char": -0.731873252175071, "num_chars": 22}, {"sum_logits": -9.910186767578125, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -22.29466438293457, "logits_per_token": -3.3033955891927085, "logits_per_char": -0.430877685546875, "num_chars": 23}, {"sum_logits": -15.349700927734375, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -24.115720748901367, "logits_per_token": -5.116566975911458, "logits_per_char": -0.6977136785333807, "num_chars": 22}, {"sum_logits": -12.88683032989502, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -22.09475326538086, "logits_per_token": -2.577366065979004, "logits_per_char": -0.5857650149952282, "num_chars": 22}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 49, "native_id": "Mercury_SC_400578", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.58562660217285, "incorrect_loss_raw": 25.960463841756184, "correct_loss_per_char": 1.1175284819169478, "incorrect_loss_per_char": 0.8238902323825616, "correct_loss_per_token": 3.5122323717389787, "incorrect_loss_per_token": 3.0254388173421223, "correct_loss_uncond": -2.8294715881347656, "incorrect_loss_uncond": -7.144402821858724}, "model_output": [{"sum_logits": -24.58562660217285, "num_tokens": 7, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -27.415098190307617, "logits_per_token": -3.5122323717389787, "logits_per_char": -1.1175284819169478, "num_chars": 22}, {"sum_logits": -25.43804168701172, "num_tokens": 8, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -31.96864128112793, "logits_per_token": -3.179755210876465, "logits_per_char": -0.9085014888218471, "num_chars": 28}, {"sum_logits": -26.08905029296875, "num_tokens": 8, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -32.072906494140625, "logits_per_token": -3.2611312866210938, "logits_per_char": -0.869635009765625, "num_chars": 30}, {"sum_logits": -26.354299545288086, "num_tokens": 10, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -35.27305221557617, "logits_per_token": -2.6354299545288087, "logits_per_char": -0.6935341985602128, "num_chars": 38}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 50, "native_id": "MCAS_2001_8_4", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.615158081054688, "incorrect_loss_raw": 27.278783798217773, "correct_loss_per_char": 0.6189571646756904, "incorrect_loss_per_char": 0.5569156308155924, "correct_loss_per_token": 2.957239786783854, "incorrect_loss_per_token": 2.7278783798217767, "correct_loss_uncond": -20.065303802490234, "incorrect_loss_uncond": -23.095067977905273}, "model_output": [{"sum_logits": -24.83185577392578, "num_tokens": 10, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -47.88705825805664, "logits_per_token": -2.483185577392578, "logits_per_char": -0.5283373568920379, "num_chars": 47}, {"sum_logits": -28.19489288330078, "num_tokens": 10, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -53.11500549316406, "logits_per_token": -2.819489288330078, "logits_per_char": -0.542209478525015, "num_chars": 52}, {"sum_logits": -28.809602737426758, "num_tokens": 10, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -50.11949157714844, "logits_per_token": -2.8809602737426756, "logits_per_char": -0.6002000570297241, "num_chars": 48}, {"sum_logits": -26.615158081054688, "num_tokens": 9, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -46.68046188354492, "logits_per_token": -2.957239786783854, "logits_per_char": -0.6189571646756904, "num_chars": 43}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 51, "native_id": "MCAS_2003_5_33", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.753625869750977, "incorrect_loss_raw": 18.444555282592773, "correct_loss_per_char": 0.586050808429718, "incorrect_loss_per_char": 0.5458844016326612, "correct_loss_per_token": 2.6790894099644254, "incorrect_loss_per_token": 3.6562231526230318, "correct_loss_uncond": -12.092140197753906, "incorrect_loss_uncond": -11.401593526204428}, "model_output": [{"sum_logits": -16.089569091796875, "num_tokens": 3, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -28.183429718017578, "logits_per_token": -5.363189697265625, "logits_per_char": -0.6995464822520381, "num_chars": 23}, {"sum_logits": -12.809246063232422, "num_tokens": 4, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -22.684465408325195, "logits_per_token": -3.2023115158081055, "logits_per_char": -0.45747307368687223, "num_chars": 28}, {"sum_logits": -18.753625869750977, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -30.845766067504883, "logits_per_token": -2.6790894099644254, "logits_per_char": -0.586050808429718, "num_chars": 32}, {"sum_logits": -26.434850692749023, "num_tokens": 11, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -38.67055130004883, "logits_per_token": -2.4031682447953657, "logits_per_char": -0.4806336489590732, "num_chars": 55}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 52, "native_id": "Mercury_7068513", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.853212356567383, "incorrect_loss_raw": 15.501486778259277, "correct_loss_per_char": 0.8115096525712446, "incorrect_loss_per_char": 1.0147187517349245, "correct_loss_per_token": 5.951070785522461, "incorrect_loss_per_token": 5.851045555538601, "correct_loss_uncond": -9.529104232788086, "incorrect_loss_uncond": -9.073046684265137}, "model_output": [{"sum_logits": -17.853212356567383, "num_tokens": 3, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -27.38231658935547, "logits_per_token": -5.951070785522461, "logits_per_char": -0.8115096525712446, "num_chars": 22}, {"sum_logits": -18.21554946899414, "num_tokens": 3, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -27.06772804260254, "logits_per_token": -6.071849822998047, "logits_per_char": -0.8674071175711495, "num_chars": 21}, {"sum_logits": -12.30989933013916, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -23.325824737548828, "logits_per_token": -6.15494966506958, "logits_per_char": -0.7241117253023035, "num_chars": 17}, {"sum_logits": -15.979011535644531, "num_tokens": 3, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -23.330047607421875, "logits_per_token": -5.326337178548177, "logits_per_char": -1.452637412331321, "num_chars": 11}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 53, "native_id": "AKDE&ED_2008_4_26", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.67620086669922, "incorrect_loss_raw": 24.020204544067383, "correct_loss_per_char": 0.8193200247628348, "incorrect_loss_per_char": 0.7157516366889364, "correct_loss_per_token": 3.1862445407443576, "incorrect_loss_per_token": 3.4287129773033995, "correct_loss_uncond": -6.9710235595703125, "incorrect_loss_uncond": -10.674479166666666}, "model_output": [{"sum_logits": -23.47753143310547, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -31.433523178100586, "logits_per_token": -2.9346914291381836, "logits_per_char": -0.7336728572845459, "num_chars": 32}, {"sum_logits": -30.68549346923828, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -38.25994873046875, "logits_per_token": -5.114248911539714, "logits_per_char": -0.9298634384617661, "num_chars": 33}, {"sum_logits": -28.67620086669922, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -35.64722442626953, "logits_per_token": -3.1862445407443576, "logits_per_char": -0.8193200247628348, "num_chars": 35}, {"sum_logits": -17.8975887298584, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -34.39057922363281, "logits_per_token": -2.2371985912323, "logits_per_char": -0.48371861432049723, "num_chars": 37}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 54, "native_id": "Mercury_7235638", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.091793537139893, "incorrect_loss_raw": 6.071674505869548, "correct_loss_per_char": 0.2995172668905819, "incorrect_loss_per_char": 0.375763499945925, "correct_loss_per_token": 2.5458967685699463, "incorrect_loss_per_token": 2.2312257289886475, "correct_loss_uncond": -14.35222578048706, "incorrect_loss_uncond": -12.6114608446757}, "model_output": [{"sum_logits": -4.856331825256348, "num_tokens": 3, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -16.66053581237793, "logits_per_token": -1.6187772750854492, "logits_per_char": -0.2555964118555972, "num_chars": 19}, {"sum_logits": -6.417783737182617, "num_tokens": 4, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -21.533672332763672, "logits_per_token": -1.6044459342956543, "logits_per_char": -0.33777809143066406, "num_chars": 19}, {"sum_logits": -5.091793537139893, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -19.444019317626953, "logits_per_token": -2.5458967685699463, "logits_per_char": -0.2995172668905819, "num_chars": 17}, {"sum_logits": -6.940907955169678, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -17.85519790649414, "logits_per_token": -3.470453977584839, "logits_per_char": -0.5339159965515137, "num_chars": 13}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 55, "native_id": "MDSA_2009_5_20", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 27.09518814086914, "incorrect_loss_raw": 30.103071212768555, "correct_loss_per_char": 0.6301206544388173, "incorrect_loss_per_char": 0.549343868105, "correct_loss_per_token": 3.3868985176086426, "incorrect_loss_per_token": 2.4166811119426383, "correct_loss_uncond": -10.074031829833984, "incorrect_loss_uncond": -6.6826527913411455}, "model_output": [{"sum_logits": -23.17238426208496, "num_tokens": 10, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -30.106748580932617, "logits_per_token": -2.317238426208496, "logits_per_char": -0.6436773406134711, "num_chars": 36}, {"sum_logits": -27.09518814086914, "num_tokens": 8, "num_tokens_all": 242, "is_greedy": false, "sum_logits_uncond": -37.169219970703125, "logits_per_token": -3.3868985176086426, "logits_per_char": -0.6301206544388173, "num_chars": 43}, {"sum_logits": -25.93370819091797, "num_tokens": 11, "num_tokens_all": 245, "is_greedy": false, "sum_logits_uncond": -29.765499114990234, "logits_per_token": -2.357609835537997, "logits_per_char": -0.4893152488852447, "num_chars": 53}, {"sum_logits": -41.203121185302734, "num_tokens": 16, "num_tokens_all": 250, "is_greedy": false, "sum_logits_uncond": -50.48492431640625, "logits_per_token": -2.575195074081421, "logits_per_char": -0.5150390148162842, "num_chars": 80}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 56, "native_id": "Mercury_178325", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.432247638702393, "incorrect_loss_raw": 7.108798662821452, "correct_loss_per_char": 0.8040309548377991, "incorrect_loss_per_char": 0.9300772110621134, "correct_loss_per_token": 3.2161238193511963, "incorrect_loss_per_token": 3.554399331410726, "correct_loss_uncond": -7.5224690437316895, "incorrect_loss_uncond": -6.480605761210124}, "model_output": [{"sum_logits": -8.13947868347168, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -13.186142921447754, "logits_per_token": -4.06973934173584, "logits_per_char": -1.3565797805786133, "num_chars": 6}, {"sum_logits": -8.588512420654297, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -14.071273803710938, "logits_per_token": -4.294256210327148, "logits_per_char": -0.8588512420654297, "num_chars": 10}, {"sum_logits": -6.432247638702393, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -13.954716682434082, "logits_per_token": -3.2161238193511963, "logits_per_char": -0.8040309548377991, "num_chars": 8}, {"sum_logits": -4.598404884338379, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -13.510796546936035, "logits_per_token": -2.2992024421691895, "logits_per_char": -0.5748006105422974, "num_chars": 8}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 57, "native_id": "Mercury_7212678", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 11.025823593139648, "incorrect_loss_raw": 18.28010908762614, "correct_loss_per_char": 0.344556987285614, "incorrect_loss_per_char": 0.557068549091421, "correct_loss_per_token": 2.2051647186279295, "incorrect_loss_per_token": 2.732113874526251, "correct_loss_uncond": -13.029058456420898, "incorrect_loss_uncond": -14.76187547047933}, "model_output": [{"sum_logits": -11.025823593139648, "num_tokens": 5, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -24.054882049560547, "logits_per_token": -2.2051647186279295, "logits_per_char": -0.344556987285614, "num_chars": 32}, {"sum_logits": -14.939725875854492, "num_tokens": 7, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -28.12175750732422, "logits_per_token": -2.1342465536934987, "logits_per_char": -0.3830698942526793, "num_chars": 39}, {"sum_logits": -14.326931953430176, "num_tokens": 5, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -27.855056762695312, "logits_per_token": -2.865386390686035, "logits_per_char": -0.5969554980595907, "num_chars": 24}, {"sum_logits": -25.57366943359375, "num_tokens": 8, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -43.149139404296875, "logits_per_token": -3.1967086791992188, "logits_per_char": -0.6911802549619932, "num_chars": 37}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 58, "native_id": "TAKS_2009_8_32", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.825258255004883, "incorrect_loss_raw": 9.330842812856039, "correct_loss_per_char": 0.8187715212504069, "incorrect_loss_per_char": 0.7573367386153249, "correct_loss_per_token": 4.912629127502441, "incorrect_loss_per_token": 4.6654214064280195, "correct_loss_uncond": -3.6001930236816406, "incorrect_loss_uncond": -4.357980569203694}, "model_output": [{"sum_logits": -9.825258255004883, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -13.425451278686523, "logits_per_token": -4.912629127502441, "logits_per_char": -0.8187715212504069, "num_chars": 12}, {"sum_logits": -7.415235996246338, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -14.367237091064453, "logits_per_token": -3.707617998123169, "logits_per_char": -0.7415235996246338, "num_chars": 10}, {"sum_logits": -8.6030855178833, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -13.43526840209961, "logits_per_token": -4.30154275894165, "logits_per_char": -0.7820986834439364, "num_chars": 11}, {"sum_logits": -11.974206924438477, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -13.263964653015137, "logits_per_token": -5.987103462219238, "logits_per_char": -0.7483879327774048, "num_chars": 16}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 59, "native_id": "Mercury_412681", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.170597076416016, "incorrect_loss_raw": 24.55001958211263, "correct_loss_per_char": 0.2900081791289865, "incorrect_loss_per_char": 0.33692043405320177, "correct_loss_per_token": 0.9204607424528702, "incorrect_loss_per_token": 1.0628062435865506, "correct_loss_uncond": -29.537471771240234, "incorrect_loss_uncond": -29.529136657714844}, "model_output": [{"sum_logits": -27.601131439208984, "num_tokens": 24, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -55.6745719909668, "logits_per_token": -1.1500471433003743, "logits_per_char": -0.38334904776679146, "num_chars": 72}, {"sum_logits": -27.709304809570312, "num_tokens": 23, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -57.274688720703125, "logits_per_token": -1.2047523830247961, "logits_per_char": -0.3795795179393193, "num_chars": 73}, {"sum_logits": -21.170597076416016, "num_tokens": 23, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -50.70806884765625, "logits_per_token": -0.9204607424528702, "logits_per_char": -0.2900081791289865, "num_chars": 73}, {"sum_logits": -18.339622497558594, "num_tokens": 22, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -49.2882080078125, "logits_per_token": -0.8336192044344816, "logits_per_char": -0.24783273645349452, "num_chars": 74}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 60, "native_id": "Mercury_400440", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.823034286499023, "incorrect_loss_raw": 12.779888470967611, "correct_loss_per_char": 1.0587881633213587, "incorrect_loss_per_char": 1.1640021642049154, "correct_loss_per_token": 2.1175763266427174, "incorrect_loss_per_token": 2.6250387032826743, "correct_loss_uncond": -12.724180221557617, "incorrect_loss_uncond": -12.9158722559611}, "model_output": [{"sum_logits": -20.517602920532227, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -30.262720108032227, "logits_per_token": -3.419600486755371, "logits_per_char": -1.7098002433776855, "num_chars": 12}, {"sum_logits": -12.694402694702148, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -25.171480178833008, "logits_per_token": -3.173600673675537, "logits_per_char": -1.269440269470215, "num_chars": 10}, {"sum_logits": -5.127659797668457, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -21.6530818939209, "logits_per_token": -1.2819149494171143, "logits_per_char": -0.5127659797668457, "num_chars": 10}, {"sum_logits": -14.823034286499023, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -27.54721450805664, "logits_per_token": -2.1175763266427174, "logits_per_char": -1.0587881633213587, "num_chars": 14}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 61, "native_id": "Mercury_SC_416529", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 7.011335372924805, "incorrect_loss_raw": 11.695802052815756, "correct_loss_per_char": 0.35056676864624026, "incorrect_loss_per_char": 0.6711795184880464, "correct_loss_per_token": 2.337111790974935, "incorrect_loss_per_token": 3.672381745444404, "correct_loss_uncond": -17.369638442993164, "incorrect_loss_uncond": -12.364224116007486}, "model_output": [{"sum_logits": -17.99979591369629, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -23.344348907470703, "logits_per_token": -5.999931971232097, "logits_per_char": -1.0588115243350757, "num_chars": 17}, {"sum_logits": -7.011335372924805, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -24.38097381591797, "logits_per_token": -2.337111790974935, "logits_per_char": -0.35056676864624026, "num_chars": 20}, {"sum_logits": -8.143881797790527, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -28.15995216369629, "logits_per_token": -2.035970449447632, "logits_per_char": -0.4286253577784488, "num_chars": 19}, {"sum_logits": -8.94372844696045, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -20.675777435302734, "logits_per_token": -2.981242815653483, "logits_per_char": -0.5261016733506146, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 62, "native_id": "MCAS_2006_8_1", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.0196536779403687, "incorrect_loss_raw": 5.223788420359294, "correct_loss_per_char": 0.08497113982836406, "incorrect_loss_per_char": 0.499701640341017, "correct_loss_per_token": 1.0196536779403687, "incorrect_loss_per_token": 3.0226848125457764, "correct_loss_uncond": -13.65654718875885, "incorrect_loss_uncond": -9.70965051651001}, "model_output": [{"sum_logits": -4.0818963050842285, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -15.73462200164795, "logits_per_token": -2.0409481525421143, "logits_per_char": -0.34015802542368573, "num_chars": 12}, {"sum_logits": -4.745924949645996, "num_tokens": 1, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -13.774347305297852, "logits_per_token": -4.745924949645996, "logits_per_char": -0.4745924949645996, "num_chars": 10}, {"sum_logits": -6.843544006347656, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -15.29134750366211, "logits_per_token": -2.2811813354492188, "logits_per_char": -0.6843544006347656, "num_chars": 10}, {"sum_logits": -1.0196536779403687, "num_tokens": 1, "num_tokens_all": 190, "is_greedy": true, "sum_logits_uncond": -14.676200866699219, "logits_per_token": -1.0196536779403687, "logits_per_char": -0.08497113982836406, "num_chars": 12}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 63, "native_id": "TIMSS_2003_8_pg80", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 3.6807663440704346, "incorrect_loss_raw": 4.240473747253418, "correct_loss_per_char": 0.7361532688140869, "incorrect_loss_per_char": 0.7342586812518892, "correct_loss_per_token": 1.8403831720352173, "incorrect_loss_per_token": 4.240473747253418, "correct_loss_uncond": -12.217239141464233, "incorrect_loss_uncond": -8.39238452911377}, "model_output": [{"sum_logits": -4.286013603210449, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -13.000713348388672, "logits_per_token": -4.286013603210449, "logits_per_char": -0.6122876576014927, "num_chars": 7}, {"sum_logits": -2.897794246673584, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -12.150640487670898, "logits_per_token": -2.897794246673584, "logits_per_char": -0.48296570777893066, "num_chars": 6}, {"sum_logits": -3.6807663440704346, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -15.898005485534668, "logits_per_token": -1.8403831720352173, "logits_per_char": -0.7361532688140869, "num_chars": 5}, {"sum_logits": -5.537613391876221, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -12.747220993041992, "logits_per_token": -5.537613391876221, "logits_per_char": -1.107522678375244, "num_chars": 5}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 64, "native_id": "Mercury_416645", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 19.16506576538086, "incorrect_loss_raw": 12.935249964396158, "correct_loss_per_char": 0.4791266441345215, "incorrect_loss_per_char": 0.382756017231675, "correct_loss_per_token": 2.3956332206726074, "incorrect_loss_per_token": 1.7029420194171723, "correct_loss_uncond": -16.377872467041016, "incorrect_loss_uncond": -12.187599182128906}, "model_output": [{"sum_logits": -13.980840682983398, "num_tokens": 8, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -23.337596893310547, "logits_per_token": -1.7476050853729248, "logits_per_char": -0.4236618388782848, "num_chars": 33}, {"sum_logits": -14.454010009765625, "num_tokens": 7, "num_tokens_all": 243, "is_greedy": false, "sum_logits_uncond": -30.237173080444336, "logits_per_token": -2.0648585728236606, "logits_per_char": -0.4516878128051758, "num_chars": 32}, {"sum_logits": -10.370899200439453, "num_tokens": 8, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -21.793777465820312, "logits_per_token": -1.2963624000549316, "logits_per_char": -0.27291840001156453, "num_chars": 38}, {"sum_logits": -19.16506576538086, "num_tokens": 8, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -35.542938232421875, "logits_per_token": -2.3956332206726074, "logits_per_char": -0.4791266441345215, "num_chars": 40}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 65, "native_id": "Mercury_406777", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.05935287475586, "incorrect_loss_raw": 21.709571838378906, "correct_loss_per_char": 0.5541934967041016, "incorrect_loss_per_char": 0.5625501134311178, "correct_loss_per_token": 2.6324191093444824, "incorrect_loss_per_token": 3.2629321113465326, "correct_loss_uncond": -10.479206085205078, "incorrect_loss_uncond": -12.490618387858072}, "model_output": [{"sum_logits": -21.05935287475586, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -31.538558959960938, "logits_per_token": -2.6324191093444824, "logits_per_char": -0.5541934967041016, "num_chars": 38}, {"sum_logits": -20.357152938842773, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -33.87384796142578, "logits_per_token": -3.3928588231404624, "logits_per_char": -0.5816329411097936, "num_chars": 35}, {"sum_logits": -22.9163761138916, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -35.20536422729492, "logits_per_token": -3.2737680162702287, "logits_per_char": -0.5456280027117048, "num_chars": 42}, {"sum_logits": -21.855186462402344, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -33.521358489990234, "logits_per_token": -3.1221694946289062, "logits_per_char": -0.5603893964718549, "num_chars": 39}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 66, "native_id": "Mercury_LBS11018", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.529659271240234, "incorrect_loss_raw": 13.105818430582682, "correct_loss_per_char": 0.3860345681508382, "incorrect_loss_per_char": 0.5925766330918455, "correct_loss_per_token": 1.6845144792036577, "incorrect_loss_per_token": 2.1516902923583983, "correct_loss_uncond": -20.212528228759766, "incorrect_loss_uncond": -21.36539077758789}, "model_output": [{"sum_logits": -18.529659271240234, "num_tokens": 11, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -38.7421875, "logits_per_token": -1.6845144792036577, "logits_per_char": -0.3860345681508382, "num_chars": 48}, {"sum_logits": -14.08420181274414, "num_tokens": 10, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -38.79602813720703, "logits_per_token": -1.4084201812744142, "logits_per_char": -0.29342087109883624, "num_chars": 48}, {"sum_logits": -13.560400009155273, "num_tokens": 5, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -32.087257385253906, "logits_per_token": -2.7120800018310547, "logits_per_char": -0.7976705887738396, "num_chars": 17}, {"sum_logits": -11.672853469848633, "num_tokens": 5, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -32.53034210205078, "logits_per_token": -2.3345706939697264, "logits_per_char": -0.6866384394028607, "num_chars": 17}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 67, "native_id": "Mercury_7139878", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.219393730163574, "incorrect_loss_raw": 13.490390141805014, "correct_loss_per_char": 0.7344107627868652, "incorrect_loss_per_char": 0.7302241184093333, "correct_loss_per_token": 3.3048484325408936, "incorrect_loss_per_token": 4.728665828704834, "correct_loss_uncond": -13.814888954162598, "incorrect_loss_uncond": -10.10593350728353}, "model_output": [{"sum_logits": -17.271230697631836, "num_tokens": 4, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -25.615802764892578, "logits_per_token": -4.317807674407959, "logits_per_char": -0.959512816535102, "num_chars": 18}, {"sum_logits": -12.809259414672852, "num_tokens": 2, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -20.93571662902832, "logits_per_token": -6.404629707336426, "logits_per_char": -0.7116255230373807, "num_chars": 18}, {"sum_logits": -13.219393730163574, "num_tokens": 4, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -27.034282684326172, "logits_per_token": -3.3048484325408936, "logits_per_char": -0.7344107627868652, "num_chars": 18}, {"sum_logits": -10.390680313110352, "num_tokens": 3, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -24.237451553344727, "logits_per_token": -3.463560104370117, "logits_per_char": -0.5195340156555176, "num_chars": 20}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 68, "native_id": "Mercury_417147", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 13.275529861450195, "incorrect_loss_raw": 14.456790924072266, "correct_loss_per_char": 0.6321680886404855, "incorrect_loss_per_char": 0.7434662137712751, "correct_loss_per_token": 2.655105972290039, "incorrect_loss_per_token": 3.399027294582791, "correct_loss_uncond": -15.800168991088867, "incorrect_loss_uncond": -11.487666447957357}, "model_output": [{"sum_logits": -14.944927215576172, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -23.020816802978516, "logits_per_token": -3.736231803894043, "logits_per_char": -1.0674948011125838, "num_chars": 14}, {"sum_logits": -13.275529861450195, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -29.075698852539062, "logits_per_token": -2.655105972290039, "logits_per_char": -0.6321680886404855, "num_chars": 21}, {"sum_logits": -10.339654922485352, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -22.808427810668945, "logits_per_token": -3.4465516408284507, "logits_per_char": -0.5169827461242675, "num_chars": 20}, {"sum_logits": -18.085790634155273, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -32.004127502441406, "logits_per_token": -3.014298439025879, "logits_per_char": -0.6459210940769741, "num_chars": 28}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 69, "native_id": "Mercury_7016765", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.368213653564453, "incorrect_loss_raw": 25.394262313842773, "correct_loss_per_char": 0.6202933447701591, "incorrect_loss_per_char": 0.7224726788199313, "correct_loss_per_token": 2.4811733790806363, "incorrect_loss_per_token": 3.9447818067338734, "correct_loss_uncond": -5.624393463134766, "incorrect_loss_uncond": -5.82881228129069}, "model_output": [{"sum_logits": -17.368213653564453, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -22.99260711669922, "logits_per_token": -2.4811733790806363, "logits_per_char": -0.6202933447701591, "num_chars": 28}, {"sum_logits": -28.40737533569336, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -39.14636993408203, "logits_per_token": -4.7345625559488935, "logits_per_char": -0.8116392953055246, "num_chars": 35}, {"sum_logits": -27.068553924560547, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -31.129159927368164, "logits_per_token": -4.511425654093425, "logits_per_char": -0.7961339389576632, "num_chars": 34}, {"sum_logits": -20.706857681274414, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -23.393693923950195, "logits_per_token": -2.5883572101593018, "logits_per_char": -0.5596448021966058, "num_chars": 37}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 70, "native_id": "Mercury_415303", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 3.8611788749694824, "incorrect_loss_raw": 4.235520760218303, "correct_loss_per_char": 1.9305894374847412, "incorrect_loss_per_char": 2.1177603801091514, "correct_loss_per_token": 3.8611788749694824, "incorrect_loss_per_token": 4.235520760218303, "correct_loss_uncond": -1.9166293144226074, "incorrect_loss_uncond": -0.9010277589162191}, "model_output": [{"sum_logits": -3.8611788749694824, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -5.77780818939209, "logits_per_token": -3.8611788749694824, "logits_per_char": -1.9305894374847412, "num_chars": 2}, {"sum_logits": -4.773500442504883, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -5.551112174987793, "logits_per_token": -4.773500442504883, "logits_per_char": -2.3867502212524414, "num_chars": 2}, {"sum_logits": -4.364684581756592, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -5.410497665405273, "logits_per_token": -4.364684581756592, "logits_per_char": -2.182342290878296, "num_chars": 2}, {"sum_logits": -3.5683772563934326, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -4.448035717010498, "logits_per_token": -3.5683772563934326, "logits_per_char": -1.7841886281967163, "num_chars": 2}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 71, "native_id": "Mercury_7215845", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.270228385925293, "incorrect_loss_raw": 10.255584557851156, "correct_loss_per_char": 0.7510646518908048, "incorrect_loss_per_char": 0.6413870984350729, "correct_loss_per_token": 3.5675570964813232, "incorrect_loss_per_token": 3.278381043010288, "correct_loss_uncond": -6.552824974060059, "incorrect_loss_uncond": -8.371724923451742}, "model_output": [{"sum_logits": -12.246793746948242, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -19.73375701904297, "logits_per_token": -4.082264582316081, "logits_per_char": -0.8164529164632162, "num_chars": 15}, {"sum_logits": -13.474662780761719, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -20.189922332763672, "logits_per_token": -4.491554260253906, "logits_per_char": -0.8421664237976074, "num_chars": 16}, {"sum_logits": -14.270228385925293, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -20.82305335998535, "logits_per_token": -3.5675570964813232, "logits_per_char": -0.7510646518908048, "num_chars": 19}, {"sum_logits": -5.045297145843506, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -15.95824909210205, "logits_per_token": -1.2613242864608765, "logits_per_char": -0.26554195504439504, "num_chars": 19}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 72, "native_id": "Mercury_7136885", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 13.132079124450684, "incorrect_loss_raw": 14.08973757425944, "correct_loss_per_char": 0.3979417916500207, "incorrect_loss_per_char": 0.43697534002290167, "correct_loss_per_token": 1.8760113034929549, "incorrect_loss_per_token": 2.268185461892022, "correct_loss_uncond": -23.52319049835205, "incorrect_loss_uncond": -14.957493464152018}, "model_output": [{"sum_logits": -11.800020217895508, "num_tokens": 5, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -24.659191131591797, "logits_per_token": -2.3600040435791017, "logits_per_char": -0.5363645553588867, "num_chars": 22}, {"sum_logits": -13.132079124450684, "num_tokens": 7, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -36.655269622802734, "logits_per_token": -1.8760113034929549, "logits_per_char": -0.3979417916500207, "num_chars": 33}, {"sum_logits": -15.261678695678711, "num_tokens": 6, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -30.599857330322266, "logits_per_token": -2.5436131159464517, "logits_per_char": -0.4124778025859111, "num_chars": 37}, {"sum_logits": -15.207513809204102, "num_tokens": 8, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -31.882644653320312, "logits_per_token": -1.9009392261505127, "logits_per_char": -0.3620836621239072, "num_chars": 42}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 73, "native_id": "Mercury_SC_400059", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.430605888366699, "incorrect_loss_raw": 13.186315218607584, "correct_loss_per_char": 0.297224235534668, "incorrect_loss_per_char": 0.7497828901381719, "correct_loss_per_token": 1.4861211776733398, "incorrect_loss_per_token": 3.5804824193318687, "correct_loss_uncond": -19.63345432281494, "incorrect_loss_uncond": -14.240914344787598}, "model_output": [{"sum_logits": -8.582184791564941, "num_tokens": 3, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -22.989707946777344, "logits_per_token": -2.8607282638549805, "logits_per_char": -0.6130131993974958, "num_chars": 14}, {"sum_logits": -12.640251159667969, "num_tokens": 3, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -27.280803680419922, "logits_per_token": -4.213417053222656, "logits_per_char": -0.9028750828334263, "num_chars": 14}, {"sum_logits": -18.336509704589844, "num_tokens": 5, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -32.01117706298828, "logits_per_token": -3.6673019409179686, "logits_per_char": -0.7334603881835937, "num_chars": 25}, {"sum_logits": -7.430605888366699, "num_tokens": 5, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -27.06406021118164, "logits_per_token": -1.4861211776733398, "logits_per_char": -0.297224235534668, "num_chars": 25}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 74, "native_id": "Mercury_7044328", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.094627380371094, "incorrect_loss_raw": 20.979564666748047, "correct_loss_per_char": 0.5325434159259407, "incorrect_loss_per_char": 0.5199394304790194, "correct_loss_per_token": 3.2618284225463867, "incorrect_loss_per_token": 3.311528894636366, "correct_loss_uncond": -11.24493408203125, "incorrect_loss_uncond": -9.997924168904623}, "model_output": [{"sum_logits": -20.389163970947266, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -31.705888748168945, "logits_per_token": -3.3981939951578775, "logits_per_char": -0.5825475420270647, "num_chars": 35}, {"sum_logits": -16.236019134521484, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -25.558467864990234, "logits_per_token": -3.247203826904297, "logits_per_char": -0.45100053151448566, "num_chars": 36}, {"sum_logits": -26.31351089477539, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -35.66810989379883, "logits_per_token": -3.289188861846924, "logits_per_char": -0.5262702178955078, "num_chars": 50}, {"sum_logits": -26.094627380371094, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -37.339561462402344, "logits_per_token": -3.2618284225463867, "logits_per_char": -0.5325434159259407, "num_chars": 49}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 75, "native_id": "MEA_2010_8_1", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.857959747314453, "incorrect_loss_raw": 12.416861216227213, "correct_loss_per_char": 0.7583045959472656, "incorrect_loss_per_char": 0.35284263013130546, "correct_loss_per_token": 3.285986582438151, "incorrect_loss_per_token": 1.3845337055347582, "correct_loss_uncond": -9.093774795532227, "incorrect_loss_uncond": -13.767760594685873}, "model_output": [{"sum_logits": -9.857959747314453, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -18.95173454284668, "logits_per_token": -3.285986582438151, "logits_per_char": -0.7583045959472656, "num_chars": 13}, {"sum_logits": -5.741381645202637, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -17.35190773010254, "logits_per_token": -1.1482763290405273, "logits_per_char": -0.2733991259620303, "num_chars": 21}, {"sum_logits": -13.66408634185791, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -26.88288116455078, "logits_per_token": -1.51823181576199, "logits_per_char": -0.3795579539404975, "num_chars": 36}, {"sum_logits": -17.845115661621094, "num_tokens": 12, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -34.31907653808594, "logits_per_token": -1.4870929718017578, "logits_per_char": -0.4055708104913885, "num_chars": 44}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 76, "native_id": "Mercury_414099", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.831405639648438, "incorrect_loss_raw": 16.354807535807293, "correct_loss_per_char": 0.46119547999182414, "incorrect_loss_per_char": 0.3843180191711965, "correct_loss_per_token": 2.4789257049560547, "incorrect_loss_per_token": 2.1106164205641975, "correct_loss_uncond": -24.43604278564453, "incorrect_loss_uncond": -22.416255950927734}, "model_output": [{"sum_logits": -11.10316276550293, "num_tokens": 7, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -30.62347412109375, "logits_per_token": -1.5861661093575614, "logits_per_char": -0.30842118793063694, "num_chars": 36}, {"sum_logits": -19.831405639648438, "num_tokens": 8, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -44.26744842529297, "logits_per_token": -2.4789257049560547, "logits_per_char": -0.46119547999182414, "num_chars": 43}, {"sum_logits": -22.156333923339844, "num_tokens": 7, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -47.233951568603516, "logits_per_token": -3.1651905604771207, "logits_per_char": -0.5152635796125545, "num_chars": 43}, {"sum_logits": -15.804925918579102, "num_tokens": 10, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -38.45576477050781, "logits_per_token": -1.5804925918579102, "logits_per_char": -0.32926928997039795, "num_chars": 48}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 77, "native_id": "Mercury_410807", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.657766342163086, "incorrect_loss_raw": 18.46298599243164, "correct_loss_per_char": 0.3714387924944768, "incorrect_loss_per_char": 0.3887093680351394, "correct_loss_per_token": 2.059796940196644, "incorrect_loss_per_token": 1.9480085141731032, "correct_loss_uncond": -16.044466018676758, "incorrect_loss_uncond": -22.12422053019206}, "model_output": [{"sum_logits": -18.648681640625, "num_tokens": 8, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -45.414249420166016, "logits_per_token": -2.331085205078125, "logits_per_char": -0.47817132411858976, "num_chars": 39}, {"sum_logits": -22.657766342163086, "num_tokens": 11, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -38.702232360839844, "logits_per_token": -2.059796940196644, "logits_per_char": -0.3714387924944768, "num_chars": 61}, {"sum_logits": -19.020673751831055, "num_tokens": 10, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -32.75313186645508, "logits_per_token": -1.9020673751831054, "logits_per_char": -0.3657821875352126, "num_chars": 52}, {"sum_logits": -17.719602584838867, "num_tokens": 11, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -43.59423828125, "logits_per_token": -1.6108729622580789, "logits_per_char": -0.3221745924516158, "num_chars": 55}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 78, "native_id": "Mercury_403234", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.343740463256836, "incorrect_loss_raw": 18.88895098368327, "correct_loss_per_char": 0.4717985478843131, "incorrect_loss_per_char": 0.4285971894248241, "correct_loss_per_token": 2.7633914947509766, "incorrect_loss_per_token": 2.02260529200236, "correct_loss_uncond": -7.53071403503418, "incorrect_loss_uncond": -10.58499272664388}, "model_output": [{"sum_logits": -22.56859588623047, "num_tokens": 10, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -32.497169494628906, "logits_per_token": -2.2568595886230467, "logits_per_char": -0.48018289119639296, "num_chars": 47}, {"sum_logits": -19.343740463256836, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -26.874454498291016, "logits_per_token": -2.7633914947509766, "logits_per_char": -0.4717985478843131, "num_chars": 41}, {"sum_logits": -16.045223236083984, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -29.19464111328125, "logits_per_token": -2.005652904510498, "logits_per_char": -0.45843494960239956, "num_chars": 35}, {"sum_logits": -18.05303382873535, "num_tokens": 10, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -26.73002052307129, "logits_per_token": -1.8053033828735352, "logits_per_char": -0.3471737274756798, "num_chars": 52}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 79, "native_id": "Mercury_7011323", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.880535125732422, "incorrect_loss_raw": 9.568838755289713, "correct_loss_per_char": 0.9880535125732421, "incorrect_loss_per_char": 0.7724618776880129, "correct_loss_per_token": 4.940267562866211, "incorrect_loss_per_token": 5.192153559790717, "correct_loss_uncond": -6.608249664306641, "incorrect_loss_uncond": -6.968087514241536}, "model_output": [{"sum_logits": -9.880535125732422, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -16.488784790039062, "logits_per_token": -4.940267562866211, "logits_per_char": -0.9880535125732421, "num_chars": 10}, {"sum_logits": -10.18373966217041, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -17.87687873840332, "logits_per_token": -5.091869831085205, "logits_per_char": -0.9257945147427645, "num_chars": 11}, {"sum_logits": -6.465497970581055, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -14.29965591430664, "logits_per_token": -6.465497970581055, "logits_per_char": -0.5877725427800958, "num_chars": 11}, {"sum_logits": -12.057278633117676, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.43424415588379, "logits_per_token": -4.019092877705892, "logits_per_char": -0.8038185755411784, "num_chars": 15}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 80, "native_id": "Mercury_7109463", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.620692253112793, "incorrect_loss_raw": 10.3297758102417, "correct_loss_per_char": 0.4084881635812613, "incorrect_loss_per_char": 0.5051066745383345, "correct_loss_per_token": 2.6551730632781982, "incorrect_loss_per_token": 3.2908428774939646, "correct_loss_uncond": -13.581738471984863, "incorrect_loss_uncond": -11.136907895406088}, "model_output": [{"sum_logits": -15.487165451049805, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -21.792190551757812, "logits_per_token": -5.1623884836832685, "logits_per_char": -0.860398080613878, "num_chars": 18}, {"sum_logits": -10.015195846557617, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -22.784414291381836, "logits_per_token": -3.3383986155192056, "logits_per_char": -0.43544329767641815, "num_chars": 23}, {"sum_logits": -5.486966133117676, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -19.82344627380371, "logits_per_token": -1.371741533279419, "logits_per_char": -0.21947864532470704, "num_chars": 25}, {"sum_logits": -10.620692253112793, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -24.202430725097656, "logits_per_token": -2.6551730632781982, "logits_per_char": -0.4084881635812613, "num_chars": 26}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 81, "native_id": "Mercury_SC_401277", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 6.42153263092041, "incorrect_loss_raw": 7.6357496579488116, "correct_loss_per_char": 0.40134578943252563, "incorrect_loss_per_char": 0.38299968827742664, "correct_loss_per_token": 3.210766315460205, "incorrect_loss_per_token": 3.170002063115438, "correct_loss_uncond": -18.818974494934082, "incorrect_loss_uncond": -15.252114137013754}, "model_output": [{"sum_logits": -7.774473190307617, "num_tokens": 4, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -24.49878692626953, "logits_per_token": -1.9436182975769043, "logits_per_char": -0.31097892761230467, "num_chars": 25}, {"sum_logits": -6.711658000946045, "num_tokens": 2, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -19.586332321166992, "logits_per_token": -3.3558290004730225, "logits_per_char": -0.3948034118203556, "num_chars": 17}, {"sum_logits": -6.42153263092041, "num_tokens": 2, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -25.240507125854492, "logits_per_token": -3.210766315460205, "logits_per_char": -0.40134578943252563, "num_chars": 16}, {"sum_logits": -8.421117782592773, "num_tokens": 2, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -24.578472137451172, "logits_per_token": -4.210558891296387, "logits_per_char": -0.44321672539961965, "num_chars": 19}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 82, "native_id": "MCAS_2005_5_25", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.704893589019775, "incorrect_loss_raw": 7.967535972595215, "correct_loss_per_char": 0.6338770654466417, "incorrect_loss_per_char": 0.9445082603938996, "correct_loss_per_token": 1.901631196339925, "incorrect_loss_per_token": 2.655845324198405, "correct_loss_uncond": -10.677173137664795, "incorrect_loss_uncond": -9.719110806783041}, "model_output": [{"sum_logits": -7.828768730163574, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -16.850234985351562, "logits_per_token": -2.6095895767211914, "logits_per_char": -1.1183955328805106, "num_chars": 7}, {"sum_logits": -6.3767595291137695, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -16.87716293334961, "logits_per_token": -2.12558650970459, "logits_per_char": -0.6376759529113769, "num_chars": 10}, {"sum_logits": -5.704893589019775, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -16.38206672668457, "logits_per_token": -1.901631196339925, "logits_per_char": -0.6338770654466417, "num_chars": 9}, {"sum_logits": -9.6970796585083, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -19.332542419433594, "logits_per_token": -3.2323598861694336, "logits_per_char": -1.0774532953898113, "num_chars": 9}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 83, "native_id": "Mercury_SC_401272", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.717212677001953, "incorrect_loss_raw": 15.484755833943685, "correct_loss_per_char": 0.4224875076957371, "incorrect_loss_per_char": 1.0529174607144103, "correct_loss_per_token": 2.4293031692504883, "incorrect_loss_per_token": 5.161585277981228, "correct_loss_uncond": -7.382595062255859, "incorrect_loss_uncond": -5.4034849802653}, "model_output": [{"sum_logits": -15.625025749206543, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -22.49529266357422, "logits_per_token": -5.208341916402181, "logits_per_char": -1.2019250576312726, "num_chars": 13}, {"sum_logits": -18.27617073059082, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -22.05172348022461, "logits_per_token": -6.09205691019694, "logits_per_char": -1.218411382039388, "num_chars": 15}, {"sum_logits": -12.553071022033691, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -18.117706298828125, "logits_per_token": -4.1843570073445635, "logits_per_char": -0.73841594247257, "num_chars": 17}, {"sum_logits": -9.717212677001953, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -17.099807739257812, "logits_per_token": -2.4293031692504883, "logits_per_char": -0.4224875076957371, "num_chars": 23}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 84, "native_id": "Mercury_7103600", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.3204400539398193, "incorrect_loss_raw": 8.078768809636435, "correct_loss_per_char": 0.17476000283893786, "incorrect_loss_per_char": 0.51408617931699, "correct_loss_per_token": 1.1068133513132732, "incorrect_loss_per_token": 2.6929229365454783, "correct_loss_uncond": -17.351703882217407, "incorrect_loss_uncond": -13.580043395360311}, "model_output": [{"sum_logits": -8.509905815124512, "num_tokens": 3, "num_tokens_all": 252, "is_greedy": false, "sum_logits_uncond": -19.058368682861328, "logits_per_token": -2.8366352717081704, "logits_per_char": -0.6078504153660366, "num_chars": 14}, {"sum_logits": -12.467743873596191, "num_tokens": 3, "num_tokens_all": 252, "is_greedy": false, "sum_logits_uncond": -24.079578399658203, "logits_per_token": -4.1559146245320635, "logits_per_char": -0.779233992099762, "num_chars": 16}, {"sum_logits": -3.2586567401885986, "num_tokens": 3, "num_tokens_all": 252, "is_greedy": false, "sum_logits_uncond": -21.838489532470703, "logits_per_token": -1.0862189133961995, "logits_per_char": -0.15517413048517137, "num_chars": 21}, {"sum_logits": -3.3204400539398193, "num_tokens": 3, "num_tokens_all": 252, "is_greedy": false, "sum_logits_uncond": -20.672143936157227, "logits_per_token": -1.1068133513132732, "logits_per_char": -0.17476000283893786, "num_chars": 19}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 85, "native_id": "MDSA_2009_8_2", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.1722307205200195, "incorrect_loss_raw": 7.598868767420451, "correct_loss_per_char": 0.24135896894666883, "incorrect_loss_per_char": 0.9060552148591904, "correct_loss_per_token": 1.0861153602600098, "incorrect_loss_per_token": 3.7994343837102256, "correct_loss_uncond": -10.341822624206543, "incorrect_loss_uncond": -5.549264828364055}, "model_output": [{"sum_logits": -2.1722307205200195, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -12.514053344726562, "logits_per_token": -1.0861153602600098, "logits_per_char": -0.24135896894666883, "num_chars": 9}, {"sum_logits": -4.723155498504639, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -13.118414878845215, "logits_per_token": -2.3615777492523193, "logits_per_char": -0.5903944373130798, "num_chars": 8}, {"sum_logits": -14.30573558807373, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -15.147061347961426, "logits_per_token": -7.152867794036865, "logits_per_char": -1.5895261764526367, "num_chars": 9}, {"sum_logits": -3.7677152156829834, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -11.178924560546875, "logits_per_token": -1.8838576078414917, "logits_per_char": -0.5382450308118548, "num_chars": 7}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 86, "native_id": "Mercury_7127943", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 25.226289749145508, "incorrect_loss_raw": 30.445624669392902, "correct_loss_per_char": 0.42043816248575844, "incorrect_loss_per_char": 0.6242213162722235, "correct_loss_per_token": 2.522628974914551, "incorrect_loss_per_token": 3.56887420018514, "correct_loss_uncond": -16.423856735229492, "incorrect_loss_uncond": -10.142881393432617}, "model_output": [{"sum_logits": -25.226289749145508, "num_tokens": 10, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -41.650146484375, "logits_per_token": -2.522628974914551, "logits_per_char": -0.42043816248575844, "num_chars": 60}, {"sum_logits": -40.28110885620117, "num_tokens": 8, "num_tokens_all": 242, "is_greedy": false, "sum_logits_uncond": -47.636260986328125, "logits_per_token": -5.0351386070251465, "logits_per_char": -0.8951357523600261, "num_chars": 45}, {"sum_logits": -22.63629913330078, "num_tokens": 8, "num_tokens_all": 242, "is_greedy": false, "sum_logits_uncond": -27.20172882080078, "logits_per_token": -2.8295373916625977, "logits_per_char": -0.5264255612395531, "num_chars": 43}, {"sum_logits": -28.419466018676758, "num_tokens": 10, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -46.927528381347656, "logits_per_token": -2.841946601867676, "logits_per_char": -0.4511026352170914, "num_chars": 63}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 87, "native_id": "ACTAAP_2009_7_8", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 49.118194580078125, "incorrect_loss_raw": 40.46920522054037, "correct_loss_per_char": 0.49118194580078123, "incorrect_loss_per_char": 0.5169785349572033, "correct_loss_per_token": 2.135573677394701, "incorrect_loss_per_token": 2.6824913887750537, "correct_loss_uncond": -0.9595413208007812, "incorrect_loss_uncond": -1.3917045593261719}, "model_output": [{"sum_logits": -33.74409484863281, "num_tokens": 15, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -33.473838806152344, "logits_per_token": -2.2496063232421877, "logits_per_char": -0.46866798400878906, "num_chars": 72}, {"sum_logits": -35.716552734375, "num_tokens": 14, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -41.301082611083984, "logits_per_token": -2.551182338169643, "logits_per_char": -0.46385133421266234, "num_chars": 77}, {"sum_logits": -51.94696807861328, "num_tokens": 16, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -50.80780792236328, "logits_per_token": -3.24668550491333, "logits_per_char": -0.6184162866501581, "num_chars": 84}, {"sum_logits": -49.118194580078125, "num_tokens": 23, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -50.077735900878906, "logits_per_token": -2.135573677394701, "logits_per_char": -0.49118194580078123, "num_chars": 100}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 88, "native_id": "MCAS_2006_9_43", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.0808048248291, "incorrect_loss_raw": 18.537102381388348, "correct_loss_per_char": 1.3139080634483924, "incorrect_loss_per_char": 1.3646395528418385, "correct_loss_per_token": 2.1351006031036377, "incorrect_loss_per_token": 2.5997423860761852, "correct_loss_uncond": -14.189546585083008, "incorrect_loss_uncond": -13.197977701822916}, "model_output": [{"sum_logits": -17.927734375, "num_tokens": 8, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -34.06835174560547, "logits_per_token": -2.240966796875, "logits_per_char": -1.1951822916666666, "num_chars": 15}, {"sum_logits": -17.336042404174805, "num_tokens": 8, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -29.803966522216797, "logits_per_token": -2.1670053005218506, "logits_per_char": -1.333541723398062, "num_chars": 13}, {"sum_logits": -17.0808048248291, "num_tokens": 8, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -31.27035140991211, "logits_per_token": -2.1351006031036377, "logits_per_char": -1.3139080634483924, "num_chars": 13}, {"sum_logits": -20.347530364990234, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -31.332921981811523, "logits_per_token": -3.3912550608317056, "logits_per_char": -1.5651946434607873, "num_chars": 13}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 89, "native_id": "Mercury_7252088", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.051280975341797, "incorrect_loss_raw": 16.091908137003582, "correct_loss_per_char": 0.8232600348336356, "incorrect_loss_per_char": 0.5839517633170969, "correct_loss_per_token": 4.610256195068359, "incorrect_loss_per_token": 3.7871994018554687, "correct_loss_uncond": -10.326217651367188, "incorrect_loss_uncond": -15.590429306030273}, "model_output": [{"sum_logits": -15.367977142333984, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -31.179153442382812, "logits_per_token": -3.841994285583496, "logits_per_char": -0.5691843386049624, "num_chars": 27}, {"sum_logits": -18.761089324951172, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -34.097557067871094, "logits_per_token": -4.690272331237793, "logits_per_char": -0.6948551601833768, "num_chars": 27}, {"sum_logits": -23.051280975341797, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.377498626708984, "logits_per_token": -4.610256195068359, "logits_per_char": -0.8232600348336356, "num_chars": 28}, {"sum_logits": -14.146657943725586, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.770301818847656, "logits_per_token": -2.8293315887451174, "logits_per_char": -0.48781579116295126, "num_chars": 29}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 90, "native_id": "Mercury_7084665", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.01767635345459, "incorrect_loss_raw": 8.53351879119873, "correct_loss_per_char": 0.1913179215930757, "incorrect_loss_per_char": 0.7830959974279725, "correct_loss_per_token": 1.0044190883636475, "incorrect_loss_per_token": 3.0670037269592285, "correct_loss_uncond": -13.045804023742676, "incorrect_loss_uncond": -7.693522771199544}, "model_output": [{"sum_logits": -4.01767635345459, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -17.063480377197266, "logits_per_token": -1.0044190883636475, "logits_per_char": -0.1913179215930757, "num_chars": 21}, {"sum_logits": -11.997556686401367, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -24.731441497802734, "logits_per_token": -2.3995113372802734, "logits_per_char": -0.5216328994087551, "num_chars": 23}, {"sum_logits": -6.592672824859619, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.235304832458496, "logits_per_token": -3.2963364124298096, "logits_per_char": -0.6592672824859619, "num_chars": 10}, {"sum_logits": -7.010326862335205, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -9.714378356933594, "logits_per_token": -3.5051634311676025, "logits_per_char": -1.168387810389201, "num_chars": 6}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 91, "native_id": "FCAT_2008_5_2", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 30.96869659423828, "incorrect_loss_raw": 32.726216634114586, "correct_loss_per_char": 0.5161449432373046, "incorrect_loss_per_char": 0.813034536475064, "correct_loss_per_token": 3.4409662882486978, "incorrect_loss_per_token": 4.725762185596285, "correct_loss_uncond": -6.021366119384766, "incorrect_loss_uncond": -4.283151626586914}, "model_output": [{"sum_logits": -20.050094604492188, "num_tokens": 7, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -27.433046340942383, "logits_per_token": -2.8642992292131697, "logits_per_char": -0.5897086648380055, "num_chars": 34}, {"sum_logits": -37.12602996826172, "num_tokens": 6, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -38.3703727722168, "logits_per_token": -6.187671661376953, "logits_per_char": -0.9770007886384663, "num_chars": 38}, {"sum_logits": -41.002525329589844, "num_tokens": 8, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -45.22468566894531, "logits_per_token": -5.1253156661987305, "logits_per_char": -0.87239415594872, "num_chars": 47}, {"sum_logits": -30.96869659423828, "num_tokens": 9, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -36.99006271362305, "logits_per_token": -3.4409662882486978, "logits_per_char": -0.5161449432373046, "num_chars": 60}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 92, "native_id": "Mercury_SC_414041", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.00343132019043, "incorrect_loss_raw": 24.0554354985555, "correct_loss_per_char": 0.6786939757210868, "incorrect_loss_per_char": 0.6561810045134099, "correct_loss_per_token": 3.800686264038086, "incorrect_loss_per_token": 3.1176388281363026, "correct_loss_uncond": -13.015176773071289, "incorrect_loss_uncond": -12.235133171081543}, "model_output": [{"sum_logits": -29.537630081176758, "num_tokens": 9, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -36.552940368652344, "logits_per_token": -3.281958897908529, "logits_per_char": -0.7773060547678095, "num_chars": 38}, {"sum_logits": -27.616249084472656, "num_tokens": 9, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -42.046409606933594, "logits_per_token": -3.0684721204969616, "logits_per_char": -0.6735670508407965, "num_chars": 41}, {"sum_logits": -15.01242733001709, "num_tokens": 5, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -30.272356033325195, "logits_per_token": -3.002485466003418, "logits_per_char": -0.5176699079316238, "num_chars": 29}, {"sum_logits": -19.00343132019043, "num_tokens": 5, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -32.01860809326172, "logits_per_token": -3.800686264038086, "logits_per_char": -0.6786939757210868, "num_chars": 28}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 93, "native_id": "MCAS_2014_8_20", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.14444351196289, "incorrect_loss_raw": 26.82884470621745, "correct_loss_per_char": 0.6851009889082476, "incorrect_loss_per_char": 0.46233427266739074, "correct_loss_per_token": 3.7680554389953613, "incorrect_loss_per_token": 2.5892878445712006, "correct_loss_uncond": -10.227306365966797, "incorrect_loss_uncond": -18.405601501464844}, "model_output": [{"sum_logits": -30.14444351196289, "num_tokens": 8, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -40.37174987792969, "logits_per_token": -3.7680554389953613, "logits_per_char": -0.6851009889082476, "num_chars": 44}, {"sum_logits": -27.2562255859375, "num_tokens": 10, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -44.657962799072266, "logits_per_token": -2.72562255859375, "logits_per_char": -0.5047449182581019, "num_chars": 54}, {"sum_logits": -22.343421936035156, "num_tokens": 10, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -41.211551666259766, "logits_per_token": -2.2343421936035157, "logits_per_char": -0.3919898585269326, "num_chars": 57}, {"sum_logits": -30.886886596679688, "num_tokens": 11, "num_tokens_all": 240, "is_greedy": false, "sum_logits_uncond": -49.833824157714844, "logits_per_token": -2.8078987815163354, "logits_per_char": -0.4902680412171379, "num_chars": 63}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 94, "native_id": "Mercury_SC_401116", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.178071975708008, "incorrect_loss_raw": 24.396228154500324, "correct_loss_per_char": 0.3370431661605835, "incorrect_loss_per_char": 0.45323651499852163, "correct_loss_per_token": 1.7975635528564453, "incorrect_loss_per_token": 2.3156136358627166, "correct_loss_uncond": -31.888418197631836, "incorrect_loss_uncond": -18.915590286254883}, "model_output": [{"sum_logits": -14.603801727294922, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -37.44184875488281, "logits_per_token": -1.8254752159118652, "logits_per_char": -0.33190458471124823, "num_chars": 44}, {"sum_logits": -16.178071975708008, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -48.066490173339844, "logits_per_token": -1.7975635528564453, "logits_per_char": -0.3370431661605835, "num_chars": 48}, {"sum_logits": -26.998321533203125, "num_tokens": 12, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -43.01615905761719, "logits_per_token": -2.2498601277669272, "logits_per_char": -0.4736547637404057, "num_chars": 57}, {"sum_logits": -31.58656120300293, "num_tokens": 11, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -49.477447509765625, "logits_per_token": -2.8715055639093574, "logits_per_char": -0.5541501965439111, "num_chars": 57}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 95, "native_id": "Mercury_7064680", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.846134662628174, "incorrect_loss_raw": 9.755708694458008, "correct_loss_per_char": 0.356642484664917, "incorrect_loss_per_char": 0.4893177665182165, "correct_loss_per_token": 1.9615336656570435, "incorrect_loss_per_token": 2.438927173614502, "correct_loss_uncond": -14.926608562469482, "incorrect_loss_uncond": -12.655986150105795}, "model_output": [{"sum_logits": -9.494721412658691, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -22.007068634033203, "logits_per_token": -2.373680353164673, "logits_per_char": -0.5274845229254829, "num_chars": 18}, {"sum_logits": -6.690157890319824, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -19.50213623046875, "logits_per_token": -1.672539472579956, "logits_per_char": -0.37167543835110134, "num_chars": 18}, {"sum_logits": -7.846134662628174, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -22.772743225097656, "logits_per_token": -1.9615336656570435, "logits_per_char": -0.356642484664917, "num_chars": 22}, {"sum_logits": -13.082246780395508, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -25.725879669189453, "logits_per_token": -3.270561695098877, "logits_per_char": -0.5687933382780656, "num_chars": 23}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 96, "native_id": "Mercury_7211680", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 17.06717300415039, "incorrect_loss_raw": 27.066790262858074, "correct_loss_per_char": 0.3631313405138381, "incorrect_loss_per_char": 0.5412895074366798, "correct_loss_per_token": 2.133396625518799, "incorrect_loss_per_token": 3.507410038085211, "correct_loss_uncond": -19.819042205810547, "incorrect_loss_uncond": -13.66672388712565}, "model_output": [{"sum_logits": -29.563800811767578, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -40.19117736816406, "logits_per_token": -3.6954751014709473, "logits_per_char": -0.5796823688581878, "num_chars": 51}, {"sum_logits": -30.794279098510742, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -45.13163757324219, "logits_per_token": -3.8492848873138428, "logits_per_char": -0.581024133934165, "num_chars": 53}, {"sum_logits": -17.06717300415039, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -36.88621520996094, "logits_per_token": -2.133396625518799, "logits_per_char": -0.3631313405138381, "num_chars": 47}, {"sum_logits": -20.8422908782959, "num_tokens": 7, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -36.87772750854492, "logits_per_token": -2.9774701254708424, "logits_per_char": -0.46316201951768665, "num_chars": 45}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 97, "native_id": "Mercury_180373", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.11823081970215, "incorrect_loss_raw": 21.86202621459961, "correct_loss_per_char": 0.5661947131156921, "incorrect_loss_per_char": 0.6720916126392505, "correct_loss_per_token": 3.6236461639404296, "incorrect_loss_per_token": 3.2161059223154864, "correct_loss_uncond": -11.15362548828125, "incorrect_loss_uncond": -10.427989959716797}, "model_output": [{"sum_logits": -21.619342803955078, "num_tokens": 7, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -32.32691955566406, "logits_per_token": -3.088477543422154, "logits_per_char": -0.8647737121582031, "num_chars": 25}, {"sum_logits": -18.11823081970215, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -29.2718563079834, "logits_per_token": -3.6236461639404296, "logits_per_char": -0.5661947131156921, "num_chars": 32}, {"sum_logits": -18.83978271484375, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -28.12604522705078, "logits_per_token": -3.76795654296875, "logits_per_char": -0.5233272976345487, "num_chars": 36}, {"sum_logits": -25.126953125, "num_tokens": 9, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -36.417083740234375, "logits_per_token": -2.7918836805555554, "logits_per_char": -0.628173828125, "num_chars": 40}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 98, "native_id": "Mercury_7216248", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.620097160339355, "incorrect_loss_raw": 13.23572858174642, "correct_loss_per_char": 0.3584236094826146, "incorrect_loss_per_char": 0.40680192810258053, "correct_loss_per_token": 1.7025121450424194, "incorrect_loss_per_token": 1.735318310046322, "correct_loss_uncond": -14.0541353225708, "incorrect_loss_uncond": -16.518637975056965}, "model_output": [{"sum_logits": -13.399581909179688, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -30.767751693725586, "logits_per_token": -1.9142259870256697, "logits_per_char": -0.44665273030598956, "num_chars": 30}, {"sum_logits": -13.620097160339355, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -27.674232482910156, "logits_per_token": -1.7025121450424194, "logits_per_char": -0.3584236094826146, "num_chars": 38}, {"sum_logits": -11.612848281860352, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.885818481445312, "logits_per_token": -1.6589783259800501, "logits_per_char": -0.3415543612311868, "num_chars": 34}, {"sum_logits": -14.694755554199219, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -28.609529495239258, "logits_per_token": -1.6327506171332464, "logits_per_char": -0.4321986927705653, "num_chars": 34}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 99, "native_id": "Mercury_SC_417677", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.855381011962891, "incorrect_loss_raw": 21.92103322347005, "correct_loss_per_char": 0.3142152404785156, "incorrect_loss_per_char": 0.6291469997829862, "correct_loss_per_token": 1.5710762023925782, "incorrect_loss_per_token": 2.7857232411702473, "correct_loss_uncond": -15.662593841552734, "incorrect_loss_uncond": -12.795431772867838}, "model_output": [{"sum_logits": -17.808597564697266, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -29.42894744873047, "logits_per_token": -3.561719512939453, "logits_per_char": -0.8094817074862394, "num_chars": 22}, {"sum_logits": -7.855381011962891, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -23.517974853515625, "logits_per_token": -1.5710762023925782, "logits_per_char": -0.3142152404785156, "num_chars": 25}, {"sum_logits": -23.593196868896484, "num_tokens": 10, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -36.40504455566406, "logits_per_token": -2.3593196868896484, "logits_per_char": -0.5242932637532552, "num_chars": 45}, {"sum_logits": -24.361305236816406, "num_tokens": 10, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -38.31540298461914, "logits_per_token": -2.4361305236816406, "logits_per_char": -0.5536660281094637, "num_chars": 44}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 100, "native_id": "Mercury_7221655", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.428779602050781, "incorrect_loss_raw": 19.953506469726562, "correct_loss_per_char": 0.23701771822842685, "incorrect_loss_per_char": 0.49117717467453376, "correct_loss_per_token": 1.7381299336751301, "incorrect_loss_per_token": 2.8505009242466515, "correct_loss_uncond": -12.946504592895508, "incorrect_loss_uncond": -6.560352961222331}, "model_output": [{"sum_logits": -23.7515869140625, "num_tokens": 7, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -28.996719360351562, "logits_per_token": -3.3930838448660716, "logits_per_char": -0.5398087935014204, "num_chars": 44}, {"sum_logits": -16.002212524414062, "num_tokens": 7, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -24.099079132080078, "logits_per_token": -2.2860303606305803, "logits_per_char": -0.39029786644912345, "num_chars": 41}, {"sum_logits": -10.428779602050781, "num_tokens": 6, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -23.37528419494629, "logits_per_token": -1.7381299336751301, "logits_per_char": -0.23701771822842685, "num_chars": 44}, {"sum_logits": -20.106719970703125, "num_tokens": 7, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -26.44577980041504, "logits_per_token": -2.8723885672433034, "logits_per_char": -0.5434248640730575, "num_chars": 37}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 101, "native_id": "MCAS_2006_9_12", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.516674041748047, "incorrect_loss_raw": 22.9234135945638, "correct_loss_per_char": 0.30307036951968547, "incorrect_loss_per_char": 0.6152299074834018, "correct_loss_per_token": 1.4395842552185059, "incorrect_loss_per_token": 2.6578985496803567, "correct_loss_uncond": -34.32450866699219, "incorrect_loss_uncond": -24.925090789794922}, "model_output": [{"sum_logits": -22.501766204833984, "num_tokens": 9, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -52.6619987487793, "logits_per_token": -2.500196244981554, "logits_per_char": -0.6081558433738915, "num_chars": 37}, {"sum_logits": -22.32431411743164, "num_tokens": 9, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -39.031715393066406, "logits_per_token": -2.4804793463812933, "logits_per_char": -0.5724183107033755, "num_chars": 39}, {"sum_logits": -23.94416046142578, "num_tokens": 8, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -51.85179901123047, "logits_per_token": -2.9930200576782227, "logits_per_char": -0.6651155683729384, "num_chars": 36}, {"sum_logits": -11.516674041748047, "num_tokens": 8, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -45.841182708740234, "logits_per_token": -1.4395842552185059, "logits_per_char": -0.30307036951968547, "num_chars": 38}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 102, "native_id": "MCAS_2004_9_2", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 13.518793106079102, "incorrect_loss_raw": 15.072900772094727, "correct_loss_per_char": 0.587773613307787, "incorrect_loss_per_char": 0.6028865404552358, "correct_loss_per_token": 2.7037586212158202, "incorrect_loss_per_token": 3.5537866910298668, "correct_loss_uncond": -20.253908157348633, "incorrect_loss_uncond": -17.032506306966145}, "model_output": [{"sum_logits": -13.518793106079102, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -33.772701263427734, "logits_per_token": -2.7037586212158202, "logits_per_char": -0.587773613307787, "num_chars": 23}, {"sum_logits": -20.75560760498047, "num_tokens": 4, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -40.43293380737305, "logits_per_token": -5.188901901245117, "logits_per_char": -0.6695357291929184, "num_chars": 31}, {"sum_logits": -11.596784591674805, "num_tokens": 4, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -28.51422691345215, "logits_per_token": -2.899196147918701, "logits_per_char": -0.6442658106486002, "num_chars": 18}, {"sum_logits": -12.866310119628906, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -27.369060516357422, "logits_per_token": -2.573262023925781, "logits_per_char": -0.4948580815241887, "num_chars": 26}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 103, "native_id": "Mercury_180005", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.282800674438477, "incorrect_loss_raw": 7.385629971822103, "correct_loss_per_char": 0.3490444819132487, "incorrect_loss_per_char": 0.466136236039419, "correct_loss_per_token": 2.094266891479492, "incorrect_loss_per_token": 2.2684398889541626, "correct_loss_uncond": -17.469993591308594, "incorrect_loss_uncond": -17.928018887837727}, "model_output": [{"sum_logits": -8.018568992614746, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -27.220104217529297, "logits_per_token": -2.672856330871582, "logits_per_char": -0.6682140827178955, "num_chars": 12}, {"sum_logits": -6.282800674438477, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -23.75279426574707, "logits_per_token": -2.094266891479492, "logits_per_char": -0.3490444819132487, "num_chars": 18}, {"sum_logits": -7.174597263336182, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -22.926149368286133, "logits_per_token": -2.3915324211120605, "logits_per_char": -0.3985887368520101, "num_chars": 18}, {"sum_logits": -6.963723659515381, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -25.794692993164062, "logits_per_token": -1.7409309148788452, "logits_per_char": -0.3316058885483515, "num_chars": 21}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 104, "native_id": "Mercury_7071523", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.039055824279785, "incorrect_loss_raw": 10.27755355834961, "correct_loss_per_char": 0.6274409890174866, "incorrect_loss_per_char": 0.8408579793009725, "correct_loss_per_token": 5.019527912139893, "incorrect_loss_per_token": 5.138776779174805, "correct_loss_uncond": -12.70349407196045, "incorrect_loss_uncond": -8.635618845621744}, "model_output": [{"sum_logits": -9.87558364868164, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -17.115171432495117, "logits_per_token": -4.93779182434082, "logits_per_char": -0.7596602806678185, "num_chars": 13}, {"sum_logits": -10.172674179077148, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -20.34821319580078, "logits_per_token": -5.086337089538574, "logits_per_char": -0.7825133983905499, "num_chars": 13}, {"sum_logits": -10.039055824279785, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -22.742549896240234, "logits_per_token": -5.019527912139893, "logits_per_char": -0.6274409890174866, "num_chars": 16}, {"sum_logits": -10.784402847290039, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -19.276132583618164, "logits_per_token": -5.3922014236450195, "logits_per_char": -0.980400258844549, "num_chars": 11}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 105, "native_id": "Mercury_7263375", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.60885238647461, "incorrect_loss_raw": 20.297935167948406, "correct_loss_per_char": 0.45908389371984143, "incorrect_loss_per_char": 0.642707386475285, "correct_loss_per_token": 1.734316931830512, "incorrect_loss_per_token": 2.7651419513440003, "correct_loss_uncond": -17.636234283447266, "incorrect_loss_uncond": -13.348430951436361}, "model_output": [{"sum_logits": -12.71621036529541, "num_tokens": 9, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -30.59539031982422, "logits_per_token": -1.4129122628106012, "logits_per_char": -0.3740061872145709, "num_chars": 34}, {"sum_logits": -15.60885238647461, "num_tokens": 9, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -33.245086669921875, "logits_per_token": -1.734316931830512, "logits_per_char": -0.45908389371984143, "num_chars": 34}, {"sum_logits": -23.27008819580078, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -33.308937072753906, "logits_per_token": -3.324298313685826, "logits_per_char": -0.7506480063161542, "num_chars": 31}, {"sum_logits": -24.907506942749023, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -37.03477096557617, "logits_per_token": -3.5582152775355746, "logits_per_char": -0.8034679658951298, "num_chars": 31}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 106, "native_id": "TIMSS_2011_8_pg102", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 6.06356143951416, "incorrect_loss_raw": 8.821250279744467, "correct_loss_per_char": 0.40423742930094403, "incorrect_loss_per_char": 1.0070480240715873, "correct_loss_per_token": 3.03178071975708, "incorrect_loss_per_token": 7.395987033843994, "correct_loss_uncond": -11.02340030670166, "incorrect_loss_uncond": -6.458757400512695}, "model_output": [{"sum_logits": -8.551579475402832, "num_tokens": 2, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -17.616113662719727, "logits_per_token": -4.275789737701416, "logits_per_char": -1.068947434425354, "num_chars": 8}, {"sum_logits": -11.028376579284668, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -13.966508865356445, "logits_per_token": -11.028376579284668, "logits_per_char": -1.3785470724105835, "num_chars": 8}, {"sum_logits": -6.883794784545898, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -14.257400512695312, "logits_per_token": -6.883794784545898, "logits_per_char": -0.5736495653788248, "num_chars": 12}, {"sum_logits": -6.06356143951416, "num_tokens": 2, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -17.08696174621582, "logits_per_token": -3.03178071975708, "logits_per_char": -0.40423742930094403, "num_chars": 15}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 107, "native_id": "Mercury_406550", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 24.006500244140625, "incorrect_loss_raw": 17.104183197021484, "correct_loss_per_char": 0.5001354217529297, "incorrect_loss_per_char": 0.42553619017327415, "correct_loss_per_token": 2.1824091131036933, "incorrect_loss_per_token": 1.8837838155251962, "correct_loss_uncond": -28.546894073486328, "incorrect_loss_uncond": -24.922379811604817}, "model_output": [{"sum_logits": -18.750473022460938, "num_tokens": 9, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -46.672203063964844, "logits_per_token": -2.0833858913845487, "logits_per_char": -0.4934335005910773, "num_chars": 38}, {"sum_logits": -24.006500244140625, "num_tokens": 11, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -52.55339431762695, "logits_per_token": -2.1824091131036933, "logits_per_char": -0.5001354217529297, "num_chars": 48}, {"sum_logits": -20.091760635375977, "num_tokens": 10, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -43.38407516479492, "logits_per_token": -2.0091760635375975, "logits_per_char": -0.4367774051168691, "num_chars": 46}, {"sum_logits": -12.470315933227539, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -36.02341079711914, "logits_per_token": -1.5587894916534424, "logits_per_char": -0.34639766481187606, "num_chars": 36}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 108, "native_id": "Mercury_SC_400057", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.494540214538574, "incorrect_loss_raw": 17.409557978312176, "correct_loss_per_char": 0.41978160858154295, "incorrect_loss_per_char": 0.6861675869011731, "correct_loss_per_token": 2.098908042907715, "incorrect_loss_per_token": 3.322853999667698, "correct_loss_uncond": -15.926568031311035, "incorrect_loss_uncond": -12.976809819539389}, "model_output": [{"sum_logits": -14.315183639526367, "num_tokens": 6, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -26.748416900634766, "logits_per_token": -2.385863939921061, "logits_per_char": -0.6223992886750594, "num_chars": 23}, {"sum_logits": -18.7684326171875, "num_tokens": 5, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -34.532615661621094, "logits_per_token": -3.7536865234375, "logits_per_char": -0.6703011648995536, "num_chars": 28}, {"sum_logits": -19.145057678222656, "num_tokens": 5, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -29.878070831298828, "logits_per_token": -3.8290115356445313, "logits_per_char": -0.7658023071289063, "num_chars": 25}, {"sum_logits": -10.494540214538574, "num_tokens": 5, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -26.42110824584961, "logits_per_token": -2.098908042907715, "logits_per_char": -0.41978160858154295, "num_chars": 25}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 109, "native_id": "TAKS_2009_5_26", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.648483276367188, "incorrect_loss_raw": 12.35066032409668, "correct_loss_per_char": 1.9414138793945312, "incorrect_loss_per_char": 1.0641219128413695, "correct_loss_per_token": 11.648483276367188, "incorrect_loss_per_token": 4.688230302598741, "correct_loss_uncond": -0.3662738800048828, "incorrect_loss_uncond": -3.174384435017904}, "model_output": [{"sum_logits": -11.648483276367188, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -12.01475715637207, "logits_per_token": -11.648483276367188, "logits_per_char": -1.9414138793945312, "num_chars": 6}, {"sum_logits": -14.977250099182129, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -18.80488395690918, "logits_per_token": -4.992416699727376, "logits_per_char": -0.8810147117165958, "num_chars": 17}, {"sum_logits": -11.790547370910645, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -14.839763641357422, "logits_per_token": -3.930182456970215, "logits_per_char": -0.842181955065046, "num_chars": 14}, {"sum_logits": -10.284183502197266, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -12.930486679077148, "logits_per_token": -5.142091751098633, "logits_per_char": -1.4691690717424666, "num_chars": 7}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 110, "native_id": "LEAP_2007_8_10417", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.056740760803223, "incorrect_loss_raw": 24.549486478169758, "correct_loss_per_char": 0.3172826516000848, "incorrect_loss_per_char": 0.3646239407971965, "correct_loss_per_token": 1.7223915372576033, "incorrect_loss_per_token": 1.7706867836819076, "correct_loss_uncond": -25.83755397796631, "incorrect_loss_uncond": -32.80721060434977}, "model_output": [{"sum_logits": -12.056740760803223, "num_tokens": 7, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -37.89429473876953, "logits_per_token": -1.7223915372576033, "logits_per_char": -0.3172826516000848, "num_chars": 38}, {"sum_logits": -11.37247371673584, "num_tokens": 8, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -41.02680206298828, "logits_per_token": -1.42155921459198, "logits_per_char": -0.2916018901727138, "num_chars": 39}, {"sum_logits": -20.15789031982422, "num_tokens": 13, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -51.55390930175781, "logits_per_token": -1.550606947678786, "logits_per_char": -0.30086403462424205, "num_chars": 67}, {"sum_logits": -42.11809539794922, "num_tokens": 18, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -79.4893798828125, "logits_per_token": -2.3398941887749567, "logits_per_char": -0.5014058975946336, "num_chars": 84}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 111, "native_id": "Mercury_7027405", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.410254001617432, "incorrect_loss_raw": 8.54058043162028, "correct_loss_per_char": 0.801281750202179, "incorrect_loss_per_char": 0.7166519680038013, "correct_loss_per_token": 6.410254001617432, "incorrect_loss_per_token": 5.309188630845812, "correct_loss_uncond": -6.622792720794678, "incorrect_loss_uncond": -8.427261670430502}, "model_output": [{"sum_logits": -6.410254001617432, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -13.03304672241211, "logits_per_token": -6.410254001617432, "logits_per_char": -0.801281750202179, "num_chars": 8}, {"sum_logits": -9.014328002929688, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -14.626222610473633, "logits_per_token": -9.014328002929688, "logits_per_char": -0.8194843639026989, "num_chars": 11}, {"sum_logits": -8.26460075378418, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -19.6718692779541, "logits_per_token": -4.13230037689209, "logits_per_char": -0.688716729482015, "num_chars": 12}, {"sum_logits": -8.342812538146973, "num_tokens": 3, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -16.60543441772461, "logits_per_token": -2.7809375127156577, "logits_per_char": -0.6417548106266902, "num_chars": 13}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 112, "native_id": "Mercury_7058415", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.89219665527344, "incorrect_loss_raw": 26.52558135986328, "correct_loss_per_char": 0.620607484061763, "incorrect_loss_per_char": 0.5555228224086592, "correct_loss_per_token": 3.2892196655273436, "incorrect_loss_per_token": 2.759679921468099, "correct_loss_uncond": -21.458572387695312, "incorrect_loss_uncond": -19.12910334269206}, "model_output": [{"sum_logits": -32.89219665527344, "num_tokens": 10, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -54.35076904296875, "logits_per_token": -3.2892196655273436, "logits_per_char": -0.620607484061763, "num_chars": 53}, {"sum_logits": -33.419681549072266, "num_tokens": 10, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -54.93766403198242, "logits_per_token": -3.3419681549072267, "logits_per_char": -0.6188829916494863, "num_chars": 54}, {"sum_logits": -17.234180450439453, "num_tokens": 10, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -43.36300277709961, "logits_per_token": -1.7234180450439454, "logits_per_char": -0.35904542605082196, "num_chars": 48}, {"sum_logits": -28.922882080078125, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -38.663387298583984, "logits_per_token": -3.213653564453125, "logits_per_char": -0.6886400495256696, "num_chars": 42}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 113, "native_id": "Mercury_7215828", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.661845684051514, "incorrect_loss_raw": 9.18410555521647, "correct_loss_per_char": 0.6965314258228649, "incorrect_loss_per_char": 0.6929131183748932, "correct_loss_per_token": 3.830922842025757, "incorrect_loss_per_token": 3.9364101621839733, "correct_loss_uncond": -10.506836414337158, "incorrect_loss_uncond": -5.543354670206706}, "model_output": [{"sum_logits": -7.661845684051514, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -18.168682098388672, "logits_per_token": -3.830922842025757, "logits_per_char": -0.6965314258228649, "num_chars": 11}, {"sum_logits": -11.801567077636719, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.15665626525879, "logits_per_token": -3.9338556925455728, "logits_per_char": -0.6942098280962776, "num_chars": 17}, {"sum_logits": -5.716362953186035, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -12.253585815429688, "logits_per_token": -2.8581814765930176, "logits_per_char": -0.3810908635457357, "num_chars": 15}, {"sum_logits": -10.03438663482666, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -15.772138595581055, "logits_per_token": -5.01719331741333, "logits_per_char": -1.003438663482666, "num_chars": 10}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 114, "native_id": "Mercury_7064575", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.025627136230469, "incorrect_loss_raw": 14.229814211527506, "correct_loss_per_char": 0.23751650358501233, "incorrect_loss_per_char": 0.39703270933172247, "correct_loss_per_token": 1.5042711893717449, "incorrect_loss_per_token": 2.3716357019212513, "correct_loss_uncond": -18.29623031616211, "incorrect_loss_uncond": -14.089564323425293}, "model_output": [{"sum_logits": -11.477710723876953, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -27.559673309326172, "logits_per_token": -1.9129517873128254, "logits_per_char": -0.33757972717285156, "num_chars": 34}, {"sum_logits": -9.025627136230469, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -27.321857452392578, "logits_per_token": -1.5042711893717449, "logits_per_char": -0.23751650358501233, "num_chars": 38}, {"sum_logits": -17.94757080078125, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -31.26613998413086, "logits_per_token": -2.9912618001302085, "logits_per_char": -0.48506948110219594, "num_chars": 37}, {"sum_logits": -13.264161109924316, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -26.132322311401367, "logits_per_token": -2.2106935183207193, "logits_per_char": -0.3684489197201199, "num_chars": 36}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 115, "native_id": "Mercury_7097493", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 8.130302429199219, "incorrect_loss_raw": 13.16453504562378, "correct_loss_per_char": 0.21973790349187078, "incorrect_loss_per_char": 0.37987387611752466, "correct_loss_per_token": 1.3550504048665364, "incorrect_loss_per_token": 2.074941495108226, "correct_loss_uncond": -18.15813446044922, "incorrect_loss_uncond": -16.374914010365803}, "model_output": [{"sum_logits": -6.577669620513916, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -25.200069427490234, "logits_per_token": -1.0962782700856526, "logits_per_char": -0.26310678482055666, "num_chars": 25}, {"sum_logits": -17.90332794189453, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -32.30377197265625, "logits_per_token": -2.9838879903157554, "logits_per_char": -0.4475831985473633, "num_chars": 40}, {"sum_logits": -15.01260757446289, "num_tokens": 7, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -31.114505767822266, "logits_per_token": -2.14465822492327, "logits_per_char": -0.428931644984654, "num_chars": 35}, {"sum_logits": -8.130302429199219, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -26.288436889648438, "logits_per_token": -1.3550504048665364, "logits_per_char": -0.21973790349187078, "num_chars": 37}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 116, "native_id": "AKDE&ED_2008_8_47", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.43069839477539, "incorrect_loss_raw": 15.113226254781088, "correct_loss_per_char": 0.44407292958852407, "incorrect_loss_per_char": 0.430905872510559, "correct_loss_per_token": 2.738449732462565, "incorrect_loss_per_token": 2.8368303298950193, "correct_loss_uncond": -17.45256805419922, "incorrect_loss_uncond": -19.13423188527425}, "model_output": [{"sum_logits": -12.290522575378418, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -30.56393051147461, "logits_per_token": -2.4581045150756835, "logits_per_char": -0.4552045398288303, "num_chars": 27}, {"sum_logits": -16.43069839477539, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -33.88326644897461, "logits_per_token": -2.738449732462565, "logits_per_char": -0.44407292958852407, "num_chars": 37}, {"sum_logits": -16.32581329345703, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -33.358734130859375, "logits_per_token": -3.2651626586914064, "logits_per_char": -0.4296266656172903, "num_chars": 38}, {"sum_logits": -16.723342895507812, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -38.81970977783203, "logits_per_token": -2.7872238159179688, "logits_per_char": -0.4078864120855564, "num_chars": 41}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 117, "native_id": "Mercury_405136", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 12.738129615783691, "incorrect_loss_raw": 16.636799494425457, "correct_loss_per_char": 0.3184532403945923, "incorrect_loss_per_char": 0.5862175647065362, "correct_loss_per_token": 1.273812961578369, "incorrect_loss_per_token": 2.2613548778352284, "correct_loss_uncond": -11.759493827819824, "incorrect_loss_uncond": -11.498378117879232}, "model_output": [{"sum_logits": -13.816427230834961, "num_tokens": 7, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -29.3967342376709, "logits_per_token": -1.9737753186907088, "logits_per_char": -0.4934438296726772, "num_chars": 28}, {"sum_logits": -16.718402862548828, "num_tokens": 7, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -28.20460319519043, "logits_per_token": -2.388343266078404, "logits_per_char": -0.597085816519601, "num_chars": 28}, {"sum_logits": -12.738129615783691, "num_tokens": 10, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -24.497623443603516, "logits_per_token": -1.273812961578369, "logits_per_char": -0.3184532403945923, "num_chars": 40}, {"sum_logits": -19.375568389892578, "num_tokens": 8, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -26.804195404052734, "logits_per_token": -2.4219460487365723, "logits_per_char": -0.6681230479273302, "num_chars": 29}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 118, "native_id": "Mercury_415086", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 6.508854866027832, "incorrect_loss_raw": 7.70802370707194, "correct_loss_per_char": 0.7232060962253146, "incorrect_loss_per_char": 0.8963304360707601, "correct_loss_per_token": 1.627213716506958, "incorrect_loss_per_token": 1.927005926767985, "correct_loss_uncond": -13.268086433410645, "incorrect_loss_uncond": -11.462029774983725}, "model_output": [{"sum_logits": -8.614805221557617, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -19.142192840576172, "logits_per_token": -2.1537013053894043, "logits_per_char": -1.0768506526947021, "num_chars": 8}, {"sum_logits": -7.484938621520996, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -19.450685501098633, "logits_per_token": -1.871234655380249, "logits_per_char": -0.8316598468356662, "num_chars": 9}, {"sum_logits": -7.024327278137207, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -18.917282104492188, "logits_per_token": -1.7560818195343018, "logits_per_char": -0.7804808086819119, "num_chars": 9}, {"sum_logits": -6.508854866027832, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -19.776941299438477, "logits_per_token": -1.627213716506958, "logits_per_char": -0.7232060962253146, "num_chars": 9}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 119, "native_id": "Mercury_7228725", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.902835845947266, "incorrect_loss_raw": 25.089828491210938, "correct_loss_per_char": 0.5275065852146522, "incorrect_loss_per_char": 0.5028875305855921, "correct_loss_per_token": 3.362854480743408, "incorrect_loss_per_token": 3.219325961007012, "correct_loss_uncond": -15.156871795654297, "incorrect_loss_uncond": -13.382975260416666}, "model_output": [{"sum_logits": -20.72803497314453, "num_tokens": 6, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -37.98272705078125, "logits_per_token": -3.4546724955240884, "logits_per_char": -0.48204732495684954, "num_chars": 43}, {"sum_logits": -29.966413497924805, "num_tokens": 8, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -42.683746337890625, "logits_per_token": -3.7458016872406006, "logits_per_char": -0.5351145267486572, "num_chars": 56}, {"sum_logits": -26.902835845947266, "num_tokens": 8, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -42.05970764160156, "logits_per_token": -3.362854480743408, "logits_per_char": -0.5275065852146522, "num_chars": 51}, {"sum_logits": -24.575037002563477, "num_tokens": 10, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -34.75193786621094, "logits_per_token": -2.4575037002563476, "logits_per_char": -0.49150074005126954, "num_chars": 50}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 120, "native_id": "Mercury_7201740", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 23.84573745727539, "incorrect_loss_raw": 23.581823348999023, "correct_loss_per_char": 0.5073561161122424, "incorrect_loss_per_char": 0.8014722383115963, "correct_loss_per_token": 2.6495263841417103, "incorrect_loss_per_token": 4.101419339861189, "correct_loss_uncond": -14.088470458984375, "incorrect_loss_uncond": -9.45335578918457}, "model_output": [{"sum_logits": -23.84573745727539, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -37.934207916259766, "logits_per_token": -2.6495263841417103, "logits_per_char": -0.5073561161122424, "num_chars": 47}, {"sum_logits": -20.531658172607422, "num_tokens": 5, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -30.726787567138672, "logits_per_token": -4.106331634521484, "logits_per_char": -0.9332571896639738, "num_chars": 22}, {"sum_logits": -32.284629821777344, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -46.899757385253906, "logits_per_token": -4.612089974539621, "logits_per_char": -0.8071157455444335, "num_chars": 40}, {"sum_logits": -17.929182052612305, "num_tokens": 5, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -21.478992462158203, "logits_per_token": -3.585836410522461, "logits_per_char": -0.6640437797263816, "num_chars": 27}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 121, "native_id": "NYSEDREGENTS_2010_4_4", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.523259162902832, "incorrect_loss_raw": 7.813506126403809, "correct_loss_per_char": 1.074751308986119, "incorrect_loss_per_char": 0.6948412335108197, "correct_loss_per_token": 7.523259162902832, "incorrect_loss_per_token": 4.193521022796631, "correct_loss_uncond": -6.536501884460449, "incorrect_loss_uncond": -8.147274017333984}, "model_output": [{"sum_logits": -7.523259162902832, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -14.059761047363281, "logits_per_token": -7.523259162902832, "logits_per_char": -1.074751308986119, "num_chars": 7}, {"sum_logits": -5.973718643188477, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -13.647649765014648, "logits_per_token": -5.973718643188477, "logits_per_char": -0.8533883775983538, "num_chars": 7}, {"sum_logits": -9.65329360961914, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -18.273910522460938, "logits_per_token": -2.413323402404785, "logits_per_char": -0.5362940894232856, "num_chars": 18}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 122, "native_id": "MEAP_2005_8_21", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 22.996397018432617, "incorrect_loss_raw": 17.064798672993977, "correct_loss_per_char": 0.6215242437414221, "incorrect_loss_per_char": 0.6324076041914267, "correct_loss_per_token": 3.2851995740618025, "incorrect_loss_per_token": 3.870266252093845, "correct_loss_uncond": -17.782411575317383, "incorrect_loss_uncond": -12.82016658782959}, "model_output": [{"sum_logits": -20.988739013671875, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -30.608901977539062, "logits_per_token": -5.247184753417969, "logits_per_char": -0.9540335915305398, "num_chars": 22}, {"sum_logits": -14.253602981567383, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -31.809791564941406, "logits_per_token": -2.375600496927897, "logits_per_char": -0.45979364456668975, "num_chars": 31}, {"sum_logits": -15.952054023742676, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -27.236202239990234, "logits_per_token": -3.988013505935669, "logits_per_char": -0.4833955764770508, "num_chars": 33}, {"sum_logits": -22.996397018432617, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -40.77880859375, "logits_per_token": -3.2851995740618025, "logits_per_char": -0.6215242437414221, "num_chars": 37}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 123, "native_id": "Mercury_7026355", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 15.408027648925781, "incorrect_loss_raw": 31.4579340616862, "correct_loss_per_char": 0.35018244656649505, "incorrect_loss_per_char": 0.5505154462659347, "correct_loss_per_token": 1.7120030721028645, "incorrect_loss_per_token": 2.4561521730222906, "correct_loss_uncond": -19.558116912841797, "incorrect_loss_uncond": -12.399398803710938}, "model_output": [{"sum_logits": -15.408027648925781, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -34.96614456176758, "logits_per_token": -1.7120030721028645, "logits_per_char": -0.35018244656649505, "num_chars": 44}, {"sum_logits": -34.5161018371582, "num_tokens": 13, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -42.09563064575195, "logits_per_token": -2.6550847567044773, "logits_per_char": -0.6767863105325138, "num_chars": 51}, {"sum_logits": -29.817909240722656, "num_tokens": 11, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -38.89289093017578, "logits_per_token": -2.710719021883878, "logits_per_char": -0.5053882922156382, "num_chars": 59}, {"sum_logits": -30.039791107177734, "num_tokens": 15, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -50.58347702026367, "logits_per_token": -2.0026527404785157, "logits_per_char": -0.4693717360496521, "num_chars": 64}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 124, "native_id": "Mercury_7249708", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 39.55384063720703, "incorrect_loss_raw": 31.917832056681316, "correct_loss_per_char": 0.6704040785967293, "incorrect_loss_per_char": 0.5343922338325543, "correct_loss_per_token": 3.2961533864339194, "incorrect_loss_per_token": 2.9016210960619375, "correct_loss_uncond": -11.792884826660156, "incorrect_loss_uncond": -11.444161097208658}, "model_output": [{"sum_logits": -28.211925506591797, "num_tokens": 11, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -41.41246795654297, "logits_per_token": -2.5647205005992544, "logits_per_char": -0.4550310565579322, "num_chars": 62}, {"sum_logits": -39.55384063720703, "num_tokens": 12, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -51.34672546386719, "logits_per_token": -3.2961533864339194, "logits_per_char": -0.6704040785967293, "num_chars": 59}, {"sum_logits": -31.98335075378418, "num_tokens": 11, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -43.268070220947266, "logits_per_token": -2.9075773412531074, "logits_per_char": -0.5243172254718718, "num_chars": 61}, {"sum_logits": -35.55821990966797, "num_tokens": 11, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -45.40544128417969, "logits_per_token": -3.2325654463334517, "logits_per_char": -0.623828419467859, "num_chars": 57}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 125, "native_id": "Mercury_7107170", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 8.82841968536377, "incorrect_loss_raw": 17.45380465189616, "correct_loss_per_char": 0.25224056243896487, "incorrect_loss_per_char": 0.6000114902101382, "correct_loss_per_token": 1.2612028121948242, "incorrect_loss_per_token": 3.268649790022108, "correct_loss_uncond": -20.857871055603027, "incorrect_loss_uncond": -15.250250498453775}, "model_output": [{"sum_logits": -17.0119686126709, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -29.276884078979492, "logits_per_token": -4.252992153167725, "logits_per_char": -0.7088320255279541, "num_chars": 24}, {"sum_logits": -21.127531051635742, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -36.72533416748047, "logits_per_token": -3.5212551752726235, "logits_per_char": -0.6602353453636169, "num_chars": 32}, {"sum_logits": -14.221914291381836, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -32.109947204589844, "logits_per_token": -2.0317020416259766, "logits_per_char": -0.4309670997388435, "num_chars": 33}, {"sum_logits": -8.82841968536377, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -29.686290740966797, "logits_per_token": -1.2612028121948242, "logits_per_char": -0.25224056243896487, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 126, "native_id": "Mercury_183820", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.159750461578369, "incorrect_loss_raw": 2.302737553914388, "correct_loss_per_char": 0.6319500923156738, "incorrect_loss_per_char": 0.27564679251776797, "correct_loss_per_token": 3.159750461578369, "incorrect_loss_per_token": 2.302737553914388, "correct_loss_uncond": -9.147002696990967, "incorrect_loss_uncond": -9.676117579142252}, "model_output": [{"sum_logits": -3.159750461578369, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -12.306753158569336, "logits_per_token": -3.159750461578369, "logits_per_char": -0.6319500923156738, "num_chars": 5}, {"sum_logits": -1.869877576828003, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -11.02944564819336, "logits_per_token": -1.869877576828003, "logits_per_char": -0.26712536811828613, "num_chars": 7}, {"sum_logits": -3.019597053527832, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -13.184402465820312, "logits_per_token": -3.019597053527832, "logits_per_char": -0.3355107837253147, "num_chars": 9}, {"sum_logits": -2.018738031387329, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -11.72271728515625, "logits_per_token": -2.018738031387329, "logits_per_char": -0.22430422570970324, "num_chars": 9}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 127, "native_id": "Mercury_SC_401357", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 1.037150502204895, "incorrect_loss_raw": 2.180341442426046, "correct_loss_per_char": 0.08642920851707458, "incorrect_loss_per_char": 0.1596412486209101, "correct_loss_per_token": 1.037150502204895, "incorrect_loss_per_token": 1.7581700086593628, "correct_loss_uncond": -13.220250010490417, "incorrect_loss_uncond": -12.990280747413635}, "model_output": [{"sum_logits": -2.130805492401123, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -15.821380615234375, "logits_per_token": -2.130805492401123, "logits_per_char": -0.16390811480008638, "num_chars": 13}, {"sum_logits": -1.037150502204895, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": true, "sum_logits_uncond": -14.257400512695312, "logits_per_token": -1.037150502204895, "logits_per_char": -0.08642920851707458, "num_chars": 12}, {"sum_logits": -1.8771902322769165, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -14.193122863769531, "logits_per_token": -1.8771902322769165, "logits_per_char": -0.13408501659120833, "num_chars": 14}, {"sum_logits": -2.5330286026000977, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -15.497363090515137, "logits_per_token": -1.2665143013000488, "logits_per_char": -0.18093061447143555, "num_chars": 14}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 128, "native_id": "NYSEDREGENTS_2008_8_11", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 5.518895149230957, "incorrect_loss_raw": 5.328778028488159, "correct_loss_per_char": 0.3679263432820638, "incorrect_loss_per_char": 0.6081614289964948, "correct_loss_per_token": 2.7594475746154785, "incorrect_loss_per_token": 4.716072916984558, "correct_loss_uncond": -15.671637535095215, "incorrect_loss_uncond": -7.55660621325175}, "model_output": [{"sum_logits": -5.644294738769531, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -14.025976181030273, "logits_per_token": -5.644294738769531, "logits_per_char": -0.6271438598632812, "num_chars": 9}, {"sum_logits": -6.66580867767334, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -12.376590728759766, "logits_per_token": -6.66580867767334, "logits_per_char": -0.9522583825247628, "num_chars": 7}, {"sum_logits": -3.6762306690216064, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -12.253585815429688, "logits_per_token": -1.8381153345108032, "logits_per_char": -0.24508204460144042, "num_chars": 15}, {"sum_logits": -5.518895149230957, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -21.190532684326172, "logits_per_token": -2.7594475746154785, "logits_per_char": -0.3679263432820638, "num_chars": 15}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 129, "native_id": "Mercury_416650", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.510123252868652, "incorrect_loss_raw": 10.27734931310018, "correct_loss_per_char": 0.4811585866487943, "incorrect_loss_per_char": 0.4868728894796985, "correct_loss_per_token": 2.5020246505737305, "incorrect_loss_per_token": 3.0295161406199136, "correct_loss_uncond": -15.523091316223145, "incorrect_loss_uncond": -14.599332968393961}, "model_output": [{"sum_logits": -6.182159900665283, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -21.078065872192383, "logits_per_token": -2.0607199668884277, "logits_per_char": -0.343453327814738, "num_chars": 18}, {"sum_logits": -10.38427734375, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -23.600831985473633, "logits_per_token": -3.46142578125, "logits_per_char": -0.5465409128289473, "num_chars": 19}, {"sum_logits": -12.510123252868652, "num_tokens": 5, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -28.033214569091797, "logits_per_token": -2.5020246505737305, "logits_per_char": -0.4811585866487943, "num_chars": 26}, {"sum_logits": -14.265610694885254, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -29.951148986816406, "logits_per_token": -3.5664026737213135, "logits_per_char": -0.5706244277954101, "num_chars": 25}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 130, "native_id": "NCEOGA_2013_5_20", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.053281784057617, "incorrect_loss_raw": 6.581985155741374, "correct_loss_per_char": 1.0957528894597834, "incorrect_loss_per_char": 0.7580248488320245, "correct_loss_per_token": 6.026640892028809, "incorrect_loss_per_token": 2.8886868953704834, "correct_loss_uncond": -6.99669075012207, "incorrect_loss_uncond": -10.215126355489096}, "model_output": [{"sum_logits": -7.241502285003662, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.300458908081055, "logits_per_token": -2.4138340950012207, "logits_per_char": -0.8046113650004069, "num_chars": 9}, {"sum_logits": -12.053281784057617, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -19.049972534179688, "logits_per_token": -6.026640892028809, "logits_per_char": -1.0957528894597834, "num_chars": 11}, {"sum_logits": -3.7437386512756348, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -16.7568359375, "logits_per_token": -1.8718693256378174, "logits_per_char": -0.3743738651275635, "num_chars": 10}, {"sum_logits": -8.760714530944824, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -16.33403968811035, "logits_per_token": -4.380357265472412, "logits_per_char": -1.095089316368103, "num_chars": 8}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 131, "native_id": "Mercury_400500", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.112171173095703, "incorrect_loss_raw": 18.630598068237305, "correct_loss_per_char": 0.7244868469238281, "incorrect_loss_per_char": 0.8168377398661596, "correct_loss_per_token": 3.6224342346191407, "incorrect_loss_per_token": 3.726119613647461, "correct_loss_uncond": -7.378746032714844, "incorrect_loss_uncond": -6.484890619913737}, "model_output": [{"sum_logits": -19.963560104370117, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -24.921342849731445, "logits_per_token": -3.9927120208740234, "logits_per_char": -0.9506457192557198, "num_chars": 21}, {"sum_logits": -19.865798950195312, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -27.31316375732422, "logits_per_token": -3.9731597900390625, "logits_per_char": -0.9459904261997768, "num_chars": 21}, {"sum_logits": -18.112171173095703, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -25.490917205810547, "logits_per_token": -3.6224342346191407, "logits_per_char": -0.7244868469238281, "num_chars": 25}, {"sum_logits": -16.062435150146484, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -23.11195945739746, "logits_per_token": -3.2124870300292967, "logits_per_char": -0.5538770741429823, "num_chars": 29}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 132, "native_id": "Mercury_SC_401366", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 14.741025924682617, "incorrect_loss_raw": 16.8565731048584, "correct_loss_per_char": 0.39840610607250315, "incorrect_loss_per_char": 0.76582055078157, "correct_loss_per_token": 2.4568376541137695, "incorrect_loss_per_token": 2.9671263588799373, "correct_loss_uncond": -13.70484733581543, "incorrect_loss_uncond": -12.299125035603842}, "model_output": [{"sum_logits": -14.19277572631836, "num_tokens": 5, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -27.349971771240234, "logits_per_token": -2.8385551452636717, "logits_per_char": -0.8348691603716683, "num_chars": 17}, {"sum_logits": -17.90005874633789, "num_tokens": 6, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -29.379638671875, "logits_per_token": -2.9833431243896484, "logits_per_char": -0.7782634237538213, "num_chars": 23}, {"sum_logits": -18.476884841918945, "num_tokens": 6, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -30.737483978271484, "logits_per_token": -3.0794808069864907, "logits_per_char": -0.6843290682192202, "num_chars": 27}, {"sum_logits": -14.741025924682617, "num_tokens": 6, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -28.445873260498047, "logits_per_token": -2.4568376541137695, "logits_per_char": -0.39840610607250315, "num_chars": 37}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 133, "native_id": "Mercury_7141610", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.357433795928955, "incorrect_loss_raw": 9.87848695119222, "correct_loss_per_char": 0.47963339941842215, "incorrect_loss_per_char": 0.87369458326567, "correct_loss_per_token": 3.357433795928955, "incorrect_loss_per_token": 5.697073300679524, "correct_loss_uncond": -9.651267528533936, "incorrect_loss_uncond": -6.295892397562663}, "model_output": [{"sum_logits": -4.546978950500488, "num_tokens": 1, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -12.611124038696289, "logits_per_token": -4.546978950500488, "logits_per_char": -0.9093957901000976, "num_chars": 5}, {"sum_logits": -3.357433795928955, "num_tokens": 1, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -13.00870132446289, "logits_per_token": -3.357433795928955, "logits_per_char": -0.47963339941842215, "num_chars": 7}, {"sum_logits": -13.033193588256836, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -20.179183959960938, "logits_per_token": -6.516596794128418, "logits_per_char": -1.0025533529428334, "num_chars": 13}, {"sum_logits": -12.055288314819336, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -15.732830047607422, "logits_per_token": -6.027644157409668, "logits_per_char": -0.7091346067540786, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 134, "native_id": "Mercury_7247013", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 30.90003204345703, "incorrect_loss_raw": 30.21015230814616, "correct_loss_per_char": 0.6574474902863198, "incorrect_loss_per_char": 0.5769043357283982, "correct_loss_per_token": 2.8090938221324575, "incorrect_loss_per_token": 3.0112088969630055, "correct_loss_uncond": -12.0955810546875, "incorrect_loss_uncond": -10.549386978149414}, "model_output": [{"sum_logits": -22.64834976196289, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -31.946910858154297, "logits_per_token": -2.8310437202453613, "logits_per_char": -0.5662087440490723, "num_chars": 40}, {"sum_logits": -30.90003204345703, "num_tokens": 11, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -42.99561309814453, "logits_per_token": -2.8090938221324575, "logits_per_char": -0.6574474902863198, "num_chars": 47}, {"sum_logits": -28.465810775756836, "num_tokens": 9, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -38.72679138183594, "logits_per_token": -3.162867863972982, "logits_per_char": -0.527144643995497, "num_chars": 54}, {"sum_logits": -39.51629638671875, "num_tokens": 13, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -51.604915618896484, "logits_per_token": -3.039715106670673, "logits_per_char": -0.637359619140625, "num_chars": 62}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 135, "native_id": "NYSEDREGENTS_2008_8_30", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.739696025848389, "incorrect_loss_raw": 16.53360652923584, "correct_loss_per_char": 0.17936550080776215, "incorrect_loss_per_char": 0.575503257128385, "correct_loss_per_token": 0.9566160043080648, "incorrect_loss_per_token": 3.4009331385294597, "correct_loss_uncond": -27.74229860305786, "incorrect_loss_uncond": -11.305551846822103}, "model_output": [{"sum_logits": -16.894317626953125, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -33.59734344482422, "logits_per_token": -2.8157196044921875, "logits_per_char": -0.5119490189985796, "num_chars": 33}, {"sum_logits": -5.739696025848389, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -33.48199462890625, "logits_per_token": -0.9566160043080648, "logits_per_char": -0.17936550080776215, "num_chars": 32}, {"sum_logits": -15.790913581848145, "num_tokens": 5, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -24.46593475341797, "logits_per_token": -3.158182716369629, "logits_per_char": -0.5639611993517194, "num_chars": 28}, {"sum_logits": -16.91558837890625, "num_tokens": 4, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -25.45419692993164, "logits_per_token": -4.2288970947265625, "logits_per_char": -0.6505995530348557, "num_chars": 26}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 136, "native_id": "ACTAAP_2011_5_16", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.272119522094727, "incorrect_loss_raw": 18.21969223022461, "correct_loss_per_char": 0.46967164675394696, "incorrect_loss_per_char": 0.7390777562374256, "correct_loss_per_token": 2.254423904418945, "incorrect_loss_per_token": 4.268351586659749, "correct_loss_uncond": -14.178140640258789, "incorrect_loss_uncond": -8.35095469156901}, "model_output": [{"sum_logits": -22.915576934814453, "num_tokens": 4, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -26.649738311767578, "logits_per_token": -5.728894233703613, "logits_per_char": -0.9963294319484545, "num_chars": 23}, {"sum_logits": -11.272119522094727, "num_tokens": 5, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -25.450260162353516, "logits_per_token": -2.254423904418945, "logits_per_char": -0.46967164675394696, "num_chars": 24}, {"sum_logits": -17.19428825378418, "num_tokens": 5, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -26.843595504760742, "logits_per_token": -3.438857650756836, "logits_per_char": -0.6613187789916992, "num_chars": 26}, {"sum_logits": -14.549211502075195, "num_tokens": 4, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -26.21860694885254, "logits_per_token": -3.637302875518799, "logits_per_char": -0.5595850577721229, "num_chars": 26}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 137, "native_id": "Mercury_7093153", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 20.19031524658203, "incorrect_loss_raw": 16.256805419921875, "correct_loss_per_char": 0.5047578811645508, "incorrect_loss_per_char": 0.42023131889209414, "correct_loss_per_token": 2.019031524658203, "incorrect_loss_per_token": 1.5381442400742864, "correct_loss_uncond": -22.335277557373047, "incorrect_loss_uncond": -16.53619639078776}, "model_output": [{"sum_logits": -15.42789077758789, "num_tokens": 9, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -30.794086456298828, "logits_per_token": -1.7142100863986545, "logits_per_char": -0.4059971257259971, "num_chars": 38}, {"sum_logits": -20.19031524658203, "num_tokens": 10, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -42.52559280395508, "logits_per_token": -2.019031524658203, "logits_per_char": -0.5047578811645508, "num_chars": 40}, {"sum_logits": -16.061607360839844, "num_tokens": 11, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -33.33020782470703, "logits_per_token": -1.4601461237127131, "logits_per_char": -0.42267387791683797, "num_chars": 38}, {"sum_logits": -17.28091812133789, "num_tokens": 12, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -34.25471115112305, "logits_per_token": -1.440076510111491, "logits_per_char": -0.4320229530334473, "num_chars": 40}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 138, "native_id": "Mercury_7013965", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.8253173828125, "incorrect_loss_raw": 20.817276000976562, "correct_loss_per_char": 0.4395921495225694, "incorrect_loss_per_char": 0.4814754287683654, "correct_loss_per_token": 2.6375528971354165, "incorrect_loss_per_token": 2.7522155216762, "correct_loss_uncond": -18.413921356201172, "incorrect_loss_uncond": -13.508382161458334}, "model_output": [{"sum_logits": -15.8253173828125, "num_tokens": 6, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -34.23923873901367, "logits_per_token": -2.6375528971354165, "logits_per_char": -0.4395921495225694, "num_chars": 36}, {"sum_logits": -20.454730987548828, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -32.99524688720703, "logits_per_token": -2.5568413734436035, "logits_per_char": -0.4870174044654483, "num_chars": 42}, {"sum_logits": -25.20941162109375, "num_tokens": 7, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -33.24287414550781, "logits_per_token": -3.601344517299107, "logits_per_char": -0.6002240862165179, "num_chars": 42}, {"sum_logits": -16.78768539428711, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -36.738853454589844, "logits_per_token": -2.0984606742858887, "logits_per_char": -0.35718479562313, "num_chars": 47}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 139, "native_id": "Mercury_7034843", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.046280860900879, "incorrect_loss_raw": 6.2113931973775225, "correct_loss_per_char": 0.7528925538063049, "incorrect_loss_per_char": 0.4512032137976752, "correct_loss_per_token": 4.015426953633626, "incorrect_loss_per_token": 3.878276507059733, "correct_loss_uncond": -5.568085670471191, "incorrect_loss_uncond": -9.388888200124105}, "model_output": [{"sum_logits": -7.172591686248779, "num_tokens": 3, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -18.691387176513672, "logits_per_token": -2.3908638954162598, "logits_per_char": -0.39847731590270996, "num_chars": 18}, {"sum_logits": -12.046280860900879, "num_tokens": 3, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -17.61436653137207, "logits_per_token": -4.015426953633626, "logits_per_char": -0.7528925538063049, "num_chars": 16}, {"sum_logits": -4.435244560241699, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -15.568906784057617, "logits_per_token": -2.2176222801208496, "logits_per_char": -0.3696037133534749, "num_chars": 12}, {"sum_logits": -7.02634334564209, "num_tokens": 1, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -12.540550231933594, "logits_per_token": -7.02634334564209, "logits_per_char": -0.5855286121368408, "num_chars": 12}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 140, "native_id": "Mercury_SC_407610", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.411369323730469, "incorrect_loss_raw": 18.879069646199543, "correct_loss_per_char": 0.35025839372114703, "incorrect_loss_per_char": 0.5226243590107384, "correct_loss_per_token": 1.5411369323730468, "incorrect_loss_per_token": 2.344740147313113, "correct_loss_uncond": -22.516937255859375, "incorrect_loss_uncond": -15.03598149617513}, "model_output": [{"sum_logits": -15.690982818603516, "num_tokens": 7, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -34.06559753417969, "logits_per_token": -2.2415689740862166, "logits_per_char": -0.5061607360839844, "num_chars": 31}, {"sum_logits": -17.501096725463867, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -31.96549415588379, "logits_per_token": -2.1876370906829834, "logits_per_char": -0.46055517698589127, "num_chars": 38}, {"sum_logits": -23.44512939453125, "num_tokens": 9, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -35.71406173706055, "logits_per_token": -2.605014377170139, "logits_per_char": -0.6011571639623398, "num_chars": 39}, {"sum_logits": -15.411369323730469, "num_tokens": 10, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -37.928306579589844, "logits_per_token": -1.5411369323730468, "logits_per_char": -0.35025839372114703, "num_chars": 44}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 141, "native_id": "Mercury_405947", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.852787017822266, "incorrect_loss_raw": 20.469378153483074, "correct_loss_per_char": 0.3743996491303315, "incorrect_loss_per_char": 0.5521480454338922, "correct_loss_per_token": 1.9789695739746094, "incorrect_loss_per_token": 2.9203891337863985, "correct_loss_uncond": -18.87584686279297, "incorrect_loss_uncond": -12.500540415445963}, "model_output": [{"sum_logits": -19.13970184326172, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -28.903385162353516, "logits_per_token": -3.1899503072102866, "logits_per_char": -0.6379900614420573, "num_chars": 30}, {"sum_logits": -16.109128952026367, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -32.76145935058594, "logits_per_token": -2.301304136003767, "logits_per_char": -0.42392444610595703, "num_chars": 38}, {"sum_logits": -13.852787017822266, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -32.728633880615234, "logits_per_token": -1.9789695739746094, "logits_per_char": -0.3743996491303315, "num_chars": 37}, {"sum_logits": -26.159303665161133, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -37.244911193847656, "logits_per_token": -3.2699129581451416, "logits_per_char": -0.5945296287536621, "num_chars": 44}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 142, "native_id": "AKDE&ED_2012_8_6", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 8.308034896850586, "incorrect_loss_raw": 12.747750600179037, "correct_loss_per_char": 0.34616812070210773, "incorrect_loss_per_char": 0.4838050641710796, "correct_loss_per_token": 2.0770087242126465, "incorrect_loss_per_token": 2.6617921352386475, "correct_loss_uncond": -18.174823760986328, "incorrect_loss_uncond": -14.003392537434896}, "model_output": [{"sum_logits": -8.308034896850586, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -26.482858657836914, "logits_per_token": -2.0770087242126465, "logits_per_char": -0.34616812070210773, "num_chars": 24}, {"sum_logits": -14.378241539001465, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -31.024578094482422, "logits_per_token": -2.875648307800293, "logits_per_char": -0.5990933974583944, "num_chars": 24}, {"sum_logits": -13.586716651916504, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -28.196914672851562, "logits_per_token": -3.396679162979126, "logits_per_char": -0.4852398804255894, "num_chars": 28}, {"sum_logits": -10.27829360961914, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -21.031936645507812, "logits_per_token": -1.7130489349365234, "logits_per_char": -0.36708191462925505, "num_chars": 28}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 143, "native_id": "Mercury_7011130", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.58361053466797, "incorrect_loss_raw": 22.11931610107422, "correct_loss_per_char": 0.5895902633666992, "incorrect_loss_per_char": 0.5624000378144093, "correct_loss_per_token": 2.947951316833496, "incorrect_loss_per_token": 2.7649145126342773, "correct_loss_uncond": -7.762481689453125, "incorrect_loss_uncond": -9.353625615437826}, "model_output": [{"sum_logits": -23.58361053466797, "num_tokens": 8, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -31.346092224121094, "logits_per_token": -2.947951316833496, "logits_per_char": -0.5895902633666992, "num_chars": 40}, {"sum_logits": -22.285755157470703, "num_tokens": 8, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -30.692855834960938, "logits_per_token": -2.785719394683838, "logits_per_char": -0.5571438789367675, "num_chars": 40}, {"sum_logits": -23.208988189697266, "num_tokens": 8, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -32.70244598388672, "logits_per_token": -2.901123523712158, "logits_per_char": -0.5951022612742889, "num_chars": 39}, {"sum_logits": -20.863204956054688, "num_tokens": 8, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -31.023523330688477, "logits_per_token": -2.607900619506836, "logits_per_char": -0.5349539732321714, "num_chars": 39}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 144, "native_id": "Mercury_LBS11022", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 8.748884201049805, "incorrect_loss_raw": 14.119511604309082, "correct_loss_per_char": 0.416613533383324, "incorrect_loss_per_char": 1.0890149146791488, "correct_loss_per_token": 2.916294733683268, "incorrect_loss_per_token": 5.464426676432292, "correct_loss_uncond": -13.27003288269043, "incorrect_loss_uncond": -6.879879951477051}, "model_output": [{"sum_logits": -8.748884201049805, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -22.018917083740234, "logits_per_token": -2.916294733683268, "logits_per_char": -0.416613533383324, "num_chars": 21}, {"sum_logits": -6.141283988952637, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.569408416748047, "logits_per_token": -3.0706419944763184, "logits_per_char": -0.43866314206804546, "num_chars": 14}, {"sum_logits": -19.143949508666992, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -23.627168655395508, "logits_per_token": -4.785987377166748, "logits_per_char": -1.2762633005777995, "num_chars": 15}, {"sum_logits": -17.073301315307617, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -21.801597595214844, "logits_per_token": -8.536650657653809, "logits_per_char": -1.5521183013916016, "num_chars": 11}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 145, "native_id": "TIMSS_1995_8_J1", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 61.65026092529297, "incorrect_loss_raw": 51.495592753092446, "correct_loss_per_char": 0.5927909704355093, "incorrect_loss_per_char": 0.5617435659527832, "correct_loss_per_token": 3.4250144958496094, "incorrect_loss_per_token": 3.000704377616932, "correct_loss_uncond": -11.57611083984375, "incorrect_loss_uncond": -14.092681884765625}, "model_output": [{"sum_logits": -50.128570556640625, "num_tokens": 20, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -69.6241455078125, "logits_per_token": -2.506428527832031, "logits_per_char": -0.43972430312842653, "num_chars": 114}, {"sum_logits": -45.51322937011719, "num_tokens": 15, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -61.30805206298828, "logits_per_token": -3.034215291341146, "logits_per_char": -0.5988582811857525, "num_chars": 76}, {"sum_logits": -61.65026092529297, "num_tokens": 18, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -73.22637176513672, "logits_per_token": -3.4250144958496094, "logits_per_char": -0.5927909704355093, "num_chars": 104}, {"sum_logits": -58.84497833251953, "num_tokens": 17, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -65.83262634277344, "logits_per_token": -3.4614693136776196, "logits_per_char": -0.6466481135441706, "num_chars": 91}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 146, "native_id": "Mercury_SC_408366", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.193111419677734, "incorrect_loss_raw": 5.816364049911499, "correct_loss_per_char": 0.9193111419677734, "incorrect_loss_per_char": 0.6713396809317849, "correct_loss_per_token": 9.193111419677734, "incorrect_loss_per_token": 3.5616705417633057, "correct_loss_uncond": -5.259525299072266, "incorrect_loss_uncond": -9.203137318293253}, "model_output": [{"sum_logits": -3.920931100845337, "num_tokens": 1, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -12.802438735961914, "logits_per_token": -3.920931100845337, "logits_per_char": -0.7841862201690674, "num_chars": 5}, {"sum_logits": -9.193111419677734, "num_tokens": 1, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -14.45263671875, "logits_per_token": -9.193111419677734, "logits_per_char": -0.9193111419677734, "num_chars": 10}, {"sum_logits": -4.487418174743652, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -15.462947845458984, "logits_per_token": -2.243709087371826, "logits_per_char": -0.4079471067948775, "num_chars": 11}, {"sum_logits": -9.040742874145508, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -16.79311752319336, "logits_per_token": -4.520371437072754, "logits_per_char": -0.8218857158314098, "num_chars": 11}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 147, "native_id": "Mercury_7009993", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 12.561150550842285, "incorrect_loss_raw": 17.804642995198567, "correct_loss_per_char": 0.40519840486588016, "incorrect_loss_per_char": 0.5408896951699361, "correct_loss_per_token": 2.0935250918070474, "incorrect_loss_per_token": 2.5381724796597918, "correct_loss_uncond": -18.87836742401123, "incorrect_loss_uncond": -18.004619598388672}, "model_output": [{"sum_logits": -12.561150550842285, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -31.439517974853516, "logits_per_token": -2.0935250918070474, "logits_per_char": -0.40519840486588016, "num_chars": 31}, {"sum_logits": -23.57715606689453, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -38.974395751953125, "logits_per_token": -2.9471445083618164, "logits_per_char": -0.6934457666733685, "num_chars": 34}, {"sum_logits": -17.00902557373047, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -33.07190704345703, "logits_per_token": -2.8348375956217446, "logits_per_char": -0.5154250173857717, "num_chars": 33}, {"sum_logits": -12.827747344970703, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -35.38148498535156, "logits_per_token": -1.8325353349958147, "logits_per_char": -0.4137983014506678, "num_chars": 31}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 148, "native_id": "Mercury_401699", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.958364725112915, "incorrect_loss_raw": 2.83713428179423, "correct_loss_per_char": 0.9791823625564575, "incorrect_loss_per_char": 1.418567140897115, "correct_loss_per_token": 1.958364725112915, "incorrect_loss_per_token": 2.83713428179423, "correct_loss_uncond": -4.463186979293823, "incorrect_loss_uncond": -3.2481656869252524}, "model_output": [{"sum_logits": -3.3319644927978516, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -5.551112174987793, "logits_per_token": -3.3319644927978516, "logits_per_char": -1.6659822463989258, "num_chars": 2}, {"sum_logits": -2.553847551345825, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -6.237893581390381, "logits_per_token": -2.553847551345825, "logits_per_char": -1.2769237756729126, "num_chars": 2}, {"sum_logits": -1.958364725112915, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -6.421551704406738, "logits_per_token": -1.958364725112915, "logits_per_char": -0.9791823625564575, "num_chars": 2}, {"sum_logits": -2.6255908012390137, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -6.466894149780273, "logits_per_token": -2.6255908012390137, "logits_per_char": -1.3127954006195068, "num_chars": 2}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 149, "native_id": "Mercury_7056858", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.5075128078460693, "incorrect_loss_raw": 4.27146299680074, "correct_loss_per_char": 0.4179188013076782, "incorrect_loss_per_char": 0.6799835602442424, "correct_loss_per_token": 2.5075128078460693, "incorrect_loss_per_token": 4.27146299680074, "correct_loss_uncond": -9.952556848526001, "incorrect_loss_uncond": -8.70481276512146}, "model_output": [{"sum_logits": -2.5075128078460693, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -12.46006965637207, "logits_per_token": -2.5075128078460693, "logits_per_char": -0.4179188013076782, "num_chars": 6}, {"sum_logits": -4.362675189971924, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -13.088691711425781, "logits_per_token": -4.362675189971924, "logits_per_char": -0.7271125316619873, "num_chars": 6}, {"sum_logits": -3.4183189868927, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -12.747220993041992, "logits_per_token": -3.4183189868927, "logits_per_char": -0.68366379737854, "num_chars": 5}, {"sum_logits": -5.033394813537598, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -13.092914581298828, "logits_per_token": -5.033394813537598, "logits_per_char": -0.6291743516921997, "num_chars": 8}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 150, "native_id": "Mercury_7027160", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.586128234863281, "incorrect_loss_raw": 4.1233930587768555, "correct_loss_per_char": 0.5724085489908854, "incorrect_loss_per_char": 0.36866029079514323, "correct_loss_per_token": 4.293064117431641, "incorrect_loss_per_token": 2.0616965293884277, "correct_loss_uncond": -6.316595077514648, "incorrect_loss_uncond": -11.530314127604166}, "model_output": [{"sum_logits": -5.168703079223633, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -15.233770370483398, "logits_per_token": -2.5843515396118164, "logits_per_char": -0.4698820981112393, "num_chars": 11}, {"sum_logits": -5.0429301261901855, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -15.62195873260498, "logits_per_token": -2.5214650630950928, "logits_per_char": -0.42024417718251544, "num_chars": 12}, {"sum_logits": -8.586128234863281, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -14.90272331237793, "logits_per_token": -4.293064117431641, "logits_per_char": -0.5724085489908854, "num_chars": 15}, {"sum_logits": -2.158545970916748, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -16.105392456054688, "logits_per_token": -1.079272985458374, "logits_per_char": -0.21585459709167482, "num_chars": 10}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 151, "native_id": "Mercury_400811", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 2.7964682579040527, "incorrect_loss_raw": 3.9478302001953125, "correct_loss_per_char": 0.1747792661190033, "incorrect_loss_per_char": 0.43476705197934756, "correct_loss_per_token": 2.7964682579040527, "incorrect_loss_per_token": 3.9478302001953125, "correct_loss_uncond": -10.18227243423462, "incorrect_loss_uncond": -8.449557622273764}, "model_output": [{"sum_logits": -1.1177492141723633, "num_tokens": 1, "num_tokens_all": 188, "is_greedy": true, "sum_logits_uncond": -11.723394393920898, "logits_per_token": -1.1177492141723633, "logits_per_char": -0.22354984283447266, "num_chars": 5}, {"sum_logits": -5.231353759765625, "num_tokens": 1, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -12.414356231689453, "logits_per_token": -5.231353759765625, "logits_per_char": -0.5812615288628472, "num_chars": 9}, {"sum_logits": -5.494387626647949, "num_tokens": 1, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -13.054412841796875, "logits_per_token": -5.494387626647949, "logits_per_char": -0.49948978424072266, "num_chars": 11}, {"sum_logits": -2.7964682579040527, "num_tokens": 1, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -12.978740692138672, "logits_per_token": -2.7964682579040527, "logits_per_char": -0.1747792661190033, "num_chars": 16}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 152, "native_id": "Mercury_SC_400062", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.691579818725586, "incorrect_loss_raw": 15.238381385803223, "correct_loss_per_char": 0.8727530699509841, "incorrect_loss_per_char": 0.6957998938030666, "correct_loss_per_token": 3.241654259817941, "incorrect_loss_per_token": 2.8887009514702693, "correct_loss_uncond": -15.120000839233398, "incorrect_loss_uncond": -15.954941113789877}, "model_output": [{"sum_logits": -14.307779312133789, "num_tokens": 6, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -29.636363983154297, "logits_per_token": -2.3846298853556314, "logits_per_char": -0.6813228243873233, "num_chars": 21}, {"sum_logits": -16.36935806274414, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -30.5615234375, "logits_per_token": -3.2738716125488283, "logits_per_char": -0.7794932410830543, "num_chars": 21}, {"sum_logits": -15.038006782531738, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -33.382080078125, "logits_per_token": -3.0076013565063477, "logits_per_char": -0.6265836159388224, "num_chars": 24}, {"sum_logits": -22.691579818725586, "num_tokens": 7, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -37.811580657958984, "logits_per_token": -3.241654259817941, "logits_per_char": -0.8727530699509841, "num_chars": 26}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 153, "native_id": "Mercury_400699", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.594998359680176, "incorrect_loss_raw": 12.594154357910156, "correct_loss_per_char": 0.3485897015302609, "incorrect_loss_per_char": 0.32958487948121507, "correct_loss_per_token": 2.265833059946696, "incorrect_loss_per_token": 2.0990257263183594, "correct_loss_uncond": -16.997275352478027, "incorrect_loss_uncond": -17.692166646321613}, "model_output": [{"sum_logits": -13.619598388671875, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -30.69075584411621, "logits_per_token": -2.269933064778646, "logits_per_char": -0.37832217746310765, "num_chars": 36}, {"sum_logits": -13.594998359680176, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -30.592273712158203, "logits_per_token": -2.265833059946696, "logits_per_char": -0.3485897015302609, "num_chars": 39}, {"sum_logits": -9.922916412353516, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -29.36859130859375, "logits_per_token": -1.6538194020589192, "logits_per_char": -0.25443375416291064, "num_chars": 39}, {"sum_logits": -14.239948272705078, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -30.79961585998535, "logits_per_token": -2.373324712117513, "logits_per_char": -0.35599870681762696, "num_chars": 40}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 154, "native_id": "Mercury_7029803", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 12.744630813598633, "incorrect_loss_raw": 14.825302124023438, "correct_loss_per_char": 0.24046473233204968, "incorrect_loss_per_char": 0.2999935001625776, "correct_loss_per_token": 1.1586028012362393, "incorrect_loss_per_token": 1.3881070050326259, "correct_loss_uncond": -28.25053596496582, "incorrect_loss_uncond": -23.715655008951824}, "model_output": [{"sum_logits": -13.316247940063477, "num_tokens": 10, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -37.4149284362793, "logits_per_token": -1.3316247940063477, "logits_per_char": -0.3170535223824637, "num_chars": 42}, {"sum_logits": -12.744630813598633, "num_tokens": 11, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -40.99516677856445, "logits_per_token": -1.1586028012362393, "logits_per_char": -0.24046473233204968, "num_chars": 53}, {"sum_logits": -16.87511444091797, "num_tokens": 11, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -38.265323638916016, "logits_per_token": -1.5341013128107244, "logits_per_char": -0.3183983856776975, "num_chars": 53}, {"sum_logits": -14.284543991088867, "num_tokens": 11, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -39.94261932373047, "logits_per_token": -1.2985949082808061, "logits_per_char": -0.2645285924275716, "num_chars": 54}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 155, "native_id": "Mercury_SC_401372", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.683874130249023, "incorrect_loss_raw": 13.291210492451986, "correct_loss_per_char": 0.9814043606028837, "incorrect_loss_per_char": 0.7181114867881492, "correct_loss_per_token": 5.561291376749675, "incorrect_loss_per_token": 3.3228026231129966, "correct_loss_uncond": -4.82347297668457, "incorrect_loss_uncond": -13.840669949849447}, "model_output": [{"sum_logits": -16.21782112121582, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -32.57837677001953, "logits_per_token": -4.054455280303955, "logits_per_char": -0.9009900622897677, "num_chars": 18}, {"sum_logits": -10.956111907958984, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -25.67959213256836, "logits_per_token": -2.739027976989746, "logits_per_char": -0.5478055953979493, "num_chars": 20}, {"sum_logits": -12.699698448181152, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -23.137672424316406, "logits_per_token": -3.174924612045288, "logits_per_char": -0.7055388026767306, "num_chars": 18}, {"sum_logits": -16.683874130249023, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -21.507347106933594, "logits_per_token": -5.561291376749675, "logits_per_char": -0.9814043606028837, "num_chars": 17}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 156, "native_id": "Mercury_7271128", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 1.5058035850524902, "incorrect_loss_raw": 1.7523259123166401, "correct_loss_per_char": 0.18822544813156128, "incorrect_loss_per_char": 0.22890269720838186, "correct_loss_per_token": 0.7529017925262451, "incorrect_loss_per_token": 0.7841180132495033, "correct_loss_uncond": -14.798136234283447, "incorrect_loss_uncond": -14.37451277176539}, "model_output": [{"sum_logits": -1.6568089723587036, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -16.524904251098633, "logits_per_token": -0.5522696574529012, "logits_per_char": -0.23668699605124338, "num_chars": 7}, {"sum_logits": -0.8632369637489319, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": true, "sum_logits_uncond": -15.060075759887695, "logits_per_token": -0.43161848187446594, "logits_per_char": -0.10790462046861649, "num_chars": 8}, {"sum_logits": -1.5058035850524902, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -16.303939819335938, "logits_per_token": -0.7529017925262451, "logits_per_char": -0.18822544813156128, "num_chars": 8}, {"sum_logits": -2.736931800842285, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -16.795536041259766, "logits_per_token": -1.3684659004211426, "logits_per_char": -0.34211647510528564, "num_chars": 8}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 157, "native_id": "Mercury_407260", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.31987953186035, "incorrect_loss_raw": 15.472041130065918, "correct_loss_per_char": 0.8883283138275146, "incorrect_loss_per_char": 0.5230882553290271, "correct_loss_per_token": 3.0456970759800504, "incorrect_loss_per_token": 2.4970523993174236, "correct_loss_uncond": -10.139867782592773, "incorrect_loss_uncond": -11.177900632222494}, "model_output": [{"sum_logits": -16.54108238220215, "num_tokens": 5, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -24.726551055908203, "logits_per_token": -3.3082164764404296, "logits_per_char": -0.6361954762385442, "num_chars": 26}, {"sum_logits": -21.31987953186035, "num_tokens": 7, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -31.459747314453125, "logits_per_token": -3.0456970759800504, "logits_per_char": -0.8883283138275146, "num_chars": 24}, {"sum_logits": -19.109586715698242, "num_tokens": 8, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -30.96794319152832, "logits_per_token": -2.3886983394622803, "logits_per_char": -0.6164382811515562, "num_chars": 31}, {"sum_logits": -10.765454292297363, "num_tokens": 6, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -24.25533103942871, "logits_per_token": -1.7942423820495605, "logits_per_char": -0.3166310085969813, "num_chars": 34}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 158, "native_id": "Mercury_SC_416155", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.952600479125977, "incorrect_loss_raw": 7.320852438608806, "correct_loss_per_char": 0.4476300239562988, "incorrect_loss_per_char": 0.34915849407440325, "correct_loss_per_token": 2.9842001597086587, "incorrect_loss_per_token": 2.272953364584181, "correct_loss_uncond": -9.257028579711914, "incorrect_loss_uncond": -8.596923669179281}, "model_output": [{"sum_logits": -6.0239081382751465, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -16.740318298339844, "logits_per_token": -1.5059770345687866, "logits_per_char": -0.26190904949022376, "num_chars": 23}, {"sum_logits": -5.303336143493652, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -15.203489303588867, "logits_per_token": -1.767778714497884, "logits_per_char": -0.27912295492071854, "num_chars": 19}, {"sum_logits": -8.952600479125977, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -18.20962905883789, "logits_per_token": -2.9842001597086587, "logits_per_char": -0.4476300239562988, "num_chars": 20}, {"sum_logits": -10.635313034057617, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -15.809520721435547, "logits_per_token": -3.5451043446858725, "logits_per_char": -0.5064434778122675, "num_chars": 21}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 159, "native_id": "Mercury_402145", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.611892700195312, "incorrect_loss_raw": 12.621503512064615, "correct_loss_per_char": 0.8005405772816051, "incorrect_loss_per_char": 0.5150720897902791, "correct_loss_per_token": 3.5223785400390626, "incorrect_loss_per_token": 3.2643914116753474, "correct_loss_uncond": -11.816646575927734, "incorrect_loss_uncond": -12.990981101989746}, "model_output": [{"sum_logits": -9.652000427246094, "num_tokens": 2, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -20.02715492248535, "logits_per_token": -4.826000213623047, "logits_per_char": -0.6434666951497395, "num_chars": 15}, {"sum_logits": -17.611892700195312, "num_tokens": 5, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -29.428539276123047, "logits_per_token": -3.5223785400390626, "logits_per_char": -0.8005405772816051, "num_chars": 22}, {"sum_logits": -7.952670097351074, "num_tokens": 5, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -24.44432830810547, "logits_per_token": -1.5905340194702149, "logits_per_char": -0.3058719268211952, "num_chars": 26}, {"sum_logits": -20.25984001159668, "num_tokens": 6, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -32.365970611572266, "logits_per_token": -3.37664000193278, "logits_per_char": -0.5958776473999023, "num_chars": 34}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 160, "native_id": "AIMS_2009_4_5", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.14875602722168, "incorrect_loss_raw": 21.838586171468098, "correct_loss_per_char": 0.5716252009073893, "incorrect_loss_per_char": 0.6086863658023021, "correct_loss_per_token": 4.28718900680542, "incorrect_loss_per_token": 3.005588190896171, "correct_loss_uncond": -13.623086929321289, "incorrect_loss_uncond": -12.38284683227539}, "model_output": [{"sum_logits": -17.14875602722168, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -30.77184295654297, "logits_per_token": -4.28718900680542, "logits_per_char": -0.5716252009073893, "num_chars": 30}, {"sum_logits": -19.29244613647461, "num_tokens": 7, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -32.63752746582031, "logits_per_token": -2.7560637337820872, "logits_per_char": -0.5846195798931699, "num_chars": 33}, {"sum_logits": -19.187252044677734, "num_tokens": 8, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -34.2326774597168, "logits_per_token": -2.398406505584717, "logits_per_char": -0.5482072012765067, "num_chars": 35}, {"sum_logits": -27.036060333251953, "num_tokens": 7, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -35.79409408569336, "logits_per_token": -3.8622943333217075, "logits_per_char": -0.6932323162372296, "num_chars": 39}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 161, "native_id": "TIMSS_2003_4_pg7", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 10.111858367919922, "incorrect_loss_raw": 11.153103192647299, "correct_loss_per_char": 0.7222755977085659, "incorrect_loss_per_char": 0.7899872723747702, "correct_loss_per_token": 3.370619455973307, "incorrect_loss_per_token": 3.7177010642157655, "correct_loss_uncond": -6.44476318359375, "incorrect_loss_uncond": -4.150039354960124}, "model_output": [{"sum_logits": -10.950963973999023, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -15.128266334533691, "logits_per_token": -3.6503213246663413, "logits_per_char": -0.7300642649332683, "num_chars": 15}, {"sum_logits": -12.88779067993164, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -15.48748779296875, "logits_per_token": -4.29593022664388, "logits_per_char": -1.07398255666097, "num_chars": 12}, {"sum_logits": -9.62055492401123, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -15.293673515319824, "logits_per_token": -3.2068516413370767, "logits_per_char": -0.5659149955300724, "num_chars": 17}, {"sum_logits": -10.111858367919922, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -16.556621551513672, "logits_per_token": -3.370619455973307, "logits_per_char": -0.7222755977085659, "num_chars": 14}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 162, "native_id": "Mercury_7142415", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.461310386657715, "incorrect_loss_raw": 15.459487597147623, "correct_loss_per_char": 0.19635027333309776, "incorrect_loss_per_char": 0.34703963903672785, "correct_loss_per_token": 1.065901483808245, "incorrect_loss_per_token": 1.730614799923367, "correct_loss_uncond": -21.403996467590332, "incorrect_loss_uncond": -17.125299135843914}, "model_output": [{"sum_logits": -7.461310386657715, "num_tokens": 7, "num_tokens_all": 240, "is_greedy": false, "sum_logits_uncond": -28.865306854248047, "logits_per_token": -1.065901483808245, "logits_per_char": -0.19635027333309776, "num_chars": 38}, {"sum_logits": -14.431105613708496, "num_tokens": 10, "num_tokens_all": 243, "is_greedy": false, "sum_logits_uncond": -33.94457244873047, "logits_per_token": -1.4431105613708497, "logits_per_char": -0.2531772914685701, "num_chars": 57}, {"sum_logits": -17.61737823486328, "num_tokens": 9, "num_tokens_all": 242, "is_greedy": false, "sum_logits_uncond": -31.76055908203125, "logits_per_token": -1.9574864705403645, "logits_per_char": -0.42969215206983613, "num_chars": 41}, {"sum_logits": -14.329978942871094, "num_tokens": 8, "num_tokens_all": 241, "is_greedy": false, "sum_logits_uncond": -32.04922866821289, "logits_per_token": -1.7912473678588867, "logits_per_char": -0.35824947357177733, "num_chars": 40}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 163, "native_id": "Mercury_7212818", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 22.733312606811523, "incorrect_loss_raw": 27.828821182250977, "correct_loss_per_char": 0.4836875022725856, "incorrect_loss_per_char": 0.5470644544812918, "correct_loss_per_token": 2.8416640758514404, "incorrect_loss_per_token": 3.1846793038504466, "correct_loss_uncond": -16.442285537719727, "incorrect_loss_uncond": -13.60889752705892}, "model_output": [{"sum_logits": -23.830747604370117, "num_tokens": 10, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -37.31309127807617, "logits_per_token": -2.3830747604370117, "logits_per_char": -0.4180832913047389, "num_chars": 57}, {"sum_logits": -22.733312606811523, "num_tokens": 8, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -39.17559814453125, "logits_per_token": -2.8416640758514404, "logits_per_char": -0.4836875022725856, "num_chars": 47}, {"sum_logits": -31.5299129486084, "num_tokens": 10, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -46.83359909057617, "logits_per_token": -3.15299129486084, "logits_per_char": -0.5838872768260814, "num_chars": 54}, {"sum_logits": -28.125802993774414, "num_tokens": 7, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -40.166465759277344, "logits_per_token": -4.017971856253488, "logits_per_char": -0.6392227953130548, "num_chars": 44}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 164, "native_id": "Mercury_SC_413299", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 3.564307928085327, "incorrect_loss_raw": 7.748682975769043, "correct_loss_per_char": 0.4455384910106659, "incorrect_loss_per_char": 0.47700345983692244, "correct_loss_per_token": 1.7821539640426636, "incorrect_loss_per_token": 3.0288514561123314, "correct_loss_uncond": -13.1333487033844, "incorrect_loss_uncond": -9.118207613627115}, "model_output": [{"sum_logits": -3.564307928085327, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -16.697656631469727, "logits_per_token": -1.7821539640426636, "logits_per_char": -0.4455384910106659, "num_chars": 8}, {"sum_logits": -5.9480085372924805, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -16.776866912841797, "logits_per_token": -1.982669512430827, "logits_per_char": -0.3498828551348518, "num_chars": 17}, {"sum_logits": -9.270812034606934, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -18.027372360229492, "logits_per_token": -3.090270678202311, "logits_per_char": -0.5794257521629333, "num_chars": 16}, {"sum_logits": -8.027228355407715, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.796432495117188, "logits_per_token": -4.013614177703857, "logits_per_char": -0.5017017722129822, "num_chars": 16}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 165, "native_id": "Mercury_7132020", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 3.737470865249634, "incorrect_loss_raw": 4.274992863337199, "correct_loss_per_char": 0.41527454058329266, "incorrect_loss_per_char": 0.5595130962984903, "correct_loss_per_token": 3.737470865249634, "incorrect_loss_per_token": 4.274992863337199, "correct_loss_uncond": -8.927098989486694, "incorrect_loss_uncond": -10.222226222356161}, "model_output": [{"sum_logits": -3.737470865249634, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -12.664569854736328, "logits_per_token": -3.737470865249634, "logits_per_char": -0.41527454058329266, "num_chars": 9}, {"sum_logits": -3.461418390274048, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.608772277832031, "logits_per_token": -3.461418390274048, "logits_per_char": -0.432677298784256, "num_chars": 8}, {"sum_logits": -5.140210151672363, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -13.369495391845703, "logits_per_token": -5.140210151672363, "logits_per_char": -0.6425262689590454, "num_chars": 8}, {"sum_logits": -4.2233500480651855, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -14.513389587402344, "logits_per_token": -4.2233500480651855, "logits_per_char": -0.6033357211521694, "num_chars": 7}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 166, "native_id": "MEA_2014_8_10", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 10.165483474731445, "incorrect_loss_raw": 10.406920274098715, "correct_loss_per_char": 0.44197754237962805, "incorrect_loss_per_char": 0.49995835988949505, "correct_loss_per_token": 2.5413708686828613, "incorrect_loss_per_token": 2.7879582246144614, "correct_loss_uncond": -19.256845474243164, "incorrect_loss_uncond": -11.793426990509033}, "model_output": [{"sum_logits": -6.704213619232178, "num_tokens": 3, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -22.372699737548828, "logits_per_token": -2.2347378730773926, "logits_per_char": -0.37245631217956543, "num_chars": 18}, {"sum_logits": -8.5287446975708, "num_tokens": 4, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -17.79014778137207, "logits_per_token": -2.1321861743927, "logits_per_char": -0.3280286422142616, "num_chars": 26}, {"sum_logits": -10.165483474731445, "num_tokens": 4, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -29.42232894897461, "logits_per_token": -2.5413708686828613, "logits_per_char": -0.44197754237962805, "num_chars": 23}, {"sum_logits": -15.987802505493164, "num_tokens": 4, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -26.438194274902344, "logits_per_token": -3.996950626373291, "logits_per_char": -0.7993901252746582, "num_chars": 20}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 167, "native_id": "TIMSS_1995_8_N2", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 18.13396644592285, "incorrect_loss_raw": 21.231108983357746, "correct_loss_per_char": 0.647641658782959, "incorrect_loss_per_char": 0.7140846893952059, "correct_loss_per_token": 3.022327740987142, "incorrect_loss_per_token": 2.865981137311017, "correct_loss_uncond": -6.471019744873047, "incorrect_loss_uncond": -7.472065607706706}, "model_output": [{"sum_logits": -27.827669143676758, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -37.08026123046875, "logits_per_token": -3.0919632381863065, "logits_per_char": -0.7520991660453178, "num_chars": 37}, {"sum_logits": -18.13396644592285, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -24.6049861907959, "logits_per_token": -3.022327740987142, "logits_per_char": -0.647641658782959, "num_chars": 28}, {"sum_logits": -19.80843734741211, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -23.546838760375977, "logits_per_token": -2.8297767639160156, "logits_per_char": -0.660281244913737, "num_chars": 30}, {"sum_logits": -16.057220458984375, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -25.482423782348633, "logits_per_token": -2.676203409830729, "logits_per_char": -0.7298736572265625, "num_chars": 22}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 168, "native_id": "Mercury_7024465", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 5.487638473510742, "incorrect_loss_raw": 11.707183202107748, "correct_loss_per_char": 0.3228022631476907, "incorrect_loss_per_char": 0.5628767588781932, "correct_loss_per_token": 1.8292128245035808, "incorrect_loss_per_token": 3.0818007151285802, "correct_loss_uncond": -15.804986953735352, "incorrect_loss_uncond": -11.08853530883789}, "model_output": [{"sum_logits": -5.487638473510742, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.292625427246094, "logits_per_token": -1.8292128245035808, "logits_per_char": -0.3228022631476907, "num_chars": 17}, {"sum_logits": -16.658191680908203, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.756643295288086, "logits_per_token": -5.552730560302734, "logits_per_char": -0.7932472229003906, "num_chars": 21}, {"sum_logits": -11.669633865356445, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -23.30722427368164, "logits_per_token": -2.333926773071289, "logits_per_char": -0.5556968507312593, "num_chars": 21}, {"sum_logits": -6.793724060058594, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -23.323287963867188, "logits_per_token": -1.3587448120117187, "logits_per_char": -0.3396862030029297, "num_chars": 20}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 169, "native_id": "Mercury_SC_415762", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 25.889789581298828, "incorrect_loss_raw": 19.845535278320312, "correct_loss_per_char": 0.3546546517986141, "incorrect_loss_per_char": 0.4896812683264716, "correct_loss_per_token": 1.8492706843784876, "incorrect_loss_per_token": 2.645921336279975, "correct_loss_uncond": -17.92050552368164, "incorrect_loss_uncond": -11.87863032023112}, "model_output": [{"sum_logits": -18.693161010742188, "num_tokens": 6, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -28.02587890625, "logits_per_token": -3.1155268351236978, "logits_per_char": -0.5841612815856934, "num_chars": 32}, {"sum_logits": -17.023401260375977, "num_tokens": 6, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -27.462596893310547, "logits_per_token": -2.837233543395996, "logits_per_char": -0.5006882723639993, "num_chars": 34}, {"sum_logits": -23.820043563842773, "num_tokens": 12, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -39.68402099609375, "logits_per_token": -1.9850036303202312, "logits_per_char": -0.38419425102972216, "num_chars": 62}, {"sum_logits": -25.889789581298828, "num_tokens": 14, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -43.81029510498047, "logits_per_token": -1.8492706843784876, "logits_per_char": -0.3546546517986141, "num_chars": 73}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 170, "native_id": "Mercury_415093", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 39.39633560180664, "incorrect_loss_raw": 35.00937016805013, "correct_loss_per_char": 0.9380079905192057, "incorrect_loss_per_char": 0.8461139825162988, "correct_loss_per_token": 1.4070119857788086, "incorrect_loss_per_token": 1.2789379570639743, "correct_loss_uncond": -22.62820816040039, "incorrect_loss_uncond": -19.70402654012044}, "model_output": [{"sum_logits": -31.018104553222656, "num_tokens": 27, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -50.633934020996094, "logits_per_token": -1.1488186871563946, "logits_per_char": -0.7565391354444551, "num_chars": 41}, {"sum_logits": -33.85419845581055, "num_tokens": 27, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -53.00017547607422, "logits_per_token": -1.2538592020670574, "logits_per_char": -0.8257121574587938, "num_chars": 41}, {"sum_logits": -40.15580749511719, "num_tokens": 28, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -60.506080627441406, "logits_per_token": -1.434135981968471, "logits_per_char": -0.9560906546456474, "num_chars": 42}, {"sum_logits": -39.39633560180664, "num_tokens": 28, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -62.02454376220703, "logits_per_token": -1.4070119857788086, "logits_per_char": -0.9380079905192057, "num_chars": 42}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 171, "native_id": "LEAP_2005_8_10404", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 32.410179138183594, "incorrect_loss_raw": 33.90170415242513, "correct_loss_per_char": 0.6115128139279923, "incorrect_loss_per_char": 0.6721460202015294, "correct_loss_per_token": 2.946379921653054, "incorrect_loss_per_token": 3.6634829245031084, "correct_loss_uncond": -10.871231079101562, "incorrect_loss_uncond": -11.687460581461588}, "model_output": [{"sum_logits": -32.0516357421875, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -40.966827392578125, "logits_per_token": -3.5612928602430554, "logits_per_char": -0.7453868777252907, "num_chars": 43}, {"sum_logits": -37.47417449951172, "num_tokens": 11, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -47.77781677246094, "logits_per_token": -3.406743136319247, "logits_per_char": -0.6143307295001921, "num_chars": 61}, {"sum_logits": -32.17930221557617, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -48.022850036621094, "logits_per_token": -4.0224127769470215, "logits_per_char": -0.6567204533791056, "num_chars": 49}, {"sum_logits": -32.410179138183594, "num_tokens": 11, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -43.281410217285156, "logits_per_token": -2.946379921653054, "logits_per_char": -0.6115128139279923, "num_chars": 53}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 172, "native_id": "AIMS_2008_8_6", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 19.076465606689453, "incorrect_loss_raw": 29.036579767862957, "correct_loss_per_char": 0.4335560365156694, "incorrect_loss_per_char": 0.5468228141764281, "correct_loss_per_token": 1.9076465606689452, "incorrect_loss_per_token": 2.67245769970036, "correct_loss_uncond": -14.315788269042969, "incorrect_loss_uncond": -11.531515121459961}, "model_output": [{"sum_logits": -25.159866333007812, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -32.37425994873047, "logits_per_token": -2.795540703667535, "logits_per_char": -0.5851131705350654, "num_chars": 43}, {"sum_logits": -19.076465606689453, "num_tokens": 10, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -33.39225387573242, "logits_per_token": -1.9076465606689452, "logits_per_char": -0.4335560365156694, "num_chars": 44}, {"sum_logits": -32.636714935302734, "num_tokens": 11, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -49.260650634765625, "logits_per_token": -2.9669740850275215, "logits_per_char": -0.6043836099130137, "num_chars": 54}, {"sum_logits": -29.31315803527832, "num_tokens": 13, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -40.069374084472656, "logits_per_token": -2.2548583104060245, "logits_per_char": -0.4509716620812049, "num_chars": 65}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 173, "native_id": "Mercury_7057173", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.296321868896484, "incorrect_loss_raw": 14.664311726888021, "correct_loss_per_char": 0.9586071687586167, "incorrect_loss_per_char": 0.6661116512717467, "correct_loss_per_token": 8.148160934448242, "incorrect_loss_per_token": 6.249586317274306, "correct_loss_uncond": -6.702077865600586, "incorrect_loss_uncond": -6.568382263183594}, "model_output": [{"sum_logits": -12.447256088256836, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -20.44597625732422, "logits_per_token": -6.223628044128418, "logits_per_char": -0.565784367648038, "num_chars": 22}, {"sum_logits": -12.059427261352539, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -18.168724060058594, "logits_per_token": -6.0297136306762695, "logits_per_char": -0.8039618174235026, "num_chars": 15}, {"sum_logits": -19.486251831054688, "num_tokens": 3, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -25.08338165283203, "logits_per_token": -6.4954172770182295, "logits_per_char": -0.6285887687436996, "num_chars": 31}, {"sum_logits": -16.296321868896484, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -22.99839973449707, "logits_per_token": -8.148160934448242, "logits_per_char": -0.9586071687586167, "num_chars": 17}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 174, "native_id": "TIMSS_2007_8_pg60", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.727123260498047, "incorrect_loss_raw": 6.131837209065755, "correct_loss_per_char": 1.7454246520996093, "incorrect_loss_per_char": 0.8592757346138122, "correct_loss_per_token": 8.727123260498047, "incorrect_loss_per_token": 4.578032811482747, "correct_loss_uncond": -1.698507308959961, "incorrect_loss_uncond": -5.245902061462402}, "model_output": [{"sum_logits": -4.184892654418945, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -9.982552528381348, "logits_per_token": -4.184892654418945, "logits_per_char": -0.5978418077741351, "num_chars": 7}, {"sum_logits": -7.218499183654785, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -11.402114868164062, "logits_per_token": -7.218499183654785, "logits_per_char": -1.2030831972757976, "num_chars": 6}, {"sum_logits": -8.727123260498047, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -10.425630569458008, "logits_per_token": -8.727123260498047, "logits_per_char": -1.7454246520996093, "num_chars": 5}, {"sum_logits": -6.992119789123535, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -12.748550415039062, "logits_per_token": -2.3307065963745117, "logits_per_char": -0.7769021987915039, "num_chars": 9}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 175, "native_id": "AIMS_2009_8_14", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.478546142578125, "incorrect_loss_raw": 20.293482462565105, "correct_loss_per_char": 0.3521073138127562, "incorrect_loss_per_char": 0.329286030648815, "correct_loss_per_token": 1.7898788452148438, "incorrect_loss_per_token": 1.7062982256873782, "correct_loss_uncond": -19.507652282714844, "incorrect_loss_uncond": -18.85279591878255}, "model_output": [{"sum_logits": -21.478546142578125, "num_tokens": 12, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -40.98619842529297, "logits_per_token": -1.7898788452148438, "logits_per_char": -0.3521073138127562, "num_chars": 61}, {"sum_logits": -19.33789825439453, "num_tokens": 13, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -40.61022186279297, "logits_per_token": -1.4875306349534254, "logits_per_char": -0.3119015847482989, "num_chars": 62}, {"sum_logits": -22.372013092041016, "num_tokens": 11, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -38.219703674316406, "logits_per_token": -2.0338193720037285, "logits_per_char": -0.36675431298427896, "num_chars": 61}, {"sum_logits": -19.170536041259766, "num_tokens": 12, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -38.608909606933594, "logits_per_token": -1.5975446701049805, "logits_per_char": -0.3092021942138672, "num_chars": 62}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 176, "native_id": "Mercury_185010", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.8429532051086426, "incorrect_loss_raw": 5.893664121627808, "correct_loss_per_char": 0.3493593822826039, "incorrect_loss_per_char": 0.47115275126237144, "correct_loss_per_token": 1.9214766025543213, "incorrect_loss_per_token": 1.7863160636689928, "correct_loss_uncond": -12.646697521209717, "incorrect_loss_uncond": -11.73592178026835}, "model_output": [{"sum_logits": -3.8429532051086426, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -16.48965072631836, "logits_per_token": -1.9214766025543213, "logits_per_char": -0.3493593822826039, "num_chars": 11}, {"sum_logits": -11.043024063110352, "num_tokens": 4, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -19.69070816040039, "logits_per_token": -2.760756015777588, "logits_per_char": -0.8494633894700271, "num_chars": 13}, {"sum_logits": -2.3132164478302, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": true, "sum_logits_uncond": -16.53949737548828, "logits_per_token": -1.1566082239151, "logits_per_char": -0.23132164478302003, "num_chars": 10}, {"sum_logits": -4.324751853942871, "num_tokens": 3, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -16.658552169799805, "logits_per_token": -1.4415839513142903, "logits_per_char": -0.33267321953406703, "num_chars": 13}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 177, "native_id": "Mercury_7206938", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.331600189208984, "incorrect_loss_raw": 18.04247283935547, "correct_loss_per_char": 0.6100301922492262, "incorrect_loss_per_char": 0.40903331685949257, "correct_loss_per_token": 4.041450023651123, "incorrect_loss_per_token": 2.5134369267357717, "correct_loss_uncond": -9.288780212402344, "incorrect_loss_uncond": -18.315959930419922}, "model_output": [{"sum_logits": -32.331600189208984, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -41.62038040161133, "logits_per_token": -4.041450023651123, "logits_per_char": -0.6100301922492262, "num_chars": 53}, {"sum_logits": -18.585203170776367, "num_tokens": 6, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -35.1102294921875, "logits_per_token": -3.097533861796061, "logits_per_char": -0.5162556436326768, "num_chars": 36}, {"sum_logits": -20.904998779296875, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -37.968841552734375, "logits_per_token": -2.6131248474121094, "logits_per_char": -0.4180999755859375, "num_chars": 50}, {"sum_logits": -14.637216567993164, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -35.9962272644043, "logits_per_token": -1.8296520709991455, "logits_per_char": -0.2927443313598633, "num_chars": 50}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 178, "native_id": "Mercury_402501", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 14.298311233520508, "incorrect_loss_raw": 11.476777791976929, "correct_loss_per_char": 1.5887012481689453, "incorrect_loss_per_char": 1.1337290566870968, "correct_loss_per_token": 2.8596622467041017, "incorrect_loss_per_token": 2.060909867286682, "correct_loss_uncond": -18.136499404907227, "incorrect_loss_uncond": -16.91524402300517}, "model_output": [{"sum_logits": -2.168593645095825, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": true, "sum_logits_uncond": -19.624269485473633, "logits_per_token": -1.0842968225479126, "logits_per_char": -0.43371872901916503, "num_chars": 5}, {"sum_logits": -14.298311233520508, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -32.434810638427734, "logits_per_token": -2.8596622467041017, "logits_per_char": -1.5887012481689453, "num_chars": 9}, {"sum_logits": -14.209537506103516, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -30.430652618408203, "logits_per_token": -2.841907501220703, "logits_per_char": -1.5788375006781683, "num_chars": 9}, {"sum_logits": -18.052202224731445, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -35.12114334106445, "logits_per_token": -2.2565252780914307, "logits_per_char": -1.3886309403639574, "num_chars": 13}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 179, "native_id": "MCAS_2011_8_15365", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.015542030334473, "incorrect_loss_raw": 11.47188409169515, "correct_loss_per_char": 0.8195947300304066, "incorrect_loss_per_char": 1.18798929143835, "correct_loss_per_token": 4.507771015167236, "incorrect_loss_per_token": 5.735942045847575, "correct_loss_uncond": -5.9671220779418945, "incorrect_loss_uncond": -4.560559590657552}, "model_output": [{"sum_logits": -11.01623821258545, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -16.36313819885254, "logits_per_token": -5.508119106292725, "logits_per_char": -1.2240264680650499, "num_chars": 9}, {"sum_logits": -9.937137603759766, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -14.982743263244629, "logits_per_token": -4.968568801879883, "logits_per_char": -0.9937137603759766, "num_chars": 10}, {"sum_logits": -9.015542030334473, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -14.982664108276367, "logits_per_token": -4.507771015167236, "logits_per_char": -0.8195947300304066, "num_chars": 11}, {"sum_logits": -13.462276458740234, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -16.751449584960938, "logits_per_token": -6.731138229370117, "logits_per_char": -1.3462276458740234, "num_chars": 10}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 180, "native_id": "Mercury_SC_401766", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 7.471149444580078, "incorrect_loss_raw": 12.020799318949381, "correct_loss_per_char": 0.6791954040527344, "incorrect_loss_per_char": 0.9955147902170817, "correct_loss_per_token": 2.4903831481933594, "incorrect_loss_per_token": 7.070408185323079, "correct_loss_uncond": -7.678312301635742, "incorrect_loss_uncond": -5.498496691385905}, "model_output": [{"sum_logits": -7.471149444580078, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -15.14946174621582, "logits_per_token": -2.4903831481933594, "logits_per_char": -0.6791954040527344, "num_chars": 11}, {"sum_logits": -6.360051155090332, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -15.143674850463867, "logits_per_token": -6.360051155090332, "logits_per_char": -0.7950063943862915, "num_chars": 8}, {"sum_logits": -16.086782455444336, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -20.662738800048828, "logits_per_token": -8.043391227722168, "logits_per_char": -1.3405652046203613, "num_chars": 12}, {"sum_logits": -13.615564346313477, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -16.751474380493164, "logits_per_token": -6.807782173156738, "logits_per_char": -0.8509727716445923, "num_chars": 16}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 181, "native_id": "Mercury_7162400", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 12.447149276733398, "incorrect_loss_raw": 13.6925630569458, "correct_loss_per_char": 0.3035890067495951, "incorrect_loss_per_char": 0.44883177106083855, "correct_loss_per_token": 1.383016586303711, "incorrect_loss_per_token": 2.827603374208723, "correct_loss_uncond": -15.563770294189453, "incorrect_loss_uncond": -8.603333473205566}, "model_output": [{"sum_logits": -10.620166778564453, "num_tokens": 4, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -19.04323959350586, "logits_per_token": -2.6550416946411133, "logits_per_char": -0.40846795302170974, "num_chars": 26}, {"sum_logits": -13.782475471496582, "num_tokens": 4, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -22.079252243041992, "logits_per_token": -3.4456188678741455, "logits_per_char": -0.5104620544998734, "num_chars": 27}, {"sum_logits": -16.675046920776367, "num_tokens": 7, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -25.76519775390625, "logits_per_token": -2.3821495601109097, "logits_per_char": -0.4275653056609325, "num_chars": 39}, {"sum_logits": -12.447149276733398, "num_tokens": 9, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -28.01091957092285, "logits_per_token": -1.383016586303711, "logits_per_char": -0.3035890067495951, "num_chars": 41}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 182, "native_id": "Mercury_7086695", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.241001605987549, "incorrect_loss_raw": 4.257900794347127, "correct_loss_per_char": 0.4764546914534135, "incorrect_loss_per_char": 0.3660791035653766, "correct_loss_per_token": 2.6205008029937744, "incorrect_loss_per_token": 2.4640023443433976, "correct_loss_uncond": -12.141692638397217, "incorrect_loss_uncond": -12.261523644129435}, "model_output": [{"sum_logits": -3.8380491733551025, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -12.918476104736328, "logits_per_token": -3.8380491733551025, "logits_per_char": -0.4797561466693878, "num_chars": 8}, {"sum_logits": -5.483212471008301, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -19.843334197998047, "logits_per_token": -1.8277374903361003, "logits_per_char": -0.3046229150560167, "num_chars": 18}, {"sum_logits": -5.241001605987549, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.382694244384766, "logits_per_token": -2.6205008029937744, "logits_per_char": -0.4764546914534135, "num_chars": 11}, {"sum_logits": -3.4524407386779785, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -16.796463012695312, "logits_per_token": -1.7262203693389893, "logits_per_char": -0.31385824897072534, "num_chars": 11}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 183, "native_id": "Mercury_SC_402994", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.653541564941406, "incorrect_loss_raw": 16.69248867034912, "correct_loss_per_char": 0.49037615458170575, "incorrect_loss_per_char": 0.468428474270331, "correct_loss_per_token": 2.5219345092773438, "incorrect_loss_per_token": 2.252885001046317, "correct_loss_uncond": -15.962547302246094, "incorrect_loss_uncond": -18.475940386454266}, "model_output": [{"sum_logits": -22.135047912597656, "num_tokens": 8, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -35.29487609863281, "logits_per_token": -2.766880989074707, "logits_per_char": -0.6510308209587546, "num_chars": 34}, {"sum_logits": -12.946621894836426, "num_tokens": 7, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -33.13429641723633, "logits_per_token": -1.8495174135480608, "logits_per_char": -0.3596283859676785, "num_chars": 36}, {"sum_logits": -17.653541564941406, "num_tokens": 7, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -33.6160888671875, "logits_per_token": -2.5219345092773438, "logits_per_char": -0.49037615458170575, "num_chars": 36}, {"sum_logits": -14.995796203613281, "num_tokens": 7, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -37.076114654541016, "logits_per_token": -2.142256600516183, "logits_per_char": -0.39462621588456004, "num_chars": 38}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 184, "native_id": "Mercury_7056298", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.65396785736084, "incorrect_loss_raw": 15.98325284322103, "correct_loss_per_char": 0.3403036490730617, "incorrect_loss_per_char": 0.3426478583247361, "correct_loss_per_token": 1.956745982170105, "incorrect_loss_per_token": 1.9979066054026287, "correct_loss_uncond": -19.377934455871582, "incorrect_loss_uncond": -17.105595270792644}, "model_output": [{"sum_logits": -15.65396785736084, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -35.03190231323242, "logits_per_token": -1.956745982170105, "logits_per_char": -0.3403036490730617, "num_chars": 46}, {"sum_logits": -16.725116729736328, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -34.519325256347656, "logits_per_token": -2.090639591217041, "logits_per_char": -0.36358949412470276, "num_chars": 46}, {"sum_logits": -16.240623474121094, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -32.354713439941406, "logits_per_token": -2.0300779342651367, "logits_per_char": -0.3455451803004488, "num_chars": 47}, {"sum_logits": -14.984018325805664, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -32.39250564575195, "logits_per_token": -1.873002290725708, "logits_per_char": -0.3188089005490567, "num_chars": 47}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 185, "native_id": "Mercury_409115", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.014036178588867, "incorrect_loss_raw": 15.05569871266683, "correct_loss_per_char": 0.29531206850145686, "incorrect_loss_per_char": 0.28746481899355275, "correct_loss_per_token": 2.001559575398763, "incorrect_loss_per_token": 1.8000172200026334, "correct_loss_uncond": -18.861101150512695, "incorrect_loss_uncond": -19.546250343322754}, "model_output": [{"sum_logits": -13.955474853515625, "num_tokens": 8, "num_tokens_all": 241, "is_greedy": false, "sum_logits_uncond": -34.749244689941406, "logits_per_token": -1.7444343566894531, "logits_per_char": -0.2907390594482422, "num_chars": 48}, {"sum_logits": -13.511475563049316, "num_tokens": 8, "num_tokens_all": 241, "is_greedy": false, "sum_logits_uncond": -32.62956237792969, "logits_per_token": -1.6889344453811646, "logits_per_char": -0.2814890742301941, "num_chars": 48}, {"sum_logits": -18.014036178588867, "num_tokens": 9, "num_tokens_all": 242, "is_greedy": false, "sum_logits_uncond": -36.87513732910156, "logits_per_token": -2.001559575398763, "logits_per_char": -0.29531206850145686, "num_chars": 61}, {"sum_logits": -17.700145721435547, "num_tokens": 9, "num_tokens_all": 242, "is_greedy": false, "sum_logits_uncond": -36.427040100097656, "logits_per_token": -1.966682857937283, "logits_per_char": -0.2901663233022221, "num_chars": 61}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 186, "native_id": "Mercury_409647", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.818017959594727, "incorrect_loss_raw": 17.53576405843099, "correct_loss_per_char": 0.6163603591918946, "incorrect_loss_per_char": 0.4014825962058586, "correct_loss_per_token": 2.8016379963267934, "incorrect_loss_per_token": 1.9979221909134477, "correct_loss_uncond": -14.110937118530273, "incorrect_loss_uncond": -16.381148020426433}, "model_output": [{"sum_logits": -16.70165252685547, "num_tokens": 9, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -31.32823944091797, "logits_per_token": -1.8557391696506076, "logits_per_char": -0.38841052388035974, "num_chars": 43}, {"sum_logits": -10.692855834960938, "num_tokens": 8, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -22.45501708984375, "logits_per_token": -1.3366069793701172, "logits_per_char": -0.24301945079456677, "num_chars": 44}, {"sum_logits": -25.212783813476562, "num_tokens": 9, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -47.96747970581055, "logits_per_token": -2.801420423719618, "logits_per_char": -0.5730178139426492, "num_chars": 44}, {"sum_logits": -30.818017959594727, "num_tokens": 11, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -44.928955078125, "logits_per_token": -2.8016379963267934, "logits_per_char": -0.6163603591918946, "num_chars": 50}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 187, "native_id": "Mercury_414352", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.50935935974121, "incorrect_loss_raw": 18.16290028889974, "correct_loss_per_char": 0.7861599695114863, "incorrect_loss_per_char": 0.7065330468283758, "correct_loss_per_token": 3.3018718719482423, "incorrect_loss_per_token": 2.3066628581345685, "correct_loss_uncond": -8.823907852172852, "incorrect_loss_uncond": -14.663860321044922}, "model_output": [{"sum_logits": -16.50935935974121, "num_tokens": 5, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -25.333267211914062, "logits_per_token": -3.3018718719482423, "logits_per_char": -0.7861599695114863, "num_chars": 21}, {"sum_logits": -24.519779205322266, "num_tokens": 9, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -39.00919723510742, "logits_per_token": -2.724419911702474, "logits_per_char": -0.9807911682128906, "num_chars": 25}, {"sum_logits": -19.418800354003906, "num_tokens": 6, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -33.207313537597656, "logits_per_token": -3.236466725667318, "logits_per_char": -0.8091166814168295, "num_chars": 24}, {"sum_logits": -10.550121307373047, "num_tokens": 11, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -26.263771057128906, "logits_per_token": -0.9591019370339133, "logits_per_char": -0.3296912908554077, "num_chars": 32}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 188, "native_id": "Mercury_185325", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.232295513153076, "incorrect_loss_raw": 12.303287506103516, "correct_loss_per_char": 0.29384504665027966, "incorrect_loss_per_char": 1.0632524066501194, "correct_loss_per_token": 1.616147756576538, "incorrect_loss_per_token": 4.963925361633301, "correct_loss_uncond": -13.07570505142212, "incorrect_loss_uncond": -5.08436393737793}, "model_output": [{"sum_logits": -11.404268264770508, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -16.69904136657715, "logits_per_token": -5.702134132385254, "logits_per_char": -1.1404268264770507, "num_chars": 10}, {"sum_logits": -11.87718391418457, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -17.33124351501465, "logits_per_token": -2.375436782836914, "logits_per_char": -0.9136295318603516, "num_chars": 13}, {"sum_logits": -3.232295513153076, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -16.308000564575195, "logits_per_token": -1.616147756576538, "logits_per_char": -0.29384504665027966, "num_chars": 11}, {"sum_logits": -13.628410339355469, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -18.13266944885254, "logits_per_token": -6.814205169677734, "logits_per_char": -1.1357008616129558, "num_chars": 12}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 189, "native_id": "Mercury_SC_412374", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.313142776489258, "incorrect_loss_raw": 13.803790410359701, "correct_loss_per_char": 0.6480601461310136, "incorrect_loss_per_char": 0.8564250337532143, "correct_loss_per_token": 3.0782856941223145, "incorrect_loss_per_token": 3.717605617311266, "correct_loss_uncond": -11.66050910949707, "incorrect_loss_uncond": -12.481454849243164}, "model_output": [{"sum_logits": -9.599688529968262, "num_tokens": 3, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -23.16071319580078, "logits_per_token": -3.1998961766560874, "logits_per_char": -0.6399792353312175, "num_chars": 15}, {"sum_logits": -16.03013038635254, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -27.377422332763672, "logits_per_token": -4.007532596588135, "logits_per_char": -0.9429488462560317, "num_chars": 17}, {"sum_logits": -12.313142776489258, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -23.973651885986328, "logits_per_token": -3.0782856941223145, "logits_per_char": -0.6480601461310136, "num_chars": 19}, {"sum_logits": -15.7815523147583, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -28.31760025024414, "logits_per_token": -3.945388078689575, "logits_per_char": -0.9863470196723938, "num_chars": 16}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 190, "native_id": "Mercury_SC_401818", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.348915100097656, "incorrect_loss_raw": 5.642869710922241, "correct_loss_per_char": 0.445742925008138, "incorrect_loss_per_char": 0.4534235189487408, "correct_loss_per_token": 2.674457550048828, "incorrect_loss_per_token": 2.2327120966381497, "correct_loss_uncond": -12.873743057250977, "incorrect_loss_uncond": -12.682654937108358}, "model_output": [{"sum_logits": -3.0515477657318115, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -17.891754150390625, "logits_per_token": -1.5257738828659058, "logits_per_char": -0.30515477657318113, "num_chars": 10}, {"sum_logits": -3.2800517082214355, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -19.24686050415039, "logits_per_token": -1.6400258541107178, "logits_per_char": -0.29818651892922143, "num_chars": 11}, {"sum_logits": -5.348915100097656, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -18.222658157348633, "logits_per_token": -2.674457550048828, "logits_per_char": -0.445742925008138, "num_chars": 12}, {"sum_logits": -10.597009658813477, "num_tokens": 3, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -17.83795928955078, "logits_per_token": -3.5323365529378257, "logits_per_char": -0.7569292613438198, "num_chars": 14}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 191, "native_id": "Mercury_SC_413549", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.183931350708008, "incorrect_loss_raw": 14.700742721557617, "correct_loss_per_char": 0.6601709282916525, "incorrect_loss_per_char": 0.7160466858834932, "correct_loss_per_token": 2.5306552251180015, "incorrect_loss_per_token": 2.4501237869262695, "correct_loss_uncond": -11.121833801269531, "incorrect_loss_uncond": -11.267386754353842}, "model_output": [{"sum_logits": -13.500946044921875, "num_tokens": 6, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -23.84095001220703, "logits_per_token": -2.250157674153646, "logits_per_char": -0.6750473022460938, "num_chars": 20}, {"sum_logits": -15.183931350708008, "num_tokens": 6, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -26.30576515197754, "logits_per_token": -2.5306552251180015, "logits_per_char": -0.6601709282916525, "num_chars": 23}, {"sum_logits": -12.533697128295898, "num_tokens": 6, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -26.695392608642578, "logits_per_token": -2.08894952138265, "logits_per_char": -0.5697135058316317, "num_chars": 22}, {"sum_logits": -18.067584991455078, "num_tokens": 6, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -27.368045806884766, "logits_per_token": -3.011264165242513, "logits_per_char": -0.903379249572754, "num_chars": 20}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 192, "native_id": "Mercury_7093958", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 8.314274787902832, "incorrect_loss_raw": 10.741950511932373, "correct_loss_per_char": 0.489074987523696, "incorrect_loss_per_char": 0.52242340360369, "correct_loss_per_token": 2.771424929300944, "incorrect_loss_per_token": 2.8928166230519614, "correct_loss_uncond": -16.938612937927246, "incorrect_loss_uncond": -11.522812366485596}, "model_output": [{"sum_logits": -8.314274787902832, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -25.252887725830078, "logits_per_token": -2.771424929300944, "logits_per_char": -0.489074987523696, "num_chars": 17}, {"sum_logits": -7.463843822479248, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -18.02855682373047, "logits_per_token": -2.487947940826416, "logits_per_char": -0.3554211344037737, "num_chars": 21}, {"sum_logits": -13.736457824707031, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -24.40859603881836, "logits_per_token": -3.434114456176758, "logits_per_char": -0.6868228912353516, "num_chars": 20}, {"sum_logits": -11.02554988861084, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -24.357135772705078, "logits_per_token": -2.75638747215271, "logits_per_char": -0.5250261851719448, "num_chars": 21}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 193, "native_id": "Mercury_7102323", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.5483851432800293, "incorrect_loss_raw": 5.922760009765625, "correct_loss_per_char": 0.149905008428237, "incorrect_loss_per_char": 0.38169622256941066, "correct_loss_per_token": 0.8494617144266764, "incorrect_loss_per_token": 1.9742533365885417, "correct_loss_uncond": -17.81108522415161, "incorrect_loss_uncond": -13.12166976928711}, "model_output": [{"sum_logits": -8.819095611572266, "num_tokens": 3, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -19.058368682861328, "logits_per_token": -2.9396985371907554, "logits_per_char": -0.6299354008265904, "num_chars": 14}, {"sum_logits": -4.473214149475098, "num_tokens": 3, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -17.341476440429688, "logits_per_token": -1.491071383158366, "logits_per_char": -0.2795758843421936, "num_chars": 16}, {"sum_logits": -4.475970268249512, "num_tokens": 3, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -20.733444213867188, "logits_per_token": -1.491990089416504, "logits_per_char": -0.235577382539448, "num_chars": 19}, {"sum_logits": -2.5483851432800293, "num_tokens": 3, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -20.35947036743164, "logits_per_token": -0.8494617144266764, "logits_per_char": -0.149905008428237, "num_chars": 17}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 194, "native_id": "Mercury_7222793", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 21.79914665222168, "incorrect_loss_raw": 23.967230478922527, "correct_loss_per_char": 0.45414888858795166, "incorrect_loss_per_char": 0.540998814658128, "correct_loss_per_token": 2.4221274058024087, "incorrect_loss_per_token": 3.170521509079706, "correct_loss_uncond": -21.658205032348633, "incorrect_loss_uncond": -14.79031499226888}, "model_output": [{"sum_logits": -24.775911331176758, "num_tokens": 7, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -41.68478012084961, "logits_per_token": -3.5394159044538225, "logits_per_char": -0.6696192251669394, "num_chars": 37}, {"sum_logits": -21.79914665222168, "num_tokens": 9, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -43.45735168457031, "logits_per_token": -2.4221274058024087, "logits_per_char": -0.45414888858795166, "num_chars": 48}, {"sum_logits": -23.182451248168945, "num_tokens": 7, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -36.823551177978516, "logits_per_token": -3.3117787497384206, "logits_per_char": -0.4545578676111558, "num_chars": 51}, {"sum_logits": -23.943328857421875, "num_tokens": 9, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -37.764305114746094, "logits_per_token": -2.660369873046875, "logits_per_char": -0.49881935119628906, "num_chars": 48}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 195, "native_id": "Mercury_SC_400701", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 13.771407127380371, "incorrect_loss_raw": 20.59935251871745, "correct_loss_per_char": 0.3358879787165944, "incorrect_loss_per_char": 0.5024232321638402, "correct_loss_per_token": 1.967343875340053, "incorrect_loss_per_token": 2.7034881682623, "correct_loss_uncond": -19.812214851379395, "incorrect_loss_uncond": -18.17816162109375}, "model_output": [{"sum_logits": -13.771407127380371, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -33.583621978759766, "logits_per_token": -1.967343875340053, "logits_per_char": -0.3358879787165944, "num_chars": 41}, {"sum_logits": -23.73058319091797, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -39.751121520996094, "logits_per_token": -2.966322898864746, "logits_per_char": -0.578794711973609, "num_chars": 41}, {"sum_logits": -21.599609375, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -36.653709411621094, "logits_per_token": -3.085658482142857, "logits_per_char": -0.5268197408536586, "num_chars": 41}, {"sum_logits": -16.467864990234375, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -39.927711486816406, "logits_per_token": -2.058483123779297, "logits_per_char": -0.40165524366425304, "num_chars": 41}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 196, "native_id": "Mercury_409301", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.17500877380371, "incorrect_loss_raw": 19.829421361287434, "correct_loss_per_char": 0.4782897045737819, "incorrect_loss_per_char": 0.5497235574128427, "correct_loss_per_token": 2.0194454193115234, "incorrect_loss_per_token": 2.6504349784245567, "correct_loss_uncond": -21.740468978881836, "incorrect_loss_uncond": -18.0568052927653}, "model_output": [{"sum_logits": -17.231082916259766, "num_tokens": 9, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -41.20066833496094, "logits_per_token": -1.9145647684733074, "logits_per_char": -0.4418226388784555, "num_chars": 39}, {"sum_logits": -18.17500877380371, "num_tokens": 9, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -39.91547775268555, "logits_per_token": -2.0194454193115234, "logits_per_char": -0.4782897045737819, "num_chars": 38}, {"sum_logits": -21.38513946533203, "num_tokens": 7, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -36.442344665527344, "logits_per_token": -3.0550199236188615, "logits_per_char": -0.6110039847237724, "num_chars": 35}, {"sum_logits": -20.872041702270508, "num_tokens": 7, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -36.01566696166992, "logits_per_token": -2.9817202431815013, "logits_per_char": -0.5963440486363002, "num_chars": 35}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 197, "native_id": "Mercury_SC_400383", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.777023315429688, "incorrect_loss_raw": 24.77289581298828, "correct_loss_per_char": 0.5991794041224888, "incorrect_loss_per_char": 0.8156413463558633, "correct_loss_per_token": 4.194255828857422, "incorrect_loss_per_token": 4.84360610871088, "correct_loss_uncond": -12.230060577392578, "incorrect_loss_uncond": -8.502713521321615}, "model_output": [{"sum_logits": -34.744327545166016, "num_tokens": 8, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -41.97389221191406, "logits_per_token": -4.343040943145752, "logits_per_char": -0.9143244090833162, "num_chars": 38}, {"sum_logits": -15.769298553466797, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -26.78335189819336, "logits_per_token": -2.2527569362095425, "logits_per_char": -0.45055138724190846, "num_chars": 35}, {"sum_logits": -16.777023315429688, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -29.007083892822266, "logits_per_token": -4.194255828857422, "logits_per_char": -0.5991794041224888, "num_chars": 28}, {"sum_logits": -23.80506134033203, "num_tokens": 3, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -31.069583892822266, "logits_per_token": -7.935020446777344, "logits_per_char": -1.082048242742365, "num_chars": 22}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 198, "native_id": "CSZ_2005_5_CSZ10021", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.292539596557617, "incorrect_loss_raw": 23.29128583272298, "correct_loss_per_char": 0.4695096356528146, "incorrect_loss_per_char": 0.46388337092643694, "correct_loss_per_token": 2.1910449663798013, "incorrect_loss_per_token": 2.2868257805153176, "correct_loss_uncond": -16.291494369506836, "incorrect_loss_uncond": -14.025883992513021}, "model_output": [{"sum_logits": -18.112003326416016, "num_tokens": 9, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -31.643888473510742, "logits_per_token": -2.012444814046224, "logits_per_char": -0.4212093796840934, "num_chars": 43}, {"sum_logits": -26.292539596557617, "num_tokens": 12, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -42.58403396606445, "logits_per_token": -2.1910449663798013, "logits_per_char": -0.4695096356528146, "num_chars": 56}, {"sum_logits": -32.518245697021484, "num_tokens": 12, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -43.51718521118164, "logits_per_token": -2.7098538080851235, "logits_per_char": -0.5330859950331391, "num_chars": 61}, {"sum_logits": -19.243608474731445, "num_tokens": 9, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -36.790435791015625, "logits_per_token": -2.138178719414605, "logits_per_char": -0.4373547380620783, "num_chars": 44}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 199, "native_id": "Mercury_SC_407070", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 22.160449981689453, "incorrect_loss_raw": 25.900888442993164, "correct_loss_per_char": 0.6155680550469292, "incorrect_loss_per_char": 0.6881738691111945, "correct_loss_per_token": 2.2160449981689454, "incorrect_loss_per_token": 3.436801098011158, "correct_loss_uncond": -19.175926208496094, "incorrect_loss_uncond": -9.43100674947103}, "model_output": [{"sum_logits": -21.193912506103516, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -31.992876052856445, "logits_per_token": -3.5323187510172525, "logits_per_char": -0.7064637502034505, "num_chars": 30}, {"sum_logits": -22.160449981689453, "num_tokens": 10, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -41.33637619018555, "logits_per_token": -2.2160449981689454, "logits_per_char": -0.6155680550469292, "num_chars": 36}, {"sum_logits": -20.55668830871582, "num_tokens": 9, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.14756202697754, "logits_per_token": -2.2840764787462025, "logits_per_char": -0.540965481808311, "num_chars": 38}, {"sum_logits": -35.952064514160156, "num_tokens": 8, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -44.855247497558594, "logits_per_token": -4.4940080642700195, "logits_per_char": -0.8170923753218218, "num_chars": 44}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 200, "native_id": "Mercury_SC_400708", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 22.978042602539062, "incorrect_loss_raw": 23.21665636698405, "correct_loss_per_char": 0.47870922088623047, "incorrect_loss_per_char": 0.7090146123355799, "correct_loss_per_token": 2.5531158447265625, "incorrect_loss_per_token": 3.166813464391799, "correct_loss_uncond": -14.701000213623047, "incorrect_loss_uncond": -8.911086400349935}, "model_output": [{"sum_logits": -17.77225685119629, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -30.09612274169922, "logits_per_token": -2.538893835885184, "logits_per_char": -0.5732986081031061, "num_chars": 31}, {"sum_logits": -26.702621459960938, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -35.672603607177734, "logits_per_token": -3.814660208565848, "logits_per_char": -0.8344569206237793, "num_chars": 32}, {"sum_logits": -22.978042602539062, "num_tokens": 9, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -37.67904281616211, "logits_per_token": -2.5531158447265625, "logits_per_char": -0.47870922088623047, "num_chars": 48}, {"sum_logits": -25.175090789794922, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -30.614501953125, "logits_per_token": -3.1468863487243652, "logits_per_char": -0.7192883082798549, "num_chars": 35}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 201, "native_id": "Mercury_7075040", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 11.791074752807617, "incorrect_loss_raw": 18.452123324076336, "correct_loss_per_char": 0.2742110407629678, "incorrect_loss_per_char": 0.43914654021169625, "correct_loss_per_token": 1.9651791254679363, "incorrect_loss_per_token": 3.093702319311717, "correct_loss_uncond": -20.406076431274414, "incorrect_loss_uncond": -17.819492022196453}, "model_output": [{"sum_logits": -14.961424827575684, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -33.437477111816406, "logits_per_token": -2.992284965515137, "logits_per_char": -0.49871416091918946, "num_chars": 30}, {"sum_logits": -21.760852813720703, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -36.18521499633789, "logits_per_token": -3.6268088022867837, "logits_per_char": -0.45335110028584796, "num_chars": 48}, {"sum_logits": -11.791074752807617, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -32.19715118408203, "logits_per_token": -1.9651791254679363, "logits_per_char": -0.2742110407629678, "num_chars": 43}, {"sum_logits": -18.634092330932617, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -39.19215393066406, "logits_per_token": -2.662013190133231, "logits_per_char": -0.3653743594300513, "num_chars": 51}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 202, "native_id": "Mercury_7137165", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.941716194152832, "incorrect_loss_raw": 7.702091375986735, "correct_loss_per_char": 0.3268573549058702, "incorrect_loss_per_char": 0.6877790643711282, "correct_loss_per_token": 1.470858097076416, "incorrect_loss_per_token": 3.8510456879933677, "correct_loss_uncond": -10.526040077209473, "incorrect_loss_uncond": -6.199181079864502}, "model_output": [{"sum_logits": -2.941716194152832, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": true, "sum_logits_uncond": -13.467756271362305, "logits_per_token": -1.470858097076416, "logits_per_char": -0.3268573549058702, "num_chars": 9}, {"sum_logits": -5.026069164276123, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -12.765954971313477, "logits_per_token": -2.5130345821380615, "logits_per_char": -0.5026069164276123, "num_chars": 10}, {"sum_logits": -10.946063041687012, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -15.133918762207031, "logits_per_token": -5.473031520843506, "logits_per_char": -0.9121719201405843, "num_chars": 12}, {"sum_logits": -7.13414192199707, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -13.803943634033203, "logits_per_token": -3.567070960998535, "logits_per_char": -0.6485583565451882, "num_chars": 11}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 203, "native_id": "Mercury_SC_400046", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 23.02922821044922, "incorrect_loss_raw": 18.27303632100423, "correct_loss_per_char": 0.548314957391648, "incorrect_loss_per_char": 0.5813150772972712, "correct_loss_per_token": 2.8786535263061523, "incorrect_loss_per_token": 2.4314490892154312, "correct_loss_uncond": -18.09157943725586, "incorrect_loss_uncond": -14.575044631958008}, "model_output": [{"sum_logits": -16.622726440429688, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -29.665203094482422, "logits_per_token": -2.7704544067382812, "logits_per_char": -0.7915584019252232, "num_chars": 21}, {"sum_logits": -23.02922821044922, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -41.12080764770508, "logits_per_token": -2.8786535263061523, "logits_per_char": -0.548314957391648, "num_chars": 42}, {"sum_logits": -20.241268157958984, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -36.742332458496094, "logits_per_token": -2.8916097368512834, "logits_per_char": -0.5783219473702567, "num_chars": 35}, {"sum_logits": -17.955114364624023, "num_tokens": 11, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -32.1367073059082, "logits_per_token": -1.6322831240567295, "logits_per_char": -0.3740648825963338, "num_chars": 48}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 204, "native_id": "Mercury_7099330", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.306135177612305, "incorrect_loss_raw": 11.050718943277994, "correct_loss_per_char": 0.39553024655296687, "incorrect_loss_per_char": 0.5837925527610031, "correct_loss_per_token": 4.153067588806152, "incorrect_loss_per_token": 5.525359471638997, "correct_loss_uncond": -8.942941665649414, "incorrect_loss_uncond": -7.562426249186198}, "model_output": [{"sum_logits": -10.627245903015137, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -18.814678192138672, "logits_per_token": -5.313622951507568, "logits_per_char": -0.6251321119420669, "num_chars": 17}, {"sum_logits": -15.325065612792969, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -19.55876922607422, "logits_per_token": -7.662532806396484, "logits_per_char": -0.7662532806396485, "num_chars": 20}, {"sum_logits": -7.199845314025879, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -17.465988159179688, "logits_per_token": -3.5999226570129395, "logits_per_char": -0.35999226570129395, "num_chars": 20}, {"sum_logits": -8.306135177612305, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -17.24907684326172, "logits_per_token": -4.153067588806152, "logits_per_char": -0.39553024655296687, "num_chars": 21}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 205, "native_id": "MDSA_2007_5_2", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 8.154410362243652, "incorrect_loss_raw": 7.110759735107422, "correct_loss_per_char": 0.6795341968536377, "incorrect_loss_per_char": 0.6496077961391874, "correct_loss_per_token": 2.038602590560913, "incorrect_loss_per_token": 2.0629123581780324, "correct_loss_uncond": -14.081095695495605, "incorrect_loss_uncond": -11.294871012369791}, "model_output": [{"sum_logits": -10.268007278442383, "num_tokens": 3, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -20.484207153320312, "logits_per_token": -3.4226690928141275, "logits_per_char": -1.0268007278442384, "num_chars": 10}, {"sum_logits": -3.627694606781006, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": true, "sum_logits_uncond": -16.198484420776367, "logits_per_token": -0.9069236516952515, "logits_per_char": -0.3023078838984172, "num_chars": 12}, {"sum_logits": -8.154410362243652, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -22.235506057739258, "logits_per_token": -2.038602590560913, "logits_per_char": -0.6795341968536377, "num_chars": 12}, {"sum_logits": -7.436577320098877, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -18.53420066833496, "logits_per_token": -1.8591443300247192, "logits_per_char": -0.6197147766749064, "num_chars": 12}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 206, "native_id": "Mercury_7271758", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 20.749618530273438, "incorrect_loss_raw": 22.77609380086263, "correct_loss_per_char": 0.37726579145951705, "incorrect_loss_per_char": 0.4816774835966446, "correct_loss_per_token": 2.074961853027344, "incorrect_loss_per_token": 2.9729199523017518, "correct_loss_uncond": -18.108497619628906, "incorrect_loss_uncond": -14.697205861409506}, "model_output": [{"sum_logits": -23.53748321533203, "num_tokens": 8, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -41.32744598388672, "logits_per_token": -2.942185401916504, "logits_per_char": -0.5007975152198304, "num_chars": 47}, {"sum_logits": -20.749618530273438, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -38.858116149902344, "logits_per_token": -2.074961853027344, "logits_per_char": -0.37726579145951705, "num_chars": 55}, {"sum_logits": -23.638216018676758, "num_tokens": 8, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -38.1449089050293, "logits_per_token": -2.9547770023345947, "logits_per_char": -0.46349443173876, "num_chars": 51}, {"sum_logits": -21.1525821685791, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -32.94754409790039, "logits_per_token": -3.0217974526541576, "logits_per_char": -0.4807405038313432, "num_chars": 44}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 207, "native_id": "MCAS_2003_8_31", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 18.97162628173828, "incorrect_loss_raw": 27.645774205525715, "correct_loss_per_char": 0.3059939722861013, "incorrect_loss_per_char": 0.5196502055984095, "correct_loss_per_token": 1.897162628173828, "incorrect_loss_per_token": 2.7645774205525715, "correct_loss_uncond": -22.87871551513672, "incorrect_loss_uncond": -22.856799443562824}, "model_output": [{"sum_logits": -16.89875602722168, "num_tokens": 10, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -41.12091827392578, "logits_per_token": -1.689875602722168, "logits_per_char": -0.3129399264300311, "num_chars": 54}, {"sum_logits": -30.457427978515625, "num_tokens": 10, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -48.595333099365234, "logits_per_token": -3.0457427978515623, "logits_per_char": -0.5746684524248231, "num_chars": 53}, {"sum_logits": -18.97162628173828, "num_tokens": 10, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -41.850341796875, "logits_per_token": -1.897162628173828, "logits_per_char": -0.3059939722861013, "num_chars": 62}, {"sum_logits": -35.581138610839844, "num_tokens": 10, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -61.79146957397461, "logits_per_token": -3.558113861083984, "logits_per_char": -0.6713422379403744, "num_chars": 53}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 208, "native_id": "AKDE&ED_2008_8_53", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 38.48111343383789, "incorrect_loss_raw": 34.87187957763672, "correct_loss_per_char": 0.6522222615904727, "incorrect_loss_per_char": 0.5786735299134236, "correct_loss_per_token": 3.8481113433837892, "incorrect_loss_per_token": 3.487187957763672, "correct_loss_uncond": -4.824817657470703, "incorrect_loss_uncond": -10.83099110921224}, "model_output": [{"sum_logits": -33.96365737915039, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -39.129920959472656, "logits_per_token": -3.396365737915039, "logits_per_char": -0.6064938817705426, "num_chars": 56}, {"sum_logits": -33.30046844482422, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -47.90392303466797, "logits_per_token": -3.330046844482422, "logits_per_char": -0.5459093187676102, "num_chars": 61}, {"sum_logits": -38.48111343383789, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -43.305931091308594, "logits_per_token": -3.8481113433837892, "logits_per_char": -0.6522222615904727, "num_chars": 59}, {"sum_logits": -37.35151290893555, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -50.07476806640625, "logits_per_token": -3.7351512908935547, "logits_per_char": -0.5836173892021179, "num_chars": 64}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 209, "native_id": "TIMSS_2007_8_pg109", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.51982307434082, "incorrect_loss_raw": 10.399162292480469, "correct_loss_per_char": 0.5485630035400391, "incorrect_loss_per_char": 0.38134766886473964, "correct_loss_per_token": 2.879955768585205, "incorrect_loss_per_token": 2.4255268891652424, "correct_loss_uncond": -14.56104850769043, "incorrect_loss_uncond": -15.153043111165365}, "model_output": [{"sum_logits": -10.45582103729248, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -27.09762954711914, "logits_per_token": -2.091164207458496, "logits_per_char": -0.3872526310108326, "num_chars": 27}, {"sum_logits": -12.108546257019043, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -24.627079010009766, "logits_per_token": -3.0271365642547607, "logits_per_char": -0.4484646761858905, "num_chars": 27}, {"sum_logits": -11.51982307434082, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -26.08087158203125, "logits_per_token": -2.879955768585205, "logits_per_char": -0.5485630035400391, "num_chars": 21}, {"sum_logits": -8.633119583129883, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -24.931907653808594, "logits_per_token": -2.1582798957824707, "logits_per_char": -0.3083256993974958, "num_chars": 28}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 210, "native_id": "Mercury_175385", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.045379638671875, "incorrect_loss_raw": 19.87182108561198, "correct_loss_per_char": 0.5609075927734375, "incorrect_loss_per_char": 0.49800347618342905, "correct_loss_per_token": 2.8045379638671877, "incorrect_loss_per_token": 2.785582288106282, "correct_loss_uncond": -10.600887298583984, "incorrect_loss_uncond": -12.440150578816732}, "model_output": [{"sum_logits": -12.064186096191406, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -20.944547653198242, "logits_per_token": -2.4128372192382814, "logits_per_char": -0.4308637891496931, "num_chars": 28}, {"sum_logits": -19.39971160888672, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -36.50345993041992, "logits_per_token": -2.42496395111084, "logits_per_char": -0.4511560839275981, "num_chars": 43}, {"sum_logits": -28.045379638671875, "num_tokens": 10, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -38.64626693725586, "logits_per_token": -2.8045379638671877, "logits_per_char": -0.5609075927734375, "num_chars": 50}, {"sum_logits": -28.151565551757812, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -39.48790740966797, "logits_per_token": -3.5189456939697266, "logits_per_char": -0.611990555472996, "num_chars": 46}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 211, "native_id": "Mercury_410669", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.561859130859375, "incorrect_loss_raw": 14.330877304077148, "correct_loss_per_char": 1.3801549275716145, "incorrect_loss_per_char": 1.3028070276433772, "correct_loss_per_token": 2.070232391357422, "incorrect_loss_per_token": 1.7913596630096436, "correct_loss_uncond": -12.603164672851562, "incorrect_loss_uncond": -12.768758773803711}, "model_output": [{"sum_logits": -13.154556274414062, "num_tokens": 8, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -26.54244613647461, "logits_per_token": -1.6443195343017578, "logits_per_char": -1.1958687522194602, "num_chars": 11}, {"sum_logits": -14.906524658203125, "num_tokens": 8, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -26.758209228515625, "logits_per_token": -1.8633155822753906, "logits_per_char": -1.3551386052911931, "num_chars": 11}, {"sum_logits": -14.931550979614258, "num_tokens": 8, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -27.998252868652344, "logits_per_token": -1.8664438724517822, "logits_per_char": -1.357413725419478, "num_chars": 11}, {"sum_logits": -16.561859130859375, "num_tokens": 8, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -29.165023803710938, "logits_per_token": -2.070232391357422, "logits_per_char": -1.3801549275716145, "num_chars": 12}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 212, "native_id": "MEAP_2005_8_39", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 12.303913116455078, "incorrect_loss_raw": 16.928736368815105, "correct_loss_per_char": 0.47322742755596453, "incorrect_loss_per_char": 0.5857977724796715, "correct_loss_per_token": 2.050652186075846, "incorrect_loss_per_token": 2.76584882887583, "correct_loss_uncond": -16.678913116455078, "incorrect_loss_uncond": -17.759387334187824}, "model_output": [{"sum_logits": -22.783836364746094, "num_tokens": 7, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -35.48043441772461, "logits_per_token": -3.254833766392299, "logits_per_char": -0.6904192837801847, "num_chars": 33}, {"sum_logits": -16.73285484313965, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -40.667991638183594, "logits_per_token": -2.788809140523275, "logits_per_char": -0.5769949945910223, "num_chars": 29}, {"sum_logits": -12.303913116455078, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -28.982826232910156, "logits_per_token": -2.050652186075846, "logits_per_char": -0.47322742755596453, "num_chars": 26}, {"sum_logits": -11.26951789855957, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -27.915945053100586, "logits_per_token": -2.2539035797119142, "logits_per_char": -0.48997903906780743, "num_chars": 23}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 213, "native_id": "Mercury_SC_408568", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.343989372253418, "incorrect_loss_raw": 8.808138529459635, "correct_loss_per_char": 0.2748232168309829, "incorrect_loss_per_char": 0.25052084418222936, "correct_loss_per_token": 1.5573315620422363, "incorrect_loss_per_token": 1.1316055286498297, "correct_loss_uncond": -21.990426063537598, "incorrect_loss_uncond": -19.506201426188152}, "model_output": [{"sum_logits": -5.138819694519043, "num_tokens": 7, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -30.053991317749023, "logits_per_token": -0.7341170992170062, "logits_per_char": -0.16576837724254978, "num_chars": 31}, {"sum_logits": -9.343989372253418, "num_tokens": 6, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -31.334415435791016, "logits_per_token": -1.5573315620422363, "logits_per_char": -0.2748232168309829, "num_chars": 34}, {"sum_logits": -13.996362686157227, "num_tokens": 8, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -31.574520111083984, "logits_per_token": -1.7495453357696533, "logits_per_char": -0.3887878523932563, "num_chars": 36}, {"sum_logits": -7.289233207702637, "num_tokens": 8, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -23.31450843811035, "logits_per_token": -0.9111541509628296, "logits_per_char": -0.1970063029108821, "num_chars": 37}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 214, "native_id": "AKDE&ED_2008_8_7", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 41.03302764892578, "incorrect_loss_raw": 49.67198181152344, "correct_loss_per_char": 0.506580588258343, "incorrect_loss_per_char": 0.6124748549650416, "correct_loss_per_token": 2.9309305463518416, "incorrect_loss_per_token": 3.196223637414357, "correct_loss_uncond": -19.567047119140625, "incorrect_loss_uncond": -14.807692209879557}, "model_output": [{"sum_logits": -47.68398666381836, "num_tokens": 14, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -59.31637191772461, "logits_per_token": -3.4059990474155972, "logits_per_char": -0.6716054459692726, "num_chars": 71}, {"sum_logits": -41.03302764892578, "num_tokens": 14, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -60.600074768066406, "logits_per_token": -2.9309305463518416, "logits_per_char": -0.506580588258343, "num_chars": 81}, {"sum_logits": -51.55128479003906, "num_tokens": 18, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -67.15060424804688, "logits_per_token": -2.8639602661132812, "logits_per_char": -0.6064857034122243, "num_chars": 85}, {"sum_logits": -49.78067398071289, "num_tokens": 15, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -66.9720458984375, "logits_per_token": -3.318711598714193, "logits_per_char": -0.559333415513628, "num_chars": 89}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 215, "native_id": "Mercury_7082845", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.44039249420166, "incorrect_loss_raw": 16.652442296346027, "correct_loss_per_char": 0.1776156997680664, "incorrect_loss_per_char": 0.5046194635256372, "correct_loss_per_token": 0.8880784988403321, "incorrect_loss_per_token": 2.676594706944057, "correct_loss_uncond": -25.873909950256348, "incorrect_loss_uncond": -17.118080774943035}, "model_output": [{"sum_logits": -15.608329772949219, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -36.81064987182617, "logits_per_token": -2.229761396135603, "logits_per_char": -0.47297969008937024, "num_chars": 33}, {"sum_logits": -4.44039249420166, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": true, "sum_logits_uncond": -30.314302444458008, "logits_per_token": -0.8880784988403321, "logits_per_char": -0.1776156997680664, "num_chars": 25}, {"sum_logits": -15.627904891967773, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -27.595840454101562, "logits_per_token": -3.125580978393555, "logits_per_char": -0.47357287551417493, "num_chars": 33}, {"sum_logits": -18.721092224121094, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -36.90507888793945, "logits_per_token": -2.6744417463030135, "logits_per_char": -0.5673058249733665, "num_chars": 33}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 216, "native_id": "Mercury_SC_405726", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 20.762062072753906, "incorrect_loss_raw": 20.52337392171224, "correct_loss_per_char": 0.5463700545461554, "incorrect_loss_per_char": 0.6232812532207422, "correct_loss_per_token": 3.460343678792318, "incorrect_loss_per_token": 4.176620843675402, "correct_loss_uncond": -16.10816192626953, "incorrect_loss_uncond": -11.36442756652832}, "model_output": [{"sum_logits": -16.203941345214844, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -24.274599075317383, "logits_per_token": -4.050985336303711, "logits_per_char": -0.5401313781738282, "num_chars": 30}, {"sum_logits": -27.53541374206543, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -37.003684997558594, "logits_per_token": -5.507082748413086, "logits_per_char": -0.8604816794395447, "num_chars": 32}, {"sum_logits": -17.830766677856445, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -34.3851203918457, "logits_per_token": -2.9717944463094077, "logits_per_char": -0.4692307020488538, "num_chars": 38}, {"sum_logits": -20.762062072753906, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -36.87022399902344, "logits_per_token": -3.460343678792318, "logits_per_char": -0.5463700545461554, "num_chars": 38}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 217, "native_id": "Mercury_SC_415407", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 38.14814758300781, "incorrect_loss_raw": 37.30140940348307, "correct_loss_per_char": 0.8477366129557292, "incorrect_loss_per_char": 0.8543766005300016, "correct_loss_per_token": 3.8148147583007814, "incorrect_loss_per_token": 3.506240405458392, "correct_loss_uncond": -15.138404846191406, "incorrect_loss_uncond": -13.327624003092447}, "model_output": [{"sum_logits": -38.14814758300781, "num_tokens": 10, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -53.28655242919922, "logits_per_token": -3.8148147583007814, "logits_per_char": -0.8477366129557292, "num_chars": 45}, {"sum_logits": -36.656288146972656, "num_tokens": 11, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -49.288185119628906, "logits_per_token": -3.3323898315429688, "logits_per_char": -0.8524718173714572, "num_chars": 43}, {"sum_logits": -37.23088836669922, "num_tokens": 11, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -49.8087043762207, "logits_per_token": -3.3846262151544746, "logits_per_char": -0.8658346131790516, "num_chars": 43}, {"sum_logits": -38.017051696777344, "num_tokens": 10, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -52.79021072387695, "logits_per_token": -3.801705169677734, "logits_per_char": -0.8448233710394966, "num_chars": 45}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 218, "native_id": "Mercury_SC_401792", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.365632057189941, "incorrect_loss_raw": 4.835485537846883, "correct_loss_per_char": 0.7276053428649902, "incorrect_loss_per_char": 0.7780954480171204, "correct_loss_per_token": 4.365632057189941, "incorrect_loss_per_token": 4.835485537846883, "correct_loss_uncond": -7.690892219543457, "incorrect_loss_uncond": -7.264920473098755}, "model_output": [{"sum_logits": -2.8927342891693115, "num_tokens": 1, "num_tokens_all": 174, "is_greedy": false, "sum_logits_uncond": -11.566265106201172, "logits_per_token": -2.8927342891693115, "logits_per_char": -0.7231835722923279, "num_chars": 4}, {"sum_logits": -3.607753276824951, "num_tokens": 1, "num_tokens_all": 174, "is_greedy": false, "sum_logits_uncond": -12.463665008544922, "logits_per_token": -3.607753276824951, "logits_per_char": -0.7215506553649902, "num_chars": 5}, {"sum_logits": -4.365632057189941, "num_tokens": 1, "num_tokens_all": 174, "is_greedy": false, "sum_logits_uncond": -12.056524276733398, "logits_per_token": -4.365632057189941, "logits_per_char": -0.7276053428649902, "num_chars": 6}, {"sum_logits": -8.005969047546387, "num_tokens": 1, "num_tokens_all": 174, "is_greedy": false, "sum_logits_uncond": -12.27128791809082, "logits_per_token": -8.005969047546387, "logits_per_char": -0.889552116394043, "num_chars": 9}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 219, "native_id": "LEAP_2000_8_4", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.90296745300293, "incorrect_loss_raw": 13.153754552205404, "correct_loss_per_char": 0.4184991435000771, "incorrect_loss_per_char": 0.380658070177091, "correct_loss_per_token": 2.271852493286133, "incorrect_loss_per_token": 1.8445555369059246, "correct_loss_uncond": -21.633386611938477, "incorrect_loss_uncond": -18.899559656778973}, "model_output": [{"sum_logits": -14.42420768737793, "num_tokens": 6, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -35.79737091064453, "logits_per_token": -2.4040346145629883, "logits_per_char": -0.4652970221734816, "num_chars": 31}, {"sum_logits": -15.90296745300293, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -37.536354064941406, "logits_per_token": -2.271852493286133, "logits_per_char": -0.4184991435000771, "num_chars": 38}, {"sum_logits": -12.333229064941406, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -28.75289535522461, "logits_per_token": -1.5416536331176758, "logits_per_char": -0.3333305152686867, "num_chars": 37}, {"sum_logits": -12.703826904296875, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -31.609676361083984, "logits_per_token": -1.5879783630371094, "logits_per_char": -0.3433466730891047, "num_chars": 37}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 220, "native_id": "Mercury_SC_413439", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 21.172245025634766, "incorrect_loss_raw": 27.048609415690105, "correct_loss_per_char": 0.5722228385306694, "incorrect_loss_per_char": 0.6553927727094777, "correct_loss_per_token": 2.6465306282043457, "incorrect_loss_per_token": 3.5579949787684853, "correct_loss_uncond": -15.516239166259766, "incorrect_loss_uncond": -10.903828938802084}, "model_output": [{"sum_logits": -21.172245025634766, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -36.68848419189453, "logits_per_token": -2.6465306282043457, "logits_per_char": -0.5722228385306694, "num_chars": 37}, {"sum_logits": -29.72235870361328, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -41.48649978637695, "logits_per_token": -4.2460512433733255, "logits_per_char": -0.6461382326872452, "num_chars": 46}, {"sum_logits": -25.23892593383789, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -36.96843338012695, "logits_per_token": -3.1548657417297363, "logits_per_char": -0.6309731483459473, "num_chars": 40}, {"sum_logits": -26.18454360961914, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -35.402381896972656, "logits_per_token": -3.2730679512023926, "logits_per_char": -0.6890669370952406, "num_chars": 38}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 221, "native_id": "ACTAAP_2014_7_13", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.72732162475586, "incorrect_loss_raw": 17.93682352701823, "correct_loss_per_char": 0.3939404805501302, "incorrect_loss_per_char": 0.3870024901285615, "correct_loss_per_token": 1.6115746931596235, "incorrect_loss_per_token": 1.742076862219608, "correct_loss_uncond": -10.13808822631836, "incorrect_loss_uncond": -13.059823354085287}, "model_output": [{"sum_logits": -17.02981185913086, "num_tokens": 11, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -30.594501495361328, "logits_per_token": -1.5481647144664417, "logits_per_char": -0.37844026353624133, "num_chars": 45}, {"sum_logits": -17.72732162475586, "num_tokens": 11, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -27.86540985107422, "logits_per_token": -1.6115746931596235, "logits_per_char": -0.3939404805501302, "num_chars": 45}, {"sum_logits": -18.70337677001953, "num_tokens": 10, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -33.373531341552734, "logits_per_token": -1.870337677001953, "logits_per_char": -0.39794418659616027, "num_chars": 47}, {"sum_logits": -18.077281951904297, "num_tokens": 10, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -29.021907806396484, "logits_per_token": -1.8077281951904296, "logits_per_char": -0.38462302025328293, "num_chars": 47}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 222, "native_id": "Mercury_SC_402638", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.839231967926025, "incorrect_loss_raw": 10.116774876912435, "correct_loss_per_char": 0.6030178436866174, "incorrect_loss_per_char": 0.6051761745867147, "correct_loss_per_token": 2.6130773226420083, "incorrect_loss_per_token": 3.058577669991388, "correct_loss_uncond": -21.015239238739014, "incorrect_loss_uncond": -10.155019760131836}, "model_output": [{"sum_logits": -7.839231967926025, "num_tokens": 3, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -28.85447120666504, "logits_per_token": -2.6130773226420083, "logits_per_char": -0.6030178436866174, "num_chars": 13}, {"sum_logits": -8.730802536010742, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -15.951499938964844, "logits_per_token": -4.365401268005371, "logits_per_char": -0.6716001950777494, "num_chars": 13}, {"sum_logits": -7.134585380554199, "num_tokens": 6, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -18.38727569580078, "logits_per_token": -1.1890975634257, "logits_per_char": -0.41968149297377644, "num_chars": 17}, {"sum_logits": -14.484936714172363, "num_tokens": 4, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -26.476608276367188, "logits_per_token": -3.621234178543091, "logits_per_char": -0.7242468357086181, "num_chars": 20}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 223, "native_id": "Mercury_SC_406725", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.234220504760742, "incorrect_loss_raw": 17.687552134195965, "correct_loss_per_char": 0.6243930963369516, "incorrect_loss_per_char": 0.45727557265437385, "correct_loss_per_token": 2.7057034174601235, "incorrect_loss_per_token": 2.3663937591371083, "correct_loss_uncond": -13.013887405395508, "incorrect_loss_uncond": -17.042151133219402}, "model_output": [{"sum_logits": -16.234220504760742, "num_tokens": 6, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -29.24810791015625, "logits_per_token": -2.7057034174601235, "logits_per_char": -0.6243930963369516, "num_chars": 26}, {"sum_logits": -14.111867904663086, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -31.7547550201416, "logits_per_token": -2.0159811292375838, "logits_per_char": -0.3814018352611645, "num_chars": 37}, {"sum_logits": -12.00368881225586, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -27.66929817199707, "logits_per_token": -1.7148126874651228, "logits_per_char": -0.3637481458259351, "num_chars": 33}, {"sum_logits": -26.947099685668945, "num_tokens": 8, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -44.76505661010742, "logits_per_token": -3.368387460708618, "logits_per_char": -0.626676736876022, "num_chars": 43}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 224, "native_id": "NYSEDREGENTS_2015_4_29", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.9270339012145996, "incorrect_loss_raw": 6.872085889180501, "correct_loss_per_char": 0.2141148779127333, "incorrect_loss_per_char": 0.7114634574987949, "correct_loss_per_token": 1.9270339012145996, "incorrect_loss_per_token": 3.9738804499308267, "correct_loss_uncond": -12.098942279815674, "incorrect_loss_uncond": -7.495697975158691}, "model_output": [{"sum_logits": -1.9270339012145996, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -14.025976181030273, "logits_per_token": -1.9270339012145996, "logits_per_char": -0.2141148779127333, "num_chars": 9}, {"sum_logits": -5.602993011474609, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -5.602993011474609, "logits_per_char": -0.9338321685791016, "num_chars": 6}, {"sum_logits": -7.8853607177734375, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.70631217956543, "logits_per_token": -3.9426803588867188, "logits_per_char": -0.6065662090594952, "num_chars": 13}, {"sum_logits": -7.127903938293457, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -16.508045196533203, "logits_per_token": -2.3759679794311523, "logits_per_char": -0.5939919948577881, "num_chars": 12}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 225, "native_id": "Mercury_406136", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 16.55767250061035, "incorrect_loss_raw": 26.280868530273438, "correct_loss_per_char": 0.44750466217865814, "incorrect_loss_per_char": 0.7234204057449777, "correct_loss_per_token": 2.7596120834350586, "incorrect_loss_per_token": 4.131894035944863, "correct_loss_uncond": -10.932535171508789, "incorrect_loss_uncond": -9.221510569254557}, "model_output": [{"sum_logits": -16.55767250061035, "num_tokens": 6, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -27.49020767211914, "logits_per_token": -2.7596120834350586, "logits_per_char": -0.44750466217865814, "num_chars": 37}, {"sum_logits": -23.916641235351562, "num_tokens": 6, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -35.19044876098633, "logits_per_token": -3.9861068725585938, "logits_per_char": -0.6293852956671464, "num_chars": 38}, {"sum_logits": -31.279590606689453, "num_tokens": 7, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -36.568695068359375, "logits_per_token": -4.468512943812779, "logits_per_char": -0.8453943407213366, "num_chars": 37}, {"sum_logits": -23.646373748779297, "num_tokens": 6, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -34.74799346923828, "logits_per_token": -3.9410622914632163, "logits_per_char": -0.69548158084645, "num_chars": 34}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 226, "native_id": "MSA_2012_5_23", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.297748565673828, "incorrect_loss_raw": 9.107613563537598, "correct_loss_per_char": 0.2689940134684245, "incorrect_loss_per_char": 0.3479364806491807, "correct_loss_per_token": 1.1297748565673829, "incorrect_loss_per_token": 1.5194350772433811, "correct_loss_uncond": -20.165393829345703, "incorrect_loss_uncond": -17.219591458638508}, "model_output": [{"sum_logits": -8.052804946899414, "num_tokens": 4, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -20.27859115600586, "logits_per_token": -2.0132012367248535, "logits_per_char": -0.4026402473449707, "num_chars": 20}, {"sum_logits": -9.271506309509277, "num_tokens": 6, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -24.294057846069336, "logits_per_token": -1.5452510515848796, "logits_per_char": -0.40310896997866424, "num_chars": 23}, {"sum_logits": -11.297748565673828, "num_tokens": 10, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -31.46314239501953, "logits_per_token": -1.1297748565673829, "logits_per_char": -0.2689940134684245, "num_chars": 42}, {"sum_logits": -9.998529434204102, "num_tokens": 10, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -34.408966064453125, "logits_per_token": -0.9998529434204102, "logits_per_char": -0.2380602246239072, "num_chars": 42}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 227, "native_id": "Mercury_405873", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 19.968250274658203, "incorrect_loss_raw": 23.27859942118327, "correct_loss_per_char": 0.3384449199094611, "incorrect_loss_per_char": 0.453432543559741, "correct_loss_per_token": 1.9968250274658204, "incorrect_loss_per_token": 2.404395046940556, "correct_loss_uncond": -15.550891876220703, "incorrect_loss_uncond": -12.11051877339681}, "model_output": [{"sum_logits": -21.176044464111328, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -32.193050384521484, "logits_per_token": -2.117604446411133, "logits_per_char": -0.3921489715576172, "num_chars": 54}, {"sum_logits": -19.968250274658203, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -35.519142150878906, "logits_per_token": -1.9968250274658204, "logits_per_char": -0.3384449199094611, "num_chars": 59}, {"sum_logits": -20.664478302001953, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -33.94611358642578, "logits_per_token": -2.2960531446668835, "logits_per_char": -0.5166119575500489, "num_chars": 40}, {"sum_logits": -27.995275497436523, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -40.02819061279297, "logits_per_token": -2.7995275497436523, "logits_per_char": -0.45153670157155684, "num_chars": 62}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 228, "native_id": "Mercury_7043820", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.91288423538208, "incorrect_loss_raw": 8.976356188456217, "correct_loss_per_char": 0.4548372488755446, "incorrect_loss_per_char": 0.7168232997258505, "correct_loss_per_token": 1.47822105884552, "incorrect_loss_per_token": 3.678274008962843, "correct_loss_uncond": -13.747073650360107, "incorrect_loss_uncond": -9.047186215718588}, "model_output": [{"sum_logits": -4.476176738739014, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -19.775543212890625, "logits_per_token": -1.1190441846847534, "logits_per_char": -0.37301472822825116, "num_chars": 12}, {"sum_logits": -5.91288423538208, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -19.659957885742188, "logits_per_token": -1.47822105884552, "logits_per_char": -0.4548372488755446, "num_chars": 13}, {"sum_logits": -14.588883399963379, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -17.047714233398438, "logits_per_token": -7.2944416999816895, "logits_per_char": -1.2157402833302815, "num_chars": 12}, {"sum_logits": -7.86400842666626, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.24736976623535, "logits_per_token": -2.6213361422220864, "logits_per_char": -0.5617148876190186, "num_chars": 14}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 229, "native_id": "MCAS_2005_5_34", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.715591430664062, "incorrect_loss_raw": 31.651744842529297, "correct_loss_per_char": 0.5065330736564867, "incorrect_loss_per_char": 0.8342811835733898, "correct_loss_per_token": 3.3431182861328126, "incorrect_loss_per_token": 4.13498095759639, "correct_loss_uncond": -13.745027542114258, "incorrect_loss_uncond": -6.046169281005859}, "model_output": [{"sum_logits": -26.283763885498047, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -30.250717163085938, "logits_per_token": -5.256752777099609, "logits_per_char": -1.0109139955960786, "num_chars": 26}, {"sum_logits": -16.715591430664062, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -30.46061897277832, "logits_per_token": -3.3431182861328126, "logits_per_char": -0.5065330736564867, "num_chars": 33}, {"sum_logits": -25.293872833251953, "num_tokens": 9, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -33.56517791748047, "logits_per_token": -2.8104303148057728, "logits_per_char": -0.5882296007733012, "num_chars": 43}, {"sum_logits": -43.37759780883789, "num_tokens": 10, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -49.27784729003906, "logits_per_token": -4.337759780883789, "logits_per_char": -0.9036999543507894, "num_chars": 48}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 230, "native_id": "Mercury_7182245", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 33.20711898803711, "incorrect_loss_raw": 26.933701197306316, "correct_loss_per_char": 0.5355986933554372, "incorrect_loss_per_char": 0.4612496489752555, "correct_loss_per_token": 2.554393768310547, "incorrect_loss_per_token": 2.2429791174846376, "correct_loss_uncond": -16.968647003173828, "incorrect_loss_uncond": -15.246407190958658}, "model_output": [{"sum_logits": -27.86301612854004, "num_tokens": 12, "num_tokens_all": 255, "is_greedy": false, "sum_logits_uncond": -45.40806579589844, "logits_per_token": -2.32191801071167, "logits_per_char": -0.472254510653221, "num_chars": 59}, {"sum_logits": -23.942401885986328, "num_tokens": 11, "num_tokens_all": 254, "is_greedy": false, "sum_logits_uncond": -33.2718620300293, "logits_per_token": -2.1765819896351206, "logits_per_char": -0.42004213835063736, "num_chars": 57}, {"sum_logits": -28.995685577392578, "num_tokens": 13, "num_tokens_all": 256, "is_greedy": false, "sum_logits_uncond": -47.86039733886719, "logits_per_token": -2.2304373521071215, "logits_per_char": -0.4914522979219081, "num_chars": 59}, {"sum_logits": -33.20711898803711, "num_tokens": 13, "num_tokens_all": 256, "is_greedy": false, "sum_logits_uncond": -50.17576599121094, "logits_per_token": -2.554393768310547, "logits_per_char": -0.5355986933554372, "num_chars": 62}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 231, "native_id": "MSA_2012_8_30", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.435481071472168, "incorrect_loss_raw": 7.626540501912435, "correct_loss_per_char": 0.8796523901132437, "incorrect_loss_per_char": 0.4966655584870192, "correct_loss_per_token": 5.717740535736084, "incorrect_loss_per_token": 3.8132702509562173, "correct_loss_uncond": -8.085530281066895, "incorrect_loss_uncond": -9.910298665364584}, "model_output": [{"sum_logits": -11.435481071472168, "num_tokens": 2, "num_tokens_all": 333, "is_greedy": false, "sum_logits_uncond": -19.521011352539062, "logits_per_token": -5.717740535736084, "logits_per_char": -0.8796523901132437, "num_chars": 13}, {"sum_logits": -6.146093845367432, "num_tokens": 2, "num_tokens_all": 333, "is_greedy": false, "sum_logits_uncond": -16.6238956451416, "logits_per_token": -3.073046922683716, "logits_per_char": -0.4390067032405308, "num_chars": 14}, {"sum_logits": -10.921459197998047, "num_tokens": 2, "num_tokens_all": 333, "is_greedy": false, "sum_logits_uncond": -17.760547637939453, "logits_per_token": -5.460729598999023, "logits_per_char": -0.7280972798665365, "num_chars": 15}, {"sum_logits": -5.812068462371826, "num_tokens": 2, "num_tokens_all": 333, "is_greedy": false, "sum_logits_uncond": -18.22607421875, "logits_per_token": -2.906034231185913, "logits_per_char": -0.3228926923539903, "num_chars": 18}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 232, "native_id": "Mercury_7252753", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.874772548675537, "incorrect_loss_raw": 3.7279157638549805, "correct_loss_per_char": 0.4874772548675537, "incorrect_loss_per_char": 0.6297056720370339, "correct_loss_per_token": 2.4373862743377686, "incorrect_loss_per_token": 3.7279157638549805, "correct_loss_uncond": -9.673412799835205, "incorrect_loss_uncond": -8.270862261454264}, "model_output": [{"sum_logits": -4.874772548675537, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -14.548185348510742, "logits_per_token": -2.4373862743377686, "logits_per_char": -0.4874772548675537, "num_chars": 10}, {"sum_logits": -4.307025909423828, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -12.376590728759766, "logits_per_token": -4.307025909423828, "logits_per_char": -0.6152894156319755, "num_chars": 7}, {"sum_logits": -3.0455002784729004, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -11.89634895324707, "logits_per_token": -3.0455002784729004, "logits_per_char": -0.5075833797454834, "num_chars": 6}, {"sum_logits": -3.831221103668213, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -11.723394393920898, "logits_per_token": -3.831221103668213, "logits_per_char": -0.7662442207336426, "num_chars": 5}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 233, "native_id": "TAKS_2009_8_36", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.022950172424316, "incorrect_loss_raw": 2.5822527408599854, "correct_loss_per_char": 2.011475086212158, "incorrect_loss_per_char": 1.2911263704299927, "correct_loss_per_token": 4.022950172424316, "incorrect_loss_per_token": 2.5822527408599854, "correct_loss_uncond": -1.5281620025634766, "incorrect_loss_uncond": -3.226480404535929}, "model_output": [{"sum_logits": -2.059453248977661, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": true, "sum_logits_uncond": -5.410497665405273, "logits_per_token": -2.059453248977661, "logits_per_char": -1.0297266244888306, "num_chars": 2}, {"sum_logits": -4.022950172424316, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -5.551112174987793, "logits_per_token": -4.022950172424316, "logits_per_char": -2.011475086212158, "num_chars": 2}, {"sum_logits": -2.383857011795044, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -5.77780818939209, "logits_per_token": -2.383857011795044, "logits_per_char": -1.191928505897522, "num_chars": 2}, {"sum_logits": -3.303447961807251, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -6.237893581390381, "logits_per_token": -3.303447961807251, "logits_per_char": -1.6517239809036255, "num_chars": 2}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 234, "native_id": "Mercury_SC_415473", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 3.1218347549438477, "incorrect_loss_raw": 4.4568125406901045, "correct_loss_per_char": 1.5609173774719238, "incorrect_loss_per_char": 1.980077001783583, "correct_loss_per_token": 3.1218347549438477, "incorrect_loss_per_token": 4.4568125406901045, "correct_loss_uncond": -3.345059394836426, "incorrect_loss_uncond": -1.8603355089823406}, "model_output": [{"sum_logits": -4.576205253601074, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -5.410497665405273, "logits_per_token": -4.576205253601074, "logits_per_char": -2.288102626800537, "num_chars": 2}, {"sum_logits": -4.324305534362793, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -5.77780818939209, "logits_per_token": -4.324305534362793, "logits_per_char": -2.1621527671813965, "num_chars": 2}, {"sum_logits": -3.1218347549438477, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -6.466894149780273, "logits_per_token": -3.1218347549438477, "logits_per_char": -1.5609173774719238, "num_chars": 2}, {"sum_logits": -4.469926834106445, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -7.763138294219971, "logits_per_token": -4.469926834106445, "logits_per_char": -1.4899756113688152, "num_chars": 3}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 235, "native_id": "Mercury_SC_413624", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 25.42254638671875, "incorrect_loss_raw": 25.403507232666016, "correct_loss_per_char": 0.6052987234933036, "incorrect_loss_per_char": 0.6892451686776563, "correct_loss_per_token": 2.8247273763020835, "incorrect_loss_per_token": 3.8003896872202554, "correct_loss_uncond": -12.29861068725586, "incorrect_loss_uncond": -12.987419128417969}, "model_output": [{"sum_logits": -22.482227325439453, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -35.42233657836914, "logits_per_token": -3.747037887573242, "logits_per_char": -0.6812796159224077, "num_chars": 33}, {"sum_logits": -25.42254638671875, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -37.72115707397461, "logits_per_token": -2.8247273763020835, "logits_per_char": -0.6052987234933036, "num_chars": 42}, {"sum_logits": -31.21402931213379, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -42.90385437011719, "logits_per_token": -3.9017536640167236, "logits_per_char": -0.7431911740984235, "num_chars": 42}, {"sum_logits": -22.514265060424805, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -36.846588134765625, "logits_per_token": -3.752377510070801, "logits_per_char": -0.6432647160121373, "num_chars": 35}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 236, "native_id": "Mercury_7016800", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 19.89275550842285, "incorrect_loss_raw": 18.694097836812336, "correct_loss_per_char": 0.4324512067048446, "incorrect_loss_per_char": 0.48085625257577985, "correct_loss_per_token": 2.210306167602539, "incorrect_loss_per_token": 2.1776227244624384, "correct_loss_uncond": -21.824769973754883, "incorrect_loss_uncond": -15.14394474029541}, "model_output": [{"sum_logits": -19.985633850097656, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -33.815303802490234, "logits_per_token": -2.220625983344184, "logits_per_char": -0.5401522662188556, "num_chars": 37}, {"sum_logits": -21.708160400390625, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -39.11554718017578, "logits_per_token": -2.713520050048828, "logits_per_char": -0.5427040100097656, "num_chars": 40}, {"sum_logits": -14.38849925994873, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -28.583276748657227, "logits_per_token": -1.5987221399943035, "logits_per_char": -0.35971248149871826, "num_chars": 40}, {"sum_logits": -19.89275550842285, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -41.717525482177734, "logits_per_token": -2.210306167602539, "logits_per_char": -0.4324512067048446, "num_chars": 46}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 237, "native_id": "Mercury_SC_407228", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.234045028686523, "incorrect_loss_raw": 14.45344066619873, "correct_loss_per_char": 0.553608726052677, "incorrect_loss_per_char": 0.5003448019272242, "correct_loss_per_token": 3.5292556285858154, "incorrect_loss_per_token": 3.1087726540035674, "correct_loss_uncond": -11.109640121459961, "incorrect_loss_uncond": -13.033689181009928}, "model_output": [{"sum_logits": -12.68692684173584, "num_tokens": 3, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -24.75046730041504, "logits_per_token": -4.228975613911946, "logits_per_char": -0.5074770736694336, "num_chars": 25}, {"sum_logits": -16.842239379882812, "num_tokens": 5, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -31.582014083862305, "logits_per_token": -3.3684478759765626, "logits_per_char": -0.6477784376878005, "num_chars": 26}, {"sum_logits": -13.831155776977539, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -26.128908157348633, "logits_per_token": -1.7288944721221924, "logits_per_char": -0.3457788944244385, "num_chars": 40}, {"sum_logits": -28.234045028686523, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -39.343685150146484, "logits_per_token": -3.5292556285858154, "logits_per_char": -0.553608726052677, "num_chars": 51}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 238, "native_id": "Mercury_414504", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.666301727294922, "incorrect_loss_raw": 29.180647532145183, "correct_loss_per_char": 0.5347146193186442, "incorrect_loss_per_char": 0.618752531598989, "correct_loss_per_token": 2.1388584772745767, "incorrect_loss_per_token": 2.967018507259749, "correct_loss_uncond": -23.045886993408203, "incorrect_loss_uncond": -20.324073791503906}, "model_output": [{"sum_logits": -40.13905334472656, "num_tokens": 11, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -61.49020004272461, "logits_per_token": -3.6490048495205967, "logits_per_char": -0.7298009699041194, "num_chars": 55}, {"sum_logits": -25.666301727294922, "num_tokens": 12, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -48.712188720703125, "logits_per_token": -2.1388584772745767, "logits_per_char": -0.5347146193186442, "num_chars": 48}, {"sum_logits": -29.255970001220703, "num_tokens": 11, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -45.324432373046875, "logits_per_token": -2.6596336364746094, "logits_per_char": -0.635999347852624, "num_chars": 46}, {"sum_logits": -18.14691925048828, "num_tokens": 7, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -41.69953155517578, "logits_per_token": -2.5924170357840404, "logits_per_char": -0.4904572770402238, "num_chars": 37}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 239, "native_id": "TIMSS_2011_4_pg27", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 3.33385968208313, "incorrect_loss_raw": 4.774671713511149, "correct_loss_per_char": 0.1852144267823961, "incorrect_loss_per_char": 0.26525953963950827, "correct_loss_per_token": 0.666771936416626, "incorrect_loss_per_token": 0.9549343427022299, "correct_loss_uncond": -18.58739924430847, "incorrect_loss_uncond": -19.459559599558514}, "model_output": [{"sum_logits": -3.33385968208313, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -21.9212589263916, "logits_per_token": -0.666771936416626, "logits_per_char": -0.1852144267823961, "num_chars": 18}, {"sum_logits": -3.9439802169799805, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -24.881118774414062, "logits_per_token": -0.7887960433959961, "logits_per_char": -0.21911001205444336, "num_chars": 18}, {"sum_logits": -5.151782035827637, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -24.2700138092041, "logits_per_token": -1.0303564071655273, "logits_per_char": -0.2862101131015354, "num_chars": 18}, {"sum_logits": -5.22825288772583, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -23.55156135559082, "logits_per_token": -1.045650577545166, "logits_per_char": -0.2904584937625461, "num_chars": 18}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 240, "native_id": "Mercury_SC_402029", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 17.387039184570312, "incorrect_loss_raw": 15.292510032653809, "correct_loss_per_char": 0.42407412645293446, "incorrect_loss_per_char": 0.8344103132772167, "correct_loss_per_token": 1.9318932427300348, "incorrect_loss_per_token": 3.900920825534397, "correct_loss_uncond": -15.958480834960938, "incorrect_loss_uncond": -8.358180046081543}, "model_output": [{"sum_logits": -15.568090438842773, "num_tokens": 5, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -25.129947662353516, "logits_per_token": -3.113618087768555, "logits_per_char": -0.819373180991725, "num_chars": 19}, {"sum_logits": -18.168025970458984, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -26.980133056640625, "logits_per_token": -4.542006492614746, "logits_per_char": -1.0093347761366103, "num_chars": 18}, {"sum_logits": -12.141413688659668, "num_tokens": 3, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -18.841989517211914, "logits_per_token": -4.04713789621989, "logits_per_char": -0.6745229827033149, "num_chars": 18}, {"sum_logits": -17.387039184570312, "num_tokens": 9, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -33.34552001953125, "logits_per_token": -1.9318932427300348, "logits_per_char": -0.42407412645293446, "num_chars": 41}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 241, "native_id": "Mercury_7131845", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.696340560913086, "incorrect_loss_raw": 19.73593266805013, "correct_loss_per_char": 0.6381276635562673, "incorrect_loss_per_char": 0.586267898029306, "correct_loss_per_token": 3.616056760152181, "incorrect_loss_per_token": 3.11287896595304, "correct_loss_uncond": -8.043298721313477, "incorrect_loss_uncond": -10.967840194702148}, "model_output": [{"sum_logits": -18.09550666809082, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -28.529226303100586, "logits_per_token": -3.0159177780151367, "logits_per_char": -0.5654845833778381, "num_chars": 32}, {"sum_logits": -22.231836318969727, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -30.878032684326172, "logits_per_token": -3.1759766169956754, "logits_per_char": -0.6538775387932273, "num_chars": 34}, {"sum_logits": -21.696340560913086, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -29.739639282226562, "logits_per_token": -3.616056760152181, "logits_per_char": -0.6381276635562673, "num_chars": 34}, {"sum_logits": -18.880455017089844, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -32.70405960083008, "logits_per_token": -3.146742502848307, "logits_per_char": -0.5394415719168527, "num_chars": 35}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 242, "native_id": "Mercury_SC_405533", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.488992691040039, "incorrect_loss_raw": 17.667311986287434, "correct_loss_per_char": 0.4840310215950012, "incorrect_loss_per_char": 0.4376880938979401, "correct_loss_per_token": 2.5814987818400064, "incorrect_loss_per_token": 2.2762564557971374, "correct_loss_uncond": -12.26626205444336, "incorrect_loss_uncond": -16.227778752644856}, "model_output": [{"sum_logits": -12.999065399169922, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -28.226009368896484, "logits_per_token": -2.5998130798339845, "logits_per_char": -0.44824363445413523, "num_chars": 29}, {"sum_logits": -15.488992691040039, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -27.7552547454834, "logits_per_token": -2.5814987818400064, "logits_per_char": -0.4840310215950012, "num_chars": 32}, {"sum_logits": -22.6278076171875, "num_tokens": 11, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -39.45140838623047, "logits_per_token": -2.0570734197443183, "logits_per_char": -0.5028401692708333, "num_chars": 45}, {"sum_logits": -17.375062942504883, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -34.00785446166992, "logits_per_token": -2.1718828678131104, "logits_per_char": -0.36198047796885174, "num_chars": 48}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 243, "native_id": "Mercury_7086748", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.165712356567383, "incorrect_loss_raw": 10.478276252746582, "correct_loss_per_char": 0.7961902618408203, "incorrect_loss_per_char": 0.849152866591755, "correct_loss_per_token": 3.5828561782836914, "incorrect_loss_per_token": 5.239138126373291, "correct_loss_uncond": -11.596904754638672, "incorrect_loss_uncond": -6.815045992533366}, "model_output": [{"sum_logits": -7.165712356567383, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -18.762617111206055, "logits_per_token": -3.5828561782836914, "logits_per_char": -0.7961902618408203, "num_chars": 9}, {"sum_logits": -8.741926193237305, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -14.336740493774414, "logits_per_token": -4.370963096618652, "logits_per_char": -0.728493849436442, "num_chars": 12}, {"sum_logits": -11.443670272827148, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -19.507673263549805, "logits_per_token": -5.721835136413574, "logits_per_char": -0.9536391894022623, "num_chars": 12}, {"sum_logits": -11.249232292175293, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -18.035552978515625, "logits_per_token": -5.6246161460876465, "logits_per_char": -0.865325560936561, "num_chars": 13}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 244, "native_id": "MDSA_2007_8_17", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.091751098632812, "incorrect_loss_raw": 16.18642520904541, "correct_loss_per_char": 0.19822542784643954, "incorrect_loss_per_char": 0.2942456649792347, "correct_loss_per_token": 0.8636965070452008, "incorrect_loss_per_token": 1.4710415305283966, "correct_loss_uncond": -28.581802368164062, "incorrect_loss_uncond": -25.041254997253418}, "model_output": [{"sum_logits": -25.284297943115234, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -41.93368911743164, "logits_per_token": -2.8093664381239147, "logits_per_char": -0.5160060804717395, "num_chars": 49}, {"sum_logits": -12.091751098632812, "num_tokens": 14, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -40.673553466796875, "logits_per_token": -0.8636965070452008, "logits_per_char": -0.19822542784643954, "num_chars": 61}, {"sum_logits": -12.335453033447266, "num_tokens": 15, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -41.97407913208008, "logits_per_token": -0.822363535563151, "logits_per_char": -0.19580084180075025, "num_chars": 63}, {"sum_logits": -10.93952465057373, "num_tokens": 14, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -39.775272369384766, "logits_per_token": -0.7813946178981236, "logits_per_char": -0.17093007266521454, "num_chars": 64}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 245, "native_id": "Mercury_7210473", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 11.816377639770508, "incorrect_loss_raw": 11.05610434214274, "correct_loss_per_char": 0.7385236024856567, "incorrect_loss_per_char": 0.6778397080945034, "correct_loss_per_token": 5.908188819885254, "incorrect_loss_per_token": 5.52805217107137, "correct_loss_uncond": -8.29609489440918, "incorrect_loss_uncond": -7.555434226989746}, "model_output": [{"sum_logits": -11.260743141174316, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -18.667686462402344, "logits_per_token": -5.630371570587158, "logits_per_char": -0.7037964463233948, "num_chars": 16}, {"sum_logits": -10.744119644165039, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -18.157917022705078, "logits_per_token": -5.3720598220825195, "logits_per_char": -0.6320070378920611, "num_chars": 17}, {"sum_logits": -11.816377639770508, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -20.112472534179688, "logits_per_token": -5.908188819885254, "logits_per_char": -0.7385236024856567, "num_chars": 16}, {"sum_logits": -11.163450241088867, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -19.00901222229004, "logits_per_token": -5.581725120544434, "logits_per_char": -0.6977156400680542, "num_chars": 16}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 246, "native_id": "Mercury_7214340", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.609471321105957, "incorrect_loss_raw": 5.155653079350789, "correct_loss_per_char": 0.4609471321105957, "incorrect_loss_per_char": 0.5048552394314647, "correct_loss_per_token": 4.609471321105957, "incorrect_loss_per_token": 5.155653079350789, "correct_loss_uncond": -9.464098930358887, "incorrect_loss_uncond": -8.591827313105265}, "model_output": [{"sum_logits": -4.609471321105957, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -14.073570251464844, "logits_per_token": -4.609471321105957, "logits_per_char": -0.4609471321105957, "num_chars": 10}, {"sum_logits": -6.4333672523498535, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -12.953004837036133, "logits_per_token": -6.4333672523498535, "logits_per_char": -0.5848515683954413, "num_chars": 11}, {"sum_logits": -2.37194561958313, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -13.1925048828125, "logits_per_token": -2.37194561958313, "logits_per_char": -0.2635495132870144, "num_chars": 9}, {"sum_logits": -6.661646366119385, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -15.096931457519531, "logits_per_token": -6.661646366119385, "logits_per_char": -0.6661646366119385, "num_chars": 10}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 247, "native_id": "MCAS_2005_9_17", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.1337554454803467, "incorrect_loss_raw": 4.77424685160319, "correct_loss_per_char": 0.08721195734464206, "incorrect_loss_per_char": 0.7079813805493441, "correct_loss_per_token": 1.1337554454803467, "incorrect_loss_per_token": 4.77424685160319, "correct_loss_uncond": -15.050801515579224, "incorrect_loss_uncond": -8.64138094584147}, "model_output": [{"sum_logits": -4.091157913208008, "num_tokens": 1, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -12.978740692138672, "logits_per_token": -4.091157913208008, "logits_per_char": -0.2556973695755005, "num_chars": 16}, {"sum_logits": -8.599276542663574, "num_tokens": 1, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -11.723394393920898, "logits_per_token": -8.599276542663574, "logits_per_char": -1.7198553085327148, "num_chars": 5}, {"sum_logits": -1.1337554454803467, "num_tokens": 1, "num_tokens_all": 218, "is_greedy": true, "sum_logits_uncond": -16.18455696105957, "logits_per_token": -1.1337554454803467, "logits_per_char": -0.08721195734464206, "num_chars": 13}, {"sum_logits": -1.6323060989379883, "num_tokens": 1, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -15.544748306274414, "logits_per_token": -1.6323060989379883, "logits_per_char": -0.1483914635398171, "num_chars": 11}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 248, "native_id": "MEA_2016_8_12", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.418556213378906, "incorrect_loss_raw": 7.70932149887085, "correct_loss_per_char": 0.8014274010291467, "incorrect_loss_per_char": 0.6398042634681419, "correct_loss_per_token": 5.209278106689453, "incorrect_loss_per_token": 3.854660749435425, "correct_loss_uncond": -6.7380828857421875, "incorrect_loss_uncond": -8.6586963335673}, "model_output": [{"sum_logits": -6.323904037475586, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -16.639188766479492, "logits_per_token": -3.161952018737793, "logits_per_char": -0.351328002081977, "num_chars": 18}, {"sum_logits": -10.418556213378906, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -17.156639099121094, "logits_per_token": -5.209278106689453, "logits_per_char": -0.8014274010291467, "num_chars": 13}, {"sum_logits": -6.7392754554748535, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -16.062490463256836, "logits_per_token": -3.3696377277374268, "logits_per_char": -0.5616062879562378, "num_chars": 12}, {"sum_logits": -10.06478500366211, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -16.402374267578125, "logits_per_token": -5.032392501831055, "logits_per_char": -1.006478500366211, "num_chars": 10}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 249, "native_id": "Mercury_SC_401278", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.410508155822754, "incorrect_loss_raw": 12.976037979125977, "correct_loss_per_char": 0.7131567597389221, "incorrect_loss_per_char": 0.8329692981861255, "correct_loss_per_token": 3.8035027186075845, "incorrect_loss_per_token": 4.325345993041992, "correct_loss_uncond": -5.708207130432129, "incorrect_loss_uncond": -6.89370854695638}, "model_output": [{"sum_logits": -7.5753936767578125, "num_tokens": 3, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -18.37681770324707, "logits_per_token": -2.5251312255859375, "logits_per_char": -0.5410995483398438, "num_chars": 14}, {"sum_logits": -11.410508155822754, "num_tokens": 3, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -17.118715286254883, "logits_per_token": -3.8035027186075845, "logits_per_char": -0.7131567597389221, "num_chars": 16}, {"sum_logits": -11.913570404052734, "num_tokens": 3, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -17.6166934967041, "logits_per_token": -3.9711901346842446, "logits_per_char": -0.6618650224473741, "num_chars": 18}, {"sum_logits": -19.439149856567383, "num_tokens": 3, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -23.6157283782959, "logits_per_token": -6.479716618855794, "logits_per_char": -1.2959433237711588, "num_chars": 15}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 250, "native_id": "Mercury_SC_407689", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.913349628448486, "incorrect_loss_raw": 6.599944909413655, "correct_loss_per_char": 1.1826699256896973, "incorrect_loss_per_char": 0.6737378853740115, "correct_loss_per_token": 5.913349628448486, "incorrect_loss_per_token": 4.545587380727132, "correct_loss_uncond": -6.888720989227295, "incorrect_loss_uncond": -7.163328329722087}, "model_output": [{"sum_logits": -6.53108024597168, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -12.92275619506836, "logits_per_token": -6.53108024597168, "logits_per_char": -0.5937345678156073, "num_chars": 11}, {"sum_logits": -9.244608879089355, "num_tokens": 3, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -15.605838775634766, "logits_per_token": -3.081536293029785, "logits_per_char": -0.9244608879089355, "num_chars": 10}, {"sum_logits": -4.024145603179932, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -12.761224746704102, "logits_per_token": -4.024145603179932, "logits_per_char": -0.5030182003974915, "num_chars": 8}, {"sum_logits": -5.913349628448486, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -12.802070617675781, "logits_per_token": -5.913349628448486, "logits_per_char": -1.1826699256896973, "num_chars": 5}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 251, "native_id": "Mercury_7230405", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.963314056396484, "incorrect_loss_raw": 27.434979756673176, "correct_loss_per_char": 0.9156476126776801, "incorrect_loss_per_char": 0.6024268673256699, "correct_loss_per_token": 4.709044865199497, "incorrect_loss_per_token": 3.397280190987562, "correct_loss_uncond": -14.523242950439453, "incorrect_loss_uncond": -15.199605305989584}, "model_output": [{"sum_logits": -26.505741119384766, "num_tokens": 8, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -41.95624542236328, "logits_per_token": -3.3132176399230957, "logits_per_char": -0.6464814907167016, "num_chars": 41}, {"sum_logits": -32.963314056396484, "num_tokens": 7, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -47.48655700683594, "logits_per_token": -4.709044865199497, "logits_per_char": -0.9156476126776801, "num_chars": 36}, {"sum_logits": -21.37942886352539, "num_tokens": 7, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -37.52967834472656, "logits_per_token": -3.05420412336077, "logits_per_char": -0.39591534932454425, "num_chars": 54}, {"sum_logits": -34.419769287109375, "num_tokens": 9, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -48.41783142089844, "logits_per_token": -3.8244188096788196, "logits_per_char": -0.7648837619357639, "num_chars": 45}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 252, "native_id": "Mercury_SC_405640", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.776290893554688, "incorrect_loss_raw": 14.196840286254883, "correct_loss_per_char": 0.5431272718641493, "incorrect_loss_per_char": 0.6276401562665506, "correct_loss_per_token": 1.9552581787109375, "incorrect_loss_per_token": 2.8393680572509763, "correct_loss_uncond": -17.938644409179688, "incorrect_loss_uncond": -13.0187136332194}, "model_output": [{"sum_logits": -9.776290893554688, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -27.714935302734375, "logits_per_token": -1.9552581787109375, "logits_per_char": -0.5431272718641493, "num_chars": 18}, {"sum_logits": -12.460291862487793, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -24.208890914916992, "logits_per_token": -2.4920583724975587, "logits_per_char": -0.6230145931243897, "num_chars": 20}, {"sum_logits": -13.605303764343262, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -27.013534545898438, "logits_per_token": -2.7210607528686523, "logits_per_char": -0.6478716078258696, "num_chars": 21}, {"sum_logits": -16.524925231933594, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -30.424236297607422, "logits_per_token": -3.3049850463867188, "logits_per_char": -0.6120342678493924, "num_chars": 27}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 253, "native_id": "Mercury_7201775", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.8844847679138184, "incorrect_loss_raw": 6.8301981290181475, "correct_loss_per_char": 0.4855605959892273, "incorrect_loss_per_char": 0.5733320664136837, "correct_loss_per_token": 3.8844847679138184, "incorrect_loss_per_token": 5.968787511189778, "correct_loss_uncond": -8.80226182937622, "incorrect_loss_uncond": -9.014098008473715}, "model_output": [{"sum_logits": -9.090943336486816, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -14.670759201049805, "logits_per_token": -9.090943336486816, "logits_per_char": -0.6993033335759089, "num_chars": 13}, {"sum_logits": -6.231187343597412, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -16.407533645629883, "logits_per_token": -6.231187343597412, "logits_per_char": -0.6231187343597412, "num_chars": 10}, {"sum_logits": -5.168463706970215, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -16.4545955657959, "logits_per_token": -2.5842318534851074, "logits_per_char": -0.39757413130540115, "num_chars": 13}, {"sum_logits": -3.8844847679138184, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -12.686746597290039, "logits_per_token": -3.8844847679138184, "logits_per_char": -0.4855605959892273, "num_chars": 8}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 254, "native_id": "Mercury_7177398", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.622888565063477, "incorrect_loss_raw": 20.07775115966797, "correct_loss_per_char": 0.6975125343568863, "incorrect_loss_per_char": 0.5712369983065061, "correct_loss_per_token": 2.7028610706329346, "incorrect_loss_per_token": 2.6223675977616083, "correct_loss_uncond": -8.667409896850586, "incorrect_loss_uncond": -9.017616907755533}, "model_output": [{"sum_logits": -25.323530197143555, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -33.12535858154297, "logits_per_token": -3.1654412746429443, "logits_per_char": -0.6330882549285889, "num_chars": 40}, {"sum_logits": -15.9847412109375, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -27.232275009155273, "logits_per_token": -1.9980926513671875, "logits_per_char": -0.4701394473805147, "num_chars": 34}, {"sum_logits": -18.92498207092285, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -26.928470611572266, "logits_per_token": -2.703568867274693, "logits_per_char": -0.6104832926104146, "num_chars": 31}, {"sum_logits": -21.622888565063477, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -30.290298461914062, "logits_per_token": -2.7028610706329346, "logits_per_char": -0.6975125343568863, "num_chars": 31}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 255, "native_id": "Mercury_7041423", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.442688941955566, "incorrect_loss_raw": 11.794963836669922, "correct_loss_per_char": 0.7319228789385628, "incorrect_loss_per_char": 0.6544632980246949, "correct_loss_per_token": 6.221344470977783, "incorrect_loss_per_token": 4.451147079467774, "correct_loss_uncond": -10.690032005310059, "incorrect_loss_uncond": -10.318398157755533}, "model_output": [{"sum_logits": -12.442688941955566, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -23.132720947265625, "logits_per_token": -6.221344470977783, "logits_per_char": -0.7319228789385628, "num_chars": 17}, {"sum_logits": -16.115554809570312, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -24.065841674804688, "logits_per_token": -5.3718516031901045, "logits_per_char": -0.9479738123276654, "num_chars": 17}, {"sum_logits": -9.35086441040039, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -18.450687408447266, "logits_per_token": -4.675432205200195, "logits_per_char": -0.5194924672444662, "num_chars": 18}, {"sum_logits": -9.918472290039062, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -23.823556900024414, "logits_per_token": -3.306157430013021, "logits_per_char": -0.4959236145019531, "num_chars": 20}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 256, "native_id": "Mercury_7004743", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 4.586994171142578, "incorrect_loss_raw": 7.060503005981445, "correct_loss_per_char": 0.352845705472506, "incorrect_loss_per_char": 0.6587602326364229, "correct_loss_per_token": 1.5289980570475261, "incorrect_loss_per_token": 2.353501001993815, "correct_loss_uncond": -11.248590469360352, "incorrect_loss_uncond": -10.327827453613281}, "model_output": [{"sum_logits": -5.575786590576172, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -16.865407943725586, "logits_per_token": -1.8585955301920574, "logits_per_char": -0.5575786590576172, "num_chars": 10}, {"sum_logits": -6.402153015136719, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -17.55327606201172, "logits_per_token": -2.1340510050455728, "logits_per_char": -0.5820139104669745, "num_chars": 11}, {"sum_logits": -9.203569412231445, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -17.746307373046875, "logits_per_token": -3.067856470743815, "logits_per_char": -0.8366881283846769, "num_chars": 11}, {"sum_logits": -4.586994171142578, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -15.83558464050293, "logits_per_token": -1.5289980570475261, "logits_per_char": -0.352845705472506, "num_chars": 13}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 257, "native_id": "Mercury_7198468", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 31.300148010253906, "incorrect_loss_raw": 27.10522524515788, "correct_loss_per_char": 0.4968277461945064, "incorrect_loss_per_char": 0.5135167848995255, "correct_loss_per_token": 3.1300148010253905, "incorrect_loss_per_token": 3.2921216752794056, "correct_loss_uncond": -14.242019653320312, "incorrect_loss_uncond": -10.286677678426107}, "model_output": [{"sum_logits": -31.300148010253906, "num_tokens": 10, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -45.54216766357422, "logits_per_token": -3.1300148010253905, "logits_per_char": -0.4968277461945064, "num_chars": 63}, {"sum_logits": -34.6783447265625, "num_tokens": 8, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -42.18860626220703, "logits_per_token": -4.3347930908203125, "logits_per_char": -0.6192561558314732, "num_chars": 56}, {"sum_logits": -25.89453125, "num_tokens": 8, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -33.398284912109375, "logits_per_token": -3.23681640625, "logits_per_char": -0.4979717548076923, "num_chars": 52}, {"sum_logits": -20.742799758911133, "num_tokens": 9, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -36.58881759643555, "logits_per_token": -2.304755528767904, "logits_per_char": -0.4233224440594109, "num_chars": 49}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 258, "native_id": "MEA_2014_5_11", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 36.54867172241211, "incorrect_loss_raw": 22.195308049519856, "correct_loss_per_char": 0.6768272541187428, "incorrect_loss_per_char": 0.4853890422128031, "correct_loss_per_token": 4.568583965301514, "incorrect_loss_per_token": 2.559514872233073, "correct_loss_uncond": -6.526710510253906, "incorrect_loss_uncond": -10.895825068155924}, "model_output": [{"sum_logits": -16.035057067871094, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -27.527360916137695, "logits_per_token": -2.0043821334838867, "logits_per_char": -0.34117142697598074, "num_chars": 47}, {"sum_logits": -24.763031005859375, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -28.08723258972168, "logits_per_token": -3.095378875732422, "logits_per_char": -0.6190757751464844, "num_chars": 40}, {"sum_logits": -36.54867172241211, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -43.075382232666016, "logits_per_token": -4.568583965301514, "logits_per_char": -0.6768272541187428, "num_chars": 54}, {"sum_logits": -25.7878360748291, "num_tokens": 10, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -43.65880584716797, "logits_per_token": -2.5787836074829102, "logits_per_char": -0.49591992451594424, "num_chars": 52}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 259, "native_id": "Mercury_410602", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.293524265289307, "incorrect_loss_raw": 4.541696071624756, "correct_loss_per_char": 0.48122947866266425, "incorrect_loss_per_char": 0.5122750184752725, "correct_loss_per_token": 2.6467621326446533, "incorrect_loss_per_token": 2.270848035812378, "correct_loss_uncond": -7.4849019050598145, "incorrect_loss_uncond": -8.415733814239502}, "model_output": [{"sum_logits": -5.293524265289307, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -12.778426170349121, "logits_per_token": -2.6467621326446533, "logits_per_char": -0.48122947866266425, "num_chars": 11}, {"sum_logits": -4.87845516204834, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -13.70181941986084, "logits_per_token": -2.43922758102417, "logits_per_char": -0.44349592382257635, "num_chars": 11}, {"sum_logits": -3.7213802337646484, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -12.532236099243164, "logits_per_token": -1.8606901168823242, "logits_per_char": -0.46517252922058105, "num_chars": 8}, {"sum_logits": -5.025252819061279, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -12.63823413848877, "logits_per_token": -2.5126264095306396, "logits_per_char": -0.6281566023826599, "num_chars": 8}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 260, "native_id": "Mercury_7108868", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.344710350036621, "incorrect_loss_raw": 16.72248872121175, "correct_loss_per_char": 0.6671613195668096, "incorrect_loss_per_char": 0.7694302900337879, "correct_loss_per_token": 3.068942070007324, "incorrect_loss_per_token": 5.2096539868248835, "correct_loss_uncond": -9.144797325134277, "incorrect_loss_uncond": -3.816342353820801}, "model_output": [{"sum_logits": -12.590875625610352, "num_tokens": 3, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -15.20457649230957, "logits_per_token": -4.196958541870117, "logits_per_char": -0.5995655059814453, "num_chars": 21}, {"sum_logits": -13.122321128845215, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -18.457983016967773, "logits_per_token": -3.2805802822113037, "logits_per_char": -0.4860118936609339, "num_chars": 27}, {"sum_logits": -15.344710350036621, "num_tokens": 5, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -24.4895076751709, "logits_per_token": -3.068942070007324, "logits_per_char": -0.6671613195668096, "num_chars": 23}, {"sum_logits": -24.454269409179688, "num_tokens": 3, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -27.953933715820312, "logits_per_token": -8.151423136393229, "logits_per_char": -1.2227134704589844, "num_chars": 20}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 261, "native_id": "Mercury_7033828", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.707865595817566, "incorrect_loss_raw": 4.252469539642334, "correct_loss_per_char": 0.3415731191635132, "incorrect_loss_per_char": 1.023613444964091, "correct_loss_per_token": 1.707865595817566, "incorrect_loss_per_token": 4.252469539642334, "correct_loss_uncond": -11.094205021858215, "incorrect_loss_uncond": -7.7891316413879395}, "model_output": [{"sum_logits": -4.859842300415039, "num_tokens": 1, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -11.566265106201172, "logits_per_token": -4.859842300415039, "logits_per_char": -1.2149605751037598, "num_chars": 4}, {"sum_logits": -5.527329921722412, "num_tokens": 1, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -12.094873428344727, "logits_per_token": -5.527329921722412, "logits_per_char": -1.381832480430603, "num_chars": 4}, {"sum_logits": -2.370236396789551, "num_tokens": 1, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -12.463665008544922, "logits_per_token": -2.370236396789551, "logits_per_char": -0.47404727935791013, "num_chars": 5}, {"sum_logits": -1.707865595817566, "num_tokens": 1, "num_tokens_all": 189, "is_greedy": true, "sum_logits_uncond": -12.802070617675781, "logits_per_token": -1.707865595817566, "logits_per_char": -0.3415731191635132, "num_chars": 5}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 262, "native_id": "TIMSS_2007_4_pg19", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.351455688476562, "incorrect_loss_raw": 17.97075843811035, "correct_loss_per_char": 0.4542071024576823, "incorrect_loss_per_char": 0.6950150963400498, "correct_loss_per_token": 2.3359222412109375, "incorrect_loss_per_token": 3.8956723389802157, "correct_loss_uncond": -17.376556396484375, "incorrect_loss_uncond": -8.30851682027181}, "model_output": [{"sum_logits": -16.351455688476562, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -33.72801208496094, "logits_per_token": -2.3359222412109375, "logits_per_char": -0.4542071024576823, "num_chars": 36}, {"sum_logits": -28.276836395263672, "num_tokens": 9, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -38.297245025634766, "logits_per_token": -3.1418707105848522, "logits_per_char": -0.5770782937808913, "num_chars": 49}, {"sum_logits": -11.797046661376953, "num_tokens": 3, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -21.07012176513672, "logits_per_token": -3.932348887125651, "logits_per_char": -0.6939439212574678, "num_chars": 17}, {"sum_logits": -13.83839225769043, "num_tokens": 3, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -19.470458984375, "logits_per_token": -4.6127974192301435, "logits_per_char": -0.81402307398179, "num_chars": 17}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 263, "native_id": "Mercury_400828", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.370744705200195, "incorrect_loss_raw": 13.415415128072103, "correct_loss_per_char": 0.7407674789428711, "incorrect_loss_per_char": 1.0850594157264346, "correct_loss_per_token": 2.074148941040039, "incorrect_loss_per_token": 2.6830830256144207, "correct_loss_uncond": -21.166400909423828, "incorrect_loss_uncond": -17.012098630269367}, "model_output": [{"sum_logits": -14.56589126586914, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -30.891130447387695, "logits_per_token": -2.913178253173828, "logits_per_char": -1.2138242721557617, "num_chars": 12}, {"sum_logits": -17.39160919189453, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.64093017578125, "logits_per_token": -3.478321838378906, "logits_per_char": -1.449300765991211, "num_chars": 12}, {"sum_logits": -8.288744926452637, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -31.75048065185547, "logits_per_token": -1.6577489852905274, "logits_per_char": -0.5920532090323312, "num_chars": 14}, {"sum_logits": -10.370744705200195, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -31.537145614624023, "logits_per_token": -2.074148941040039, "logits_per_char": -0.7407674789428711, "num_chars": 14}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 264, "native_id": "VASoL_2008_3_16", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.40003204345703, "incorrect_loss_raw": 17.915453275044758, "correct_loss_per_char": 0.5411774130428538, "incorrect_loss_per_char": 0.5950741002461813, "correct_loss_per_token": 2.628576006208147, "incorrect_loss_per_token": 3.119352549598331, "correct_loss_uncond": -11.77764892578125, "incorrect_loss_uncond": -6.108341534932454}, "model_output": [{"sum_logits": -24.346250534057617, "num_tokens": 7, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -26.855405807495117, "logits_per_token": -3.4780357905796597, "logits_per_char": -0.6580067711907465, "num_chars": 37}, {"sum_logits": -18.40003204345703, "num_tokens": 7, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -30.17768096923828, "logits_per_token": -2.628576006208147, "logits_per_char": -0.5411774130428538, "num_chars": 34}, {"sum_logits": -16.46623420715332, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -26.406272888183594, "logits_per_token": -3.293246841430664, "logits_per_char": -0.6098605261908637, "num_chars": 27}, {"sum_logits": -12.93387508392334, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -18.80970573425293, "logits_per_token": -2.586775016784668, "logits_per_char": -0.5173550033569336, "num_chars": 25}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 265, "native_id": "LEAP__5_10315", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.581401824951172, "incorrect_loss_raw": 13.061232884724935, "correct_loss_per_char": 0.25176960489024286, "incorrect_loss_per_char": 0.2949989947646799, "correct_loss_per_token": 1.1581401824951172, "incorrect_loss_per_token": 1.3061232884724934, "correct_loss_uncond": -21.39691162109375, "incorrect_loss_uncond": -19.417207717895508}, "model_output": [{"sum_logits": -14.503652572631836, "num_tokens": 10, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -33.458763122558594, "logits_per_token": -1.4503652572631835, "logits_per_char": -0.337294245875159, "num_chars": 43}, {"sum_logits": -11.314157485961914, "num_tokens": 10, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -30.85367202758789, "logits_per_token": -1.1314157485961913, "logits_per_char": -0.25713994286277075, "num_chars": 44}, {"sum_logits": -11.581401824951172, "num_tokens": 10, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -32.97831344604492, "logits_per_token": -1.1581401824951172, "logits_per_char": -0.25176960489024286, "num_chars": 46}, {"sum_logits": -13.365888595581055, "num_tokens": 10, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -33.122886657714844, "logits_per_token": -1.3365888595581055, "logits_per_char": -0.29056279555610987, "num_chars": 46}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 266, "native_id": "Mercury_SC_415471", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 17.0814208984375, "incorrect_loss_raw": 14.666481018066406, "correct_loss_per_char": 0.5023947323069853, "incorrect_loss_per_char": 0.6255355953111332, "correct_loss_per_token": 2.8469034830729165, "incorrect_loss_per_token": 3.3529582553439674, "correct_loss_uncond": -14.096813201904297, "incorrect_loss_uncond": -6.516590118408203}, "model_output": [{"sum_logits": -17.0814208984375, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -31.178234100341797, "logits_per_token": -2.8469034830729165, "logits_per_char": -0.5023947323069853, "num_chars": 34}, {"sum_logits": -11.291831970214844, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -21.96083641052246, "logits_per_token": -1.8819719950358074, "logits_per_char": -0.3642526442004788, "num_chars": 31}, {"sum_logits": -18.864049911499023, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -21.993059158325195, "logits_per_token": -4.716012477874756, "logits_per_char": -0.8201760831086532, "num_chars": 23}, {"sum_logits": -13.843561172485352, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -19.595317840576172, "logits_per_token": -3.460890293121338, "logits_per_char": -0.6921780586242676, "num_chars": 20}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 267, "native_id": "Mercury_7247065", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 35.065425872802734, "incorrect_loss_raw": 31.982643763224285, "correct_loss_per_char": 0.8348910922095889, "incorrect_loss_per_char": 0.7533681659063297, "correct_loss_per_token": 3.5065425872802733, "incorrect_loss_per_token": 4.33536994015729, "correct_loss_uncond": -7.330295562744141, "incorrect_loss_uncond": -6.59431266784668}, "model_output": [{"sum_logits": -28.907567977905273, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -36.12263107299805, "logits_per_token": -3.2119519975450306, "logits_per_char": -0.8029879993862576, "num_chars": 36}, {"sum_logits": -33.938697814941406, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -36.88091278076172, "logits_per_token": -5.656449635823567, "logits_per_char": -0.8080642336890811, "num_chars": 42}, {"sum_logits": -35.065425872802734, "num_tokens": 10, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -42.395721435546875, "logits_per_token": -3.5065425872802733, "logits_per_char": -0.8348910922095889, "num_chars": 42}, {"sum_logits": -33.10166549682617, "num_tokens": 8, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -42.727325439453125, "logits_per_token": -4.1377081871032715, "logits_per_char": -0.6490522646436504, "num_chars": 51}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 268, "native_id": "MDSA_2011_5_3", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 2.4795937538146973, "incorrect_loss_raw": 3.9740312099456787, "correct_loss_per_char": 0.20663281281789145, "incorrect_loss_per_char": 0.453304744853593, "correct_loss_per_token": 2.4795937538146973, "incorrect_loss_per_token": 3.9740312099456787, "correct_loss_uncond": -11.777806758880615, "incorrect_loss_uncond": -10.639859120051065}, "model_output": [{"sum_logits": -1.7443959712982178, "num_tokens": 1, "num_tokens_all": 229, "is_greedy": true, "sum_logits_uncond": -15.821380615234375, "logits_per_token": -1.7443959712982178, "logits_per_char": -0.1341843054844783, "num_chars": 13}, {"sum_logits": -2.4795937538146973, "num_tokens": 1, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -14.257400512695312, "logits_per_token": -2.4795937538146973, "logits_per_char": -0.20663281281789145, "num_chars": 12}, {"sum_logits": -3.346724033355713, "num_tokens": 1, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -14.053781509399414, "logits_per_token": -3.346724033355713, "logits_per_char": -0.37185822592841256, "num_chars": 9}, {"sum_logits": -6.8309736251831055, "num_tokens": 1, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -13.966508865356445, "logits_per_token": -6.8309736251831055, "logits_per_char": -0.8538717031478882, "num_chars": 8}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 269, "native_id": "MDSA_2009_5_39", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.770933151245117, "incorrect_loss_raw": 14.82203483581543, "correct_loss_per_char": 0.3363123757498605, "incorrect_loss_per_char": 0.4578794901480348, "correct_loss_per_token": 2.3541866302490235, "incorrect_loss_per_token": 3.0702085706922744, "correct_loss_uncond": -18.278362274169922, "incorrect_loss_uncond": -14.731074651082357}, "model_output": [{"sum_logits": -7.43736457824707, "num_tokens": 5, "num_tokens_all": 310, "is_greedy": false, "sum_logits_uncond": -29.457141876220703, "logits_per_token": -1.487472915649414, "logits_per_char": -0.2754579473424841, "num_chars": 27}, {"sum_logits": -18.62035369873047, "num_tokens": 4, "num_tokens_all": 309, "is_greedy": false, "sum_logits_uncond": -25.223918914794922, "logits_per_token": -4.655088424682617, "logits_per_char": -0.6006565709267894, "num_chars": 31}, {"sum_logits": -11.770933151245117, "num_tokens": 5, "num_tokens_all": 310, "is_greedy": false, "sum_logits_uncond": -30.04929542541504, "logits_per_token": -2.3541866302490235, "logits_per_char": -0.3363123757498605, "num_chars": 35}, {"sum_logits": -18.40838623046875, "num_tokens": 6, "num_tokens_all": 311, "is_greedy": false, "sum_logits_uncond": -33.978267669677734, "logits_per_token": -3.0680643717447915, "logits_per_char": -0.4975239521748311, "num_chars": 37}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 270, "native_id": "Mercury_187198", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.24164581298828, "incorrect_loss_raw": 19.7352720896403, "correct_loss_per_char": 0.4145828593860973, "incorrect_loss_per_char": 0.4727300071575337, "correct_loss_per_token": 2.026849534776476, "incorrect_loss_per_token": 2.5452242755385304, "correct_loss_uncond": -19.348628997802734, "incorrect_loss_uncond": -19.400227228800457}, "model_output": [{"sum_logits": -25.90247917175293, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -42.992095947265625, "logits_per_token": -2.878053241305881, "logits_per_char": -0.6317677846769008, "num_chars": 41}, {"sum_logits": -17.756298065185547, "num_tokens": 7, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -40.238765716552734, "logits_per_token": -2.536614009312221, "logits_per_char": -0.43308044061428164, "num_chars": 41}, {"sum_logits": -18.24164581298828, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -37.590274810791016, "logits_per_token": -2.026849534776476, "logits_per_char": -0.4145828593860973, "num_chars": 44}, {"sum_logits": -15.547039031982422, "num_tokens": 7, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -34.175636291503906, "logits_per_token": -2.2210055759974887, "logits_per_char": -0.3533417961814187, "num_chars": 44}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 271, "native_id": "MCAS_2000_4_36", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.136621475219727, "incorrect_loss_raw": 5.759862184524536, "correct_loss_per_char": 1.1780517896016438, "incorrect_loss_per_char": 0.8114997502987977, "correct_loss_per_token": 7.068310737609863, "incorrect_loss_per_token": 5.759862184524536, "correct_loss_uncond": -3.414003372192383, "incorrect_loss_uncond": -9.113024473190308}, "model_output": [{"sum_logits": -14.136621475219727, "num_tokens": 2, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -17.55062484741211, "logits_per_token": -7.068310737609863, "logits_per_char": -1.1780517896016438, "num_chars": 12}, {"sum_logits": -2.8662564754486084, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -13.040176391601562, "logits_per_token": -2.8662564754486084, "logits_per_char": -0.5732512950897217, "num_chars": 5}, {"sum_logits": -8.182655334472656, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -16.272480010986328, "logits_per_token": -8.182655334472656, "logits_per_char": -1.1689507620675224, "num_chars": 7}, {"sum_logits": -6.230674743652344, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -15.30600357055664, "logits_per_token": -6.230674743652344, "logits_per_char": -0.6922971937391493, "num_chars": 9}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 272, "native_id": "Mercury_184100", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 19.61871337890625, "incorrect_loss_raw": 17.917526563008625, "correct_loss_per_char": 0.8529875382133152, "incorrect_loss_per_char": 0.6609095502706791, "correct_loss_per_token": 3.92374267578125, "incorrect_loss_per_token": 3.189853332156227, "correct_loss_uncond": -16.41769790649414, "incorrect_loss_uncond": -11.656613985697428}, "model_output": [{"sum_logits": -18.575956344604492, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -30.191852569580078, "logits_per_token": -3.7151912689208983, "logits_per_char": -0.6879983831334997, "num_chars": 27}, {"sum_logits": -20.666728973388672, "num_tokens": 7, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -29.58702850341797, "logits_per_token": -2.9523898533412387, "logits_per_char": -0.8266691589355468, "num_chars": 25}, {"sum_logits": -19.61871337890625, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -36.03641128540039, "logits_per_token": -3.92374267578125, "logits_per_char": -0.8529875382133152, "num_chars": 23}, {"sum_logits": -14.509894371032715, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -28.943540573120117, "logits_per_token": -2.901978874206543, "logits_per_char": -0.4680611087429908, "num_chars": 31}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 273, "native_id": "Mercury_LBS10814", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.7958240509033203, "incorrect_loss_raw": 6.741434415181478, "correct_loss_per_char": 0.2232837677001953, "incorrect_loss_per_char": 0.6623942931493123, "correct_loss_per_token": 1.2652746836344402, "incorrect_loss_per_token": 3.370717207590739, "correct_loss_uncond": -12.109823226928711, "incorrect_loss_uncond": -7.15256913503011}, "model_output": [{"sum_logits": -8.066372871398926, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -14.266458511352539, "logits_per_token": -4.033186435699463, "logits_per_char": -0.576169490814209, "num_chars": 14}, {"sum_logits": -3.7958240509033203, "num_tokens": 3, "num_tokens_all": 200, "is_greedy": true, "sum_logits_uncond": -15.905647277832031, "logits_per_token": -1.2652746836344402, "logits_per_char": -0.2232837677001953, "num_chars": 17}, {"sum_logits": -7.80881404876709, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -12.950984001159668, "logits_per_token": -3.904407024383545, "logits_per_char": -0.9761017560958862, "num_chars": 8}, {"sum_logits": -4.349116325378418, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -14.464568138122559, "logits_per_token": -2.174558162689209, "logits_per_char": -0.4349116325378418, "num_chars": 10}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 274, "native_id": "Mercury_SC_408384", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.957054138183594, "incorrect_loss_raw": 22.958166758219402, "correct_loss_per_char": 0.5281486511230469, "incorrect_loss_per_char": 0.601377275063093, "correct_loss_per_token": 3.591410827636719, "incorrect_loss_per_token": 3.2013937291644865, "correct_loss_uncond": -11.779834747314453, "incorrect_loss_uncond": -11.205469767252604}, "model_output": [{"sum_logits": -13.161855697631836, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.725955963134766, "logits_per_token": -1.6452319622039795, "logits_per_char": -0.42457599024618825, "num_chars": 31}, {"sum_logits": -17.957054138183594, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -29.736888885498047, "logits_per_token": -3.591410827636719, "logits_per_char": -0.5281486511230469, "num_chars": 34}, {"sum_logits": -34.565711975097656, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -39.321868896484375, "logits_per_token": -4.937958853585379, "logits_per_char": -0.9096239993446752, "num_chars": 38}, {"sum_logits": -21.14693260192871, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -33.443084716796875, "logits_per_token": -3.0209903717041016, "logits_per_char": -0.4699318355984158, "num_chars": 45}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 275, "native_id": "Mercury_7043068", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 16.040142059326172, "incorrect_loss_raw": 16.54164218902588, "correct_loss_per_char": 0.4582897731236049, "incorrect_loss_per_char": 0.6306639500200216, "correct_loss_per_token": 2.2914488656180247, "incorrect_loss_per_token": 2.6481182204352485, "correct_loss_uncond": -17.91739273071289, "incorrect_loss_uncond": -22.103758811950684}, "model_output": [{"sum_logits": -19.01887321472168, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -26.042312622070312, "logits_per_token": -3.8037746429443358, "logits_per_char": -0.7924530506134033, "num_chars": 24}, {"sum_logits": -17.2877197265625, "num_tokens": 9, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -41.64700698852539, "logits_per_token": -1.9208577473958333, "logits_per_char": -0.6402859157986112, "num_chars": 27}, {"sum_logits": -13.318333625793457, "num_tokens": 6, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -48.246883392333984, "logits_per_token": -2.219722270965576, "logits_per_char": -0.4592528836480502, "num_chars": 29}, {"sum_logits": -16.040142059326172, "num_tokens": 7, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -33.95753479003906, "logits_per_token": -2.2914488656180247, "logits_per_char": -0.4582897731236049, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 276, "native_id": "Mercury_411071", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 5.867337226867676, "incorrect_loss_raw": 6.616627852121989, "correct_loss_per_char": 0.838191032409668, "incorrect_loss_per_char": 1.0375808420635406, "correct_loss_per_token": 1.466834306716919, "incorrect_loss_per_token": 2.623814026514689, "correct_loss_uncond": -9.462991714477539, "incorrect_loss_uncond": -9.533196290334066}, "model_output": [{"sum_logits": -8.213998794555664, "num_tokens": 4, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -17.502792358398438, "logits_per_token": -2.053499698638916, "logits_per_char": -1.1734283992222376, "num_chars": 7}, {"sum_logits": -5.867337226867676, "num_tokens": 4, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -15.330328941345215, "logits_per_token": -1.466834306716919, "logits_per_char": -0.838191032409668, "num_chars": 7}, {"sum_logits": -5.2159223556518555, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -14.871667861938477, "logits_per_token": -2.6079611778259277, "logits_per_char": -0.8693203926086426, "num_chars": 6}, {"sum_logits": -6.419962406158447, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -16.07501220703125, "logits_per_token": -3.2099812030792236, "logits_per_char": -1.0699937343597412, "num_chars": 6}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 277, "native_id": "NYSEDREGENTS_2010_4_24", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.9632568359375, "incorrect_loss_raw": 19.451086044311523, "correct_loss_per_char": 0.5794598979334677, "incorrect_loss_per_char": 0.5653605061131077, "correct_loss_per_token": 2.9938761393229165, "incorrect_loss_per_token": 3.2418476740519204, "correct_loss_uncond": -15.421337127685547, "incorrect_loss_uncond": -15.716986974080404}, "model_output": [{"sum_logits": -22.375375747680664, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -38.639892578125, "logits_per_token": -3.729229291280111, "logits_per_char": -0.6047398850724504, "num_chars": 37}, {"sum_logits": -17.9632568359375, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -33.38459396362305, "logits_per_token": -2.9938761393229165, "logits_per_char": -0.5794598979334677, "num_chars": 31}, {"sum_logits": -19.425800323486328, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -33.902992248535156, "logits_per_token": -3.237633387247721, "logits_per_char": -0.5396055645412869, "num_chars": 36}, {"sum_logits": -16.552082061767578, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -32.961334228515625, "logits_per_token": -2.7586803436279297, "logits_per_char": -0.5517360687255859, "num_chars": 30}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 278, "native_id": "Mercury_SC_409673", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.279476165771484, "incorrect_loss_raw": 21.754804293314617, "correct_loss_per_char": 0.48062660580589656, "incorrect_loss_per_char": 0.47514347975647775, "correct_loss_per_token": 2.3291904742901144, "incorrect_loss_per_token": 2.1277177775347673, "correct_loss_uncond": -11.908939361572266, "incorrect_loss_uncond": -12.491202354431152}, "model_output": [{"sum_logits": -14.736878395080566, "num_tokens": 9, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -27.177541732788086, "logits_per_token": -1.6374309327867296, "logits_per_char": -0.3878125893442254, "num_chars": 38}, {"sum_logits": -19.263402938842773, "num_tokens": 9, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -32.9691162109375, "logits_per_token": -2.1403781043158636, "logits_per_char": -0.4586524509248279, "num_chars": 42}, {"sum_logits": -31.264131546020508, "num_tokens": 12, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -42.59136199951172, "logits_per_token": -2.605344295501709, "logits_per_char": -0.5789653990003798, "num_chars": 54}, {"sum_logits": -30.279476165771484, "num_tokens": 13, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -42.18841552734375, "logits_per_token": -2.3291904742901144, "logits_per_char": -0.48062660580589656, "num_chars": 63}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 279, "native_id": "Mercury_SC_400374", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.529057502746582, "incorrect_loss_raw": 16.751469294230144, "correct_loss_per_char": 0.606792500144557, "incorrect_loss_per_char": 0.6953236631271413, "correct_loss_per_token": 3.843019167582194, "incorrect_loss_per_token": 3.89599765141805, "correct_loss_uncond": -11.555647850036621, "incorrect_loss_uncond": -13.15896479288737}, "model_output": [{"sum_logits": -13.75483512878418, "num_tokens": 4, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -24.46047592163086, "logits_per_token": -3.438708782196045, "logits_per_char": -0.7641575071546767, "num_chars": 18}, {"sum_logits": -11.529057502746582, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -23.084705352783203, "logits_per_token": -3.843019167582194, "logits_per_char": -0.606792500144557, "num_chars": 19}, {"sum_logits": -18.98739242553711, "num_tokens": 4, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -31.820985794067383, "logits_per_token": -4.746848106384277, "logits_per_char": -0.7911413510640463, "num_chars": 24}, {"sum_logits": -17.51218032836914, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -33.4498405456543, "logits_per_token": -3.502436065673828, "logits_per_char": -0.5306721311627012, "num_chars": 33}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 280, "native_id": "CSZ_2009_8_CSZ20740", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.508875846862793, "incorrect_loss_raw": 6.535295486450195, "correct_loss_per_char": 1.3017751693725585, "incorrect_loss_per_char": 1.0404358667040627, "correct_loss_per_token": 3.2544379234313965, "incorrect_loss_per_token": 2.7554572423299155, "correct_loss_uncond": -11.421736717224121, "incorrect_loss_uncond": -12.64902114868164}, "model_output": [{"sum_logits": -6.1462860107421875, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -19.95322608947754, "logits_per_token": -1.5365715026855469, "logits_per_char": -0.8780408586774554, "num_chars": 7}, {"sum_logits": -6.508875846862793, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -17.930612564086914, "logits_per_token": -3.2544379234313965, "logits_per_char": -1.3017751693725585, "num_chars": 5}, {"sum_logits": -6.515571117401123, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -18.547122955322266, "logits_per_token": -3.2577855587005615, "logits_per_char": -1.0859285195668538, "num_chars": 6}, {"sum_logits": -6.944029331207275, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -19.052600860595703, "logits_per_token": -3.4720146656036377, "logits_per_char": -1.1573382218678792, "num_chars": 6}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 281, "native_id": "Mercury_SC_406482", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.047228813171387, "incorrect_loss_raw": 18.665442148844402, "correct_loss_per_char": 0.3261807203292847, "incorrect_loss_per_char": 0.6926824221244225, "correct_loss_per_token": 2.1745381355285645, "incorrect_loss_per_token": 3.979196336534288, "correct_loss_uncond": -19.44688892364502, "incorrect_loss_uncond": -9.804557800292969}, "model_output": [{"sum_logits": -14.691934585571289, "num_tokens": 4, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -26.103588104248047, "logits_per_token": -3.6729836463928223, "logits_per_char": -0.6678152084350586, "num_chars": 22}, {"sum_logits": -16.56648063659668, "num_tokens": 4, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -26.714149475097656, "logits_per_token": -4.14162015914917, "logits_per_char": -0.6371723321767954, "num_chars": 26}, {"sum_logits": -24.737911224365234, "num_tokens": 6, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -32.592262268066406, "logits_per_token": -4.122985204060872, "logits_per_char": -0.7730597257614136, "num_chars": 32}, {"sum_logits": -13.047228813171387, "num_tokens": 6, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -32.494117736816406, "logits_per_token": -2.1745381355285645, "logits_per_char": -0.3261807203292847, "num_chars": 40}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 282, "native_id": "OHAT_2007_8_24", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.381921768188477, "incorrect_loss_raw": 29.052547454833984, "correct_loss_per_char": 0.8211330207618507, "incorrect_loss_per_char": 0.6974967637946671, "correct_loss_per_token": 3.7977402210235596, "incorrect_loss_per_token": 3.8299891759478855, "correct_loss_uncond": -14.209943771362305, "incorrect_loss_uncond": -13.419269561767578}, "model_output": [{"sum_logits": -30.381921768188477, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -44.59186553955078, "logits_per_token": -3.7977402210235596, "logits_per_char": -0.8211330207618507, "num_chars": 37}, {"sum_logits": -30.275413513183594, "num_tokens": 9, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -43.84064483642578, "logits_per_token": -3.3639348347981772, "logits_per_char": -0.6880775798450817, "num_chars": 44}, {"sum_logits": -28.93453598022461, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -48.58012390136719, "logits_per_token": -4.133505140032087, "logits_per_char": -0.7057203897615758, "num_chars": 41}, {"sum_logits": -27.94769287109375, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -34.99468231201172, "logits_per_token": -3.992527553013393, "logits_per_char": -0.6986923217773438, "num_chars": 40}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 283, "native_id": "Mercury_188335", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 28.897308349609375, "incorrect_loss_raw": 26.12742869059245, "correct_loss_per_char": 0.7409566243489584, "incorrect_loss_per_char": 0.8631449734723127, "correct_loss_per_token": 3.612163543701172, "incorrect_loss_per_token": 3.8983542124430346, "correct_loss_uncond": -17.391918182373047, "incorrect_loss_uncond": -11.432393391927084}, "model_output": [{"sum_logits": -37.704139709472656, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -45.62189483642578, "logits_per_token": -4.189348856608073, "logits_per_char": -0.8378697713216146, "num_chars": 45}, {"sum_logits": -28.897308349609375, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -46.28922653198242, "logits_per_token": -3.612163543701172, "logits_per_char": -0.7409566243489584, "num_chars": 39}, {"sum_logits": -21.780681610107422, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -35.090782165527344, "logits_per_token": -4.356136322021484, "logits_per_char": -0.80669191148546, "num_chars": 27}, {"sum_logits": -18.897464752197266, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -31.96678924560547, "logits_per_token": -3.1495774586995444, "logits_per_char": -0.9448732376098633, "num_chars": 20}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 284, "native_id": "Mercury_7128555", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.553108215332031, "incorrect_loss_raw": 14.635945320129395, "correct_loss_per_char": 0.27286154763740406, "incorrect_loss_per_char": 0.27863575163341703, "correct_loss_per_token": 1.5553108215332032, "incorrect_loss_per_token": 1.340445711636784, "correct_loss_uncond": -24.868873596191406, "incorrect_loss_uncond": -25.32094605763753}, "model_output": [{"sum_logits": -14.297538757324219, "num_tokens": 10, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -39.509769439697266, "logits_per_token": -1.429753875732422, "logits_per_char": -0.25531319209507536, "num_chars": 56}, {"sum_logits": -15.553108215332031, "num_tokens": 10, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -40.42198181152344, "logits_per_token": -1.5553108215332032, "logits_per_char": -0.27286154763740406, "num_chars": 57}, {"sum_logits": -13.234576225280762, "num_tokens": 12, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -42.9473876953125, "logits_per_token": -1.1028813521067302, "logits_per_char": -0.25950149461334826, "num_chars": 51}, {"sum_logits": -16.375720977783203, "num_tokens": 11, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -37.413516998291016, "logits_per_token": -1.4887019070712002, "logits_per_char": -0.3210925681918275, "num_chars": 51}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 285, "native_id": "Mercury_407517", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.848638534545898, "incorrect_loss_raw": 5.913407802581787, "correct_loss_per_char": 2.616212844848633, "incorrect_loss_per_char": 1.971135934193929, "correct_loss_per_token": 7.848638534545898, "incorrect_loss_per_token": 5.913407802581787, "correct_loss_uncond": -1.2248868942260742, "incorrect_loss_uncond": -2.8249452908833823}, "model_output": [{"sum_logits": -4.037868976593018, "num_tokens": 1, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -7.79471492767334, "logits_per_token": -4.037868976593018, "logits_per_char": -1.3459563255310059, "num_chars": 3}, {"sum_logits": -7.000524520874023, "num_tokens": 1, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -8.455670356750488, "logits_per_token": -7.000524520874023, "logits_per_char": -2.3335081736246743, "num_chars": 3}, {"sum_logits": -7.848638534545898, "num_tokens": 1, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -9.073525428771973, "logits_per_token": -7.848638534545898, "logits_per_char": -2.616212844848633, "num_chars": 3}, {"sum_logits": -6.70182991027832, "num_tokens": 1, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -9.96467399597168, "logits_per_token": -6.70182991027832, "logits_per_char": -2.233943303426107, "num_chars": 3}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 286, "native_id": "Mercury_405950", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 20.030872344970703, "incorrect_loss_raw": 22.444650014241535, "correct_loss_per_char": 0.4354537466297979, "incorrect_loss_per_char": 0.49004361283629505, "correct_loss_per_token": 2.503859043121338, "incorrect_loss_per_token": 2.8383000479804146, "correct_loss_uncond": -12.541252136230469, "incorrect_loss_uncond": -9.256090799967447}, "model_output": [{"sum_logits": -22.50484275817871, "num_tokens": 6, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -33.48390197753906, "logits_per_token": -3.7508071263631186, "logits_per_char": -0.5488986038580174, "num_chars": 41}, {"sum_logits": -25.306406021118164, "num_tokens": 9, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -33.287994384765625, "logits_per_token": -2.8118228912353516, "logits_per_char": -0.5384341706620887, "num_chars": 47}, {"sum_logits": -20.030872344970703, "num_tokens": 8, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -32.57212448120117, "logits_per_token": -2.503859043121338, "logits_per_char": -0.4354537466297979, "num_chars": 46}, {"sum_logits": -19.522701263427734, "num_tokens": 10, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -28.330326080322266, "logits_per_token": -1.9522701263427735, "logits_per_char": -0.3827980639887791, "num_chars": 51}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 287, "native_id": "MCAS_2004_9_5", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.042046546936035, "incorrect_loss_raw": 7.828592777252197, "correct_loss_per_char": 0.2133953499071526, "incorrect_loss_per_char": 0.26157580110533485, "correct_loss_per_token": 0.8802558183670044, "incorrect_loss_per_token": 1.1669793810163227, "correct_loss_uncond": -20.787699699401855, "incorrect_loss_uncond": -18.785401185353596}, "model_output": [{"sum_logits": -9.588420867919922, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -28.868358612060547, "logits_per_token": -1.369774409702846, "logits_per_char": -0.3306352023420663, "num_chars": 29}, {"sum_logits": -6.12473201751709, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -26.70081901550293, "logits_per_token": -1.0207886695861816, "logits_per_char": -0.2111976557764514, "num_chars": 29}, {"sum_logits": -7.042046546936035, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -27.82974624633789, "logits_per_token": -0.8802558183670044, "logits_per_char": -0.2133953499071526, "num_chars": 33}, {"sum_logits": -7.77262544631958, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -24.272804260253906, "logits_per_token": -1.11037506375994, "logits_per_char": -0.24289454519748688, "num_chars": 32}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 288, "native_id": "NCEOGA_2013_8_28", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.174948692321777, "incorrect_loss_raw": 14.835853576660156, "correct_loss_per_char": 0.6163021170574686, "incorrect_loss_per_char": 0.6376365156345117, "correct_loss_per_token": 2.3624914487202964, "incorrect_loss_per_token": 2.2161811571272594, "correct_loss_uncond": -8.7518892288208, "incorrect_loss_uncond": -8.774772644042969}, "model_output": [{"sum_logits": -12.193461418151855, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -23.21308708190918, "logits_per_token": -2.032243569691976, "logits_per_char": -0.7172624363618738, "num_chars": 17}, {"sum_logits": -14.174948692321777, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -22.926837921142578, "logits_per_token": -2.3624914487202964, "logits_per_char": -0.6163021170574686, "num_chars": 23}, {"sum_logits": -17.839824676513672, "num_tokens": 7, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -23.392141342163086, "logits_per_token": -2.548546382359096, "logits_per_char": -0.7433260281880697, "num_chars": 24}, {"sum_logits": -14.474274635314941, "num_tokens": 7, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -24.22665023803711, "logits_per_token": -2.0677535193307057, "logits_per_char": -0.4523210823535919, "num_chars": 32}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 289, "native_id": "Mercury_SC_406451", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.790573120117188, "incorrect_loss_raw": 22.820199330647785, "correct_loss_per_char": 0.4101822951744343, "incorrect_loss_per_char": 0.4297241891674723, "correct_loss_per_token": 2.162779374556108, "incorrect_loss_per_token": 2.1795863045586485, "correct_loss_uncond": -14.364112854003906, "incorrect_loss_uncond": -15.256659825642904}, "model_output": [{"sum_logits": -20.009017944335938, "num_tokens": 8, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -30.030576705932617, "logits_per_token": -2.501127243041992, "logits_per_char": -0.4446448432074653, "num_chars": 45}, {"sum_logits": -24.420917510986328, "num_tokens": 12, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -40.4710578918457, "logits_per_token": -2.035076459248861, "logits_per_char": -0.44401668201793326, "num_chars": 55}, {"sum_logits": -23.790573120117188, "num_tokens": 11, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -38.154685974121094, "logits_per_token": -2.162779374556108, "logits_per_char": -0.4101822951744343, "num_chars": 58}, {"sum_logits": -24.030662536621094, "num_tokens": 12, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -43.72894287109375, "logits_per_token": -2.0025552113850913, "logits_per_char": -0.4005110422770182, "num_chars": 60}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 290, "native_id": "Mercury_7109323", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 40.917999267578125, "incorrect_loss_raw": 28.142289479573567, "correct_loss_per_char": 0.6017352833467371, "incorrect_loss_per_char": 0.5522976920981489, "correct_loss_per_token": 3.4098332722981772, "incorrect_loss_per_token": 2.991257314970999, "correct_loss_uncond": -13.97369384765625, "incorrect_loss_uncond": -11.195964813232422}, "model_output": [{"sum_logits": -18.643484115600586, "num_tokens": 7, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -32.649742126464844, "logits_per_token": -2.6633548736572266, "logits_per_char": -0.454719124770746, "num_chars": 41}, {"sum_logits": -36.312034606933594, "num_tokens": 10, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -43.522361755371094, "logits_per_token": -3.631203460693359, "logits_per_char": -0.6851327284327093, "num_chars": 53}, {"sum_logits": -29.471349716186523, "num_tokens": 11, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -41.84265899658203, "logits_per_token": -2.679213610562411, "logits_per_char": -0.5170412230909917, "num_chars": 57}, {"sum_logits": -40.917999267578125, "num_tokens": 12, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -54.891693115234375, "logits_per_token": -3.4098332722981772, "logits_per_char": -0.6017352833467371, "num_chars": 68}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 291, "native_id": "Mercury_404132", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 25.18553352355957, "incorrect_loss_raw": 30.819605509440105, "correct_loss_per_char": 0.8124365652761152, "incorrect_loss_per_char": 1.2038970103209046, "correct_loss_per_token": 1.3255543959768195, "incorrect_loss_per_token": 2.3635430784306974, "correct_loss_uncond": -13.067975997924805, "incorrect_loss_uncond": -9.418827056884766}, "model_output": [{"sum_logits": -33.51374435424805, "num_tokens": 13, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -43.8732795715332, "logits_per_token": -2.5779803349421573, "logits_per_char": -1.1556463570430362, "num_chars": 29}, {"sum_logits": -25.18553352355957, "num_tokens": 19, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -38.253509521484375, "logits_per_token": -1.3255543959768195, "logits_per_char": -0.8124365652761152, "num_chars": 31}, {"sum_logits": -34.97864532470703, "num_tokens": 12, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -42.055625915527344, "logits_per_token": -2.9148871103922525, "logits_per_char": -1.4574435551961262, "num_chars": 24}, {"sum_logits": -23.966426849365234, "num_tokens": 15, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -34.78639221191406, "logits_per_token": -1.5977617899576824, "logits_per_char": -0.9986011187235514, "num_chars": 24}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 292, "native_id": "Mercury_7210210", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.675351858139038, "incorrect_loss_raw": 4.18775995572408, "correct_loss_per_char": 0.7350703716278076, "incorrect_loss_per_char": 0.5721003902021539, "correct_loss_per_token": 3.675351858139038, "incorrect_loss_per_token": 4.18775995572408, "correct_loss_uncond": -8.788313150405884, "incorrect_loss_uncond": -7.932293017705281}, "model_output": [{"sum_logits": -4.4652862548828125, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.025976181030273, "logits_per_token": -4.4652862548828125, "logits_per_char": -0.4961429172092014, "num_chars": 9}, {"sum_logits": -2.6586849689483643, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -2.6586849689483643, "logits_per_char": -0.44311416149139404, "num_chars": 6}, {"sum_logits": -3.675351858139038, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -12.463665008544922, "logits_per_token": -3.675351858139038, "logits_per_char": -0.7350703716278076, "num_chars": 5}, {"sum_logits": -5.4393086433410645, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -12.445188522338867, "logits_per_token": -5.4393086433410645, "logits_per_char": -0.7770440919058663, "num_chars": 7}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 293, "native_id": "Mercury_SC_408042", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.8451621532440186, "incorrect_loss_raw": 5.574385007222493, "correct_loss_per_char": 0.26359459332057406, "incorrect_loss_per_char": 0.8528978890842862, "correct_loss_per_token": 1.8451621532440186, "incorrect_loss_per_token": 5.574385007222493, "correct_loss_uncond": -11.013056993484497, "incorrect_loss_uncond": -8.014017422993978}, "model_output": [{"sum_logits": -5.483972072601318, "num_tokens": 1, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -13.715499877929688, "logits_per_token": -5.483972072601318, "logits_per_char": -0.6854965090751648, "num_chars": 8}, {"sum_logits": -1.8451621532440186, "num_tokens": 1, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -12.858219146728516, "logits_per_token": -1.8451621532440186, "logits_per_char": -0.26359459332057406, "num_chars": 7}, {"sum_logits": -5.636645793914795, "num_tokens": 1, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -13.456459045410156, "logits_per_token": -5.636645793914795, "logits_per_char": -0.9394409656524658, "num_chars": 6}, {"sum_logits": -5.602537155151367, "num_tokens": 1, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -13.59324836730957, "logits_per_token": -5.602537155151367, "logits_per_char": -0.9337561925252279, "num_chars": 6}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 294, "native_id": "MCAS_2004_8_14", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 1.9732449054718018, "incorrect_loss_raw": 6.607206344604492, "correct_loss_per_char": 0.328874150911967, "incorrect_loss_per_char": 1.1432388067245485, "correct_loss_per_token": 1.9732449054718018, "incorrect_loss_per_token": 5.136277198791504, "correct_loss_uncond": -10.093710660934448, "incorrect_loss_uncond": -5.675860404968262}, "model_output": [{"sum_logits": -3.2855515480041504, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -12.253585815429688, "logits_per_token": -1.6427757740020752, "logits_per_char": -0.21903676986694337, "num_chars": 15}, {"sum_logits": -5.540023326873779, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -14.483652114868164, "logits_per_token": -2.7700116634368896, "logits_per_char": -0.46166861057281494, "num_chars": 12}, {"sum_logits": -1.9732449054718018, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -12.06695556640625, "logits_per_token": -1.9732449054718018, "logits_per_char": -0.328874150911967, "num_chars": 6}, {"sum_logits": -10.996044158935547, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -10.11196231842041, "logits_per_token": -10.996044158935547, "logits_per_char": -2.7490110397338867, "num_chars": 4}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 295, "native_id": "TIMSS_2011_4_pg5", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.910177230834961, "incorrect_loss_raw": 7.951625029246013, "correct_loss_per_char": 0.9820354461669922, "incorrect_loss_per_char": 1.7160353501637777, "correct_loss_per_token": 4.910177230834961, "incorrect_loss_per_token": 7.951625029246013, "correct_loss_uncond": -9.1380615234375, "incorrect_loss_uncond": -4.776804129282634}, "model_output": [{"sum_logits": -5.826979160308838, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -12.686424255371094, "logits_per_token": -5.826979160308838, "logits_per_char": -1.1653958320617677, "num_chars": 5}, {"sum_logits": -4.910177230834961, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.048238754272461, "logits_per_token": -4.910177230834961, "logits_per_char": -0.9820354461669922, "num_chars": 5}, {"sum_logits": -7.542620658874512, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -12.549814224243164, "logits_per_token": -7.542620658874512, "logits_per_char": -1.885655164718628, "num_chars": 4}, {"sum_logits": -10.485275268554688, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -12.94904899597168, "logits_per_token": -10.485275268554688, "logits_per_char": -2.0970550537109376, "num_chars": 5}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 296, "native_id": "Mercury_SC_406833", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.83249282836914, "incorrect_loss_raw": 21.307484308878582, "correct_loss_per_char": 0.3816933170441658, "incorrect_loss_per_char": 0.7294699812448152, "correct_loss_per_token": 1.6903561183384486, "incorrect_loss_per_token": 3.376018266829233, "correct_loss_uncond": -8.783973693847656, "incorrect_loss_uncond": -4.181131998697917}, "model_output": [{"sum_logits": -22.078868865966797, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -27.801782608032227, "logits_per_token": -3.1541241237095425, "logits_per_char": -0.7885310309273856, "num_chars": 28}, {"sum_logits": -14.491514205932617, "num_tokens": 6, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -18.839929580688477, "logits_per_token": -2.415252367655436, "logits_per_char": -0.5175540787833077, "num_chars": 28}, {"sum_logits": -11.83249282836914, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -20.616466522216797, "logits_per_token": -1.6903561183384486, "logits_per_char": -0.3816933170441658, "num_chars": 31}, {"sum_logits": -27.352069854736328, "num_tokens": 6, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -29.82413673400879, "logits_per_token": -4.558678309122722, "logits_per_char": -0.8823248340237525, "num_chars": 31}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 297, "native_id": "Mercury_7029558", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 11.611096382141113, "incorrect_loss_raw": 7.991592566172282, "correct_loss_per_char": 0.5529093515305292, "incorrect_loss_per_char": 0.5478753160547327, "correct_loss_per_token": 2.9027740955352783, "incorrect_loss_per_token": 3.5642271836598716, "correct_loss_uncond": -16.78341579437256, "incorrect_loss_uncond": -11.677745660146078}, "model_output": [{"sum_logits": -4.249899864196777, "num_tokens": 1, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -13.984064102172852, "logits_per_token": -4.249899864196777, "logits_per_char": -0.4722110960218642, "num_chars": 9}, {"sum_logits": -6.046248912811279, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -16.214397430419922, "logits_per_token": -3.0231244564056396, "logits_per_char": -0.5496589920737527, "num_chars": 11}, {"sum_logits": -11.611096382141113, "num_tokens": 4, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -28.394512176513672, "logits_per_token": -2.9027740955352783, "logits_per_char": -0.5529093515305292, "num_chars": 21}, {"sum_logits": -13.678628921508789, "num_tokens": 4, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -28.809553146362305, "logits_per_token": -3.4196572303771973, "logits_per_char": -0.6217558600685813, "num_chars": 22}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 298, "native_id": "Mercury_7138390", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.324748992919922, "incorrect_loss_raw": 12.157245000203451, "correct_loss_per_char": 0.3145763609144423, "incorrect_loss_per_char": 0.3252593239380835, "correct_loss_per_token": 1.4155936241149902, "incorrect_loss_per_token": 1.6488426526387532, "correct_loss_uncond": -13.859560012817383, "incorrect_loss_uncond": -14.256916681925455}, "model_output": [{"sum_logits": -14.768314361572266, "num_tokens": 8, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -27.661380767822266, "logits_per_token": -1.8460392951965332, "logits_per_char": -0.3886398516203228, "num_chars": 38}, {"sum_logits": -11.324748992919922, "num_tokens": 8, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -25.184309005737305, "logits_per_token": -1.4155936241149902, "logits_per_char": -0.3145763609144423, "num_chars": 36}, {"sum_logits": -10.762517929077148, "num_tokens": 7, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -24.877721786499023, "logits_per_token": -1.5375025612967355, "logits_per_char": -0.283224156028346, "num_chars": 38}, {"sum_logits": -10.940902709960938, "num_tokens": 7, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -26.70338249206543, "logits_per_token": -1.562986101422991, "logits_per_char": -0.3039139641655816, "num_chars": 36}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 299, "native_id": "MEAP_2005_5_12", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.892183303833008, "incorrect_loss_raw": 21.168819427490234, "correct_loss_per_char": 0.4723045825958252, "incorrect_loss_per_char": 0.3064816355914504, "correct_loss_per_token": 2.0991314782036676, "incorrect_loss_per_token": 1.3775062617133644, "correct_loss_uncond": -22.5932674407959, "incorrect_loss_uncond": -23.79986572265625}, "model_output": [{"sum_logits": -18.892183303833008, "num_tokens": 9, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -41.485450744628906, "logits_per_token": -2.0991314782036676, "logits_per_char": -0.4723045825958252, "num_chars": 40}, {"sum_logits": -14.699804306030273, "num_tokens": 10, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -32.83073425292969, "logits_per_token": -1.4699804306030273, "logits_per_char": -0.2999960062455158, "num_chars": 49}, {"sum_logits": -25.183307647705078, "num_tokens": 17, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -52.03804397583008, "logits_per_token": -1.4813710381002987, "logits_per_char": -0.3313593111540142, "num_chars": 76}, {"sum_logits": -23.62334632873535, "num_tokens": 20, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -50.03727722167969, "logits_per_token": -1.1811673164367675, "logits_per_char": -0.2880895893748214, "num_chars": 82}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 300, "native_id": "MCAS_2000_4_30", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.894133567810059, "incorrect_loss_raw": 5.2064032554626465, "correct_loss_per_char": 1.2411777973175049, "incorrect_loss_per_char": 0.6204176645430307, "correct_loss_per_token": 3.7235333919525146, "incorrect_loss_per_token": 5.2064032554626465, "correct_loss_uncond": -3.5662946701049805, "incorrect_loss_uncond": -7.498005390167236}, "model_output": [{"sum_logits": -5.408670425415039, "num_tokens": 1, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -13.97536849975586, "logits_per_token": -5.408670425415039, "logits_per_char": -0.7726672036307198, "num_chars": 7}, {"sum_logits": -6.077867031097412, "num_tokens": 1, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -11.064149856567383, "logits_per_token": -6.077867031097412, "logits_per_char": -0.6753185590108236, "num_chars": 9}, {"sum_logits": -4.132672309875488, "num_tokens": 1, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -13.073707580566406, "logits_per_token": -4.132672309875488, "logits_per_char": -0.4132672309875488, "num_chars": 10}, {"sum_logits": -14.894133567810059, "num_tokens": 4, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -18.46042823791504, "logits_per_token": -3.7235333919525146, "logits_per_char": -1.2411777973175049, "num_chars": 12}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 301, "native_id": "MCAS_1998_4_12", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 17.38568115234375, "incorrect_loss_raw": 24.446921666463215, "correct_loss_per_char": 0.4043181663335756, "incorrect_loss_per_char": 0.8239624122076625, "correct_loss_per_token": 1.5805164683948865, "incorrect_loss_per_token": 2.4561294196029304, "correct_loss_uncond": -22.96636962890625, "incorrect_loss_uncond": -12.977211634318033}, "model_output": [{"sum_logits": -21.689258575439453, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -32.919456481933594, "logits_per_token": -2.4099176194932728, "logits_per_char": -0.7746163776942662, "num_chars": 28}, {"sum_logits": -22.734800338745117, "num_tokens": 11, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -40.10077667236328, "logits_per_token": -2.0668000307950107, "logits_per_char": -0.7333806560885522, "num_chars": 31}, {"sum_logits": -17.38568115234375, "num_tokens": 11, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -40.35205078125, "logits_per_token": -1.5805164683948865, "logits_per_char": -0.4043181663335756, "num_chars": 43}, {"sum_logits": -28.916706085205078, "num_tokens": 10, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -39.252166748046875, "logits_per_token": -2.8916706085205077, "logits_per_char": -0.9638902028401692, "num_chars": 30}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 302, "native_id": "Mercury_175840", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 10.901615142822266, "incorrect_loss_raw": 14.143818219502768, "correct_loss_per_char": 0.40376352380823205, "incorrect_loss_per_char": 0.48757344491018856, "correct_loss_per_token": 2.7254037857055664, "incorrect_loss_per_token": 2.7397091320582803, "correct_loss_uncond": -15.26895523071289, "incorrect_loss_uncond": -12.611072222391764}, "model_output": [{"sum_logits": -13.12419319152832, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -21.73447036743164, "logits_per_token": -3.28104829788208, "logits_per_char": -0.4860812293158637, "num_chars": 27}, {"sum_logits": -10.901615142822266, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -26.170570373535156, "logits_per_token": -2.7254037857055664, "logits_per_char": -0.40376352380823205, "num_chars": 27}, {"sum_logits": -16.15903091430664, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -32.705833435058594, "logits_per_token": -2.3084329877580916, "logits_per_char": -0.4896676034638376, "num_chars": 33}, {"sum_logits": -13.14823055267334, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -25.82436752319336, "logits_per_token": -2.629646110534668, "logits_per_char": -0.4869715019508644, "num_chars": 27}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 303, "native_id": "Mercury_7099190", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.287261962890625, "incorrect_loss_raw": 17.430687586466473, "correct_loss_per_char": 0.35857376397824753, "incorrect_loss_per_char": 0.3807075992686085, "correct_loss_per_token": 2.285907745361328, "incorrect_loss_per_token": 2.4900982266380676, "correct_loss_uncond": -23.306686401367188, "incorrect_loss_uncond": -18.320327123006184}, "model_output": [{"sum_logits": -16.829635620117188, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -34.44573211669922, "logits_per_token": -2.404233660016741, "logits_per_char": -0.39138687488644625, "num_chars": 43}, {"sum_logits": -18.669321060180664, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -36.548858642578125, "logits_per_token": -2.667045865740095, "logits_per_char": -0.4148738013373481, "num_chars": 45}, {"sum_logits": -18.287261962890625, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -41.59394836425781, "logits_per_token": -2.285907745361328, "logits_per_char": -0.35857376397824753, "num_chars": 51}, {"sum_logits": -16.793106079101562, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -36.258453369140625, "logits_per_token": -2.399015154157366, "logits_per_char": -0.33586212158203127, "num_chars": 50}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 304, "native_id": "Mercury_SC_401605", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.48052215576172, "incorrect_loss_raw": 24.908056259155273, "correct_loss_per_char": 0.7896942630890877, "incorrect_loss_per_char": 0.6787754721334335, "correct_loss_per_token": 4.080087025960286, "incorrect_loss_per_token": 3.987350433591812, "correct_loss_uncond": -6.590358734130859, "incorrect_loss_uncond": -11.771937052408854}, "model_output": [{"sum_logits": -24.48052215576172, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -31.070880889892578, "logits_per_token": -4.080087025960286, "logits_per_char": -0.7896942630890877, "num_chars": 31}, {"sum_logits": -21.403152465820312, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -32.7387809753418, "logits_per_token": -3.5671920776367188, "logits_per_char": -0.6295044842888328, "num_chars": 34}, {"sum_logits": -20.663026809692383, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -31.20781898498535, "logits_per_token": -2.9518609728131975, "logits_per_char": -0.5903721945626396, "num_chars": 35}, {"sum_logits": -32.657989501953125, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -46.093379974365234, "logits_per_token": -5.4429982503255205, "logits_per_char": -0.8164497375488281, "num_chars": 40}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 305, "native_id": "TAKS_2009_5_36", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 0.8217903971672058, "incorrect_loss_raw": 7.296722888946533, "correct_loss_per_char": 0.1369650661945343, "incorrect_loss_per_char": 1.2581950664520265, "correct_loss_per_token": 0.8217903971672058, "incorrect_loss_per_token": 5.621310869852702, "correct_loss_uncond": -11.638279259204865, "incorrect_loss_uncond": -7.318788051605225}, "model_output": [{"sum_logits": -0.8217903971672058, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": true, "sum_logits_uncond": -12.46006965637207, "logits_per_token": -0.8217903971672058, "logits_per_char": -0.1369650661945343, "num_chars": 6}, {"sum_logits": -3.7917160987854004, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -13.01215934753418, "logits_per_token": -3.7917160987854004, "logits_per_char": -0.7583432197570801, "num_chars": 5}, {"sum_logits": -10.052472114562988, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -16.235919952392578, "logits_per_token": -5.026236057281494, "logits_per_char": -2.010494422912598, "num_chars": 5}, {"sum_logits": -8.045980453491211, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -14.598453521728516, "logits_per_token": -8.045980453491211, "logits_per_char": -1.0057475566864014, "num_chars": 8}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 306, "native_id": "Mercury_7171570", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.2383038997650146, "incorrect_loss_raw": 5.31433089574178, "correct_loss_per_char": 0.1492202599843343, "incorrect_loss_per_char": 0.3241840522408433, "correct_loss_per_token": 1.1191519498825073, "incorrect_loss_per_token": 1.8951321575376723, "correct_loss_uncond": -13.617179155349731, "incorrect_loss_uncond": -11.034508546193441}, "model_output": [{"sum_logits": -4.90372371673584, "num_tokens": 3, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -16.369611740112305, "logits_per_token": -1.63457457224528, "logits_per_char": -0.2580907219334653, "num_chars": 19}, {"sum_logits": -5.875250339508057, "num_tokens": 4, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -16.82131576538086, "logits_per_token": -1.4688125848770142, "logits_per_char": -0.34560296114753275, "num_chars": 17}, {"sum_logits": -5.164018630981445, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -15.8555908203125, "logits_per_token": -2.5820093154907227, "logits_per_char": -0.3688584736415318, "num_chars": 14}, {"sum_logits": -2.2383038997650146, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -15.855483055114746, "logits_per_token": -1.1191519498825073, "logits_per_char": -0.1492202599843343, "num_chars": 15}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 307, "native_id": "Mercury_SC_402057", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 9.194192886352539, "incorrect_loss_raw": 6.762599627176921, "correct_loss_per_char": 0.34052566245750143, "incorrect_loss_per_char": 0.5204930234838415, "correct_loss_per_token": 1.5323654810587566, "incorrect_loss_per_token": 2.695021311442057, "correct_loss_uncond": -19.63960838317871, "incorrect_loss_uncond": -11.72917366027832}, "model_output": [{"sum_logits": -7.354978561401367, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -14.70699691772461, "logits_per_token": -3.6774892807006836, "logits_per_char": -0.8172198401557075, "num_chars": 9}, {"sum_logits": -4.697478294372559, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -14.88609790802002, "logits_per_token": -2.3487391471862793, "logits_per_char": -0.46974782943725585, "num_chars": 10}, {"sum_logits": -9.194192886352539, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -28.83380126953125, "logits_per_token": -1.5323654810587566, "logits_per_char": -0.34052566245750143, "num_chars": 27}, {"sum_logits": -8.235342025756836, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -25.882225036621094, "logits_per_token": -2.058835506439209, "logits_per_char": -0.2745114008585612, "num_chars": 30}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 308, "native_id": "Mercury_SC_413628", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.86997127532959, "incorrect_loss_raw": 13.914057413736979, "correct_loss_per_char": 0.5334604336665227, "incorrect_loss_per_char": 0.49452850561951506, "correct_loss_per_token": 2.311661879221598, "incorrect_loss_per_token": 2.584545432196723, "correct_loss_uncond": -12.21335506439209, "incorrect_loss_uncond": -16.678484598795574}, "model_output": [{"sum_logits": -17.843944549560547, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -33.93601989746094, "logits_per_token": -2.9739907582600913, "logits_per_char": -0.5576232671737671, "num_chars": 32}, {"sum_logits": -14.566137313842773, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -30.758441925048828, "logits_per_token": -2.9132274627685546, "logits_per_char": -0.5202191897800991, "num_chars": 28}, {"sum_logits": -13.86997127532959, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -26.08332633972168, "logits_per_token": -2.311661879221598, "logits_per_char": -0.5334604336665227, "num_chars": 26}, {"sum_logits": -9.332090377807617, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -27.08316421508789, "logits_per_token": -1.8664180755615234, "logits_per_char": -0.405743059904679, "num_chars": 23}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 309, "native_id": "Mercury_LBS10131", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.8864679336547852, "incorrect_loss_raw": 4.718956708908081, "correct_loss_per_char": 0.17149708487770773, "incorrect_loss_per_char": 0.4024663573934204, "correct_loss_per_token": 1.8864679336547852, "incorrect_loss_per_token": 3.4484763940175376, "correct_loss_uncond": -14.522461891174316, "incorrect_loss_uncond": -9.485027233759562}, "model_output": [{"sum_logits": -2.9727072715759277, "num_tokens": 1, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -13.012836456298828, "logits_per_token": -2.9727072715759277, "logits_per_char": -0.2972707271575928, "num_chars": 10}, {"sum_logits": -3.5612809658050537, "num_tokens": 1, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -14.035669326782227, "logits_per_token": -3.5612809658050537, "logits_per_char": -0.3237528150731867, "num_chars": 11}, {"sum_logits": -1.8864679336547852, "num_tokens": 1, "num_tokens_all": 204, "is_greedy": true, "sum_logits_uncond": -16.4089298248291, "logits_per_token": -1.8864679336547852, "logits_per_char": -0.17149708487770773, "num_chars": 11}, {"sum_logits": -7.622881889343262, "num_tokens": 2, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -15.563446044921875, "logits_per_token": -3.811440944671631, "logits_per_char": -0.5863755299494817, "num_chars": 13}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 310, "native_id": "Mercury_7032428", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 8.770648956298828, "incorrect_loss_raw": 13.619033177693685, "correct_loss_per_char": 0.29235496520996096, "incorrect_loss_per_char": 0.40537798949229864, "correct_loss_per_token": 1.4617748260498047, "incorrect_loss_per_token": 1.949589941236708, "correct_loss_uncond": -10.05630111694336, "incorrect_loss_uncond": -14.394749323527018}, "model_output": [{"sum_logits": -8.196529388427734, "num_tokens": 6, "num_tokens_all": 278, "is_greedy": false, "sum_logits_uncond": -22.827991485595703, "logits_per_token": -1.3660882314046223, "logits_per_char": -0.3035751625343605, "num_chars": 27}, {"sum_logits": -8.770648956298828, "num_tokens": 6, "num_tokens_all": 278, "is_greedy": false, "sum_logits_uncond": -18.826950073242188, "logits_per_token": -1.4617748260498047, "logits_per_char": -0.29235496520996096, "num_chars": 30}, {"sum_logits": -15.367128372192383, "num_tokens": 6, "num_tokens_all": 278, "is_greedy": false, "sum_logits_uncond": -29.42688751220703, "logits_per_token": -2.561188062032064, "logits_per_char": -0.48022276163101196, "num_chars": 32}, {"sum_logits": -17.293441772460938, "num_tokens": 9, "num_tokens_all": 281, "is_greedy": false, "sum_logits_uncond": -31.786468505859375, "logits_per_token": -1.9214935302734375, "logits_per_char": -0.43233604431152345, "num_chars": 40}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 311, "native_id": "Mercury_7025008", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.677982330322266, "incorrect_loss_raw": 27.28039296468099, "correct_loss_per_char": 0.7086869478225708, "incorrect_loss_per_char": 0.6038573004398275, "correct_loss_per_token": 2.834747791290283, "incorrect_loss_per_token": 2.99593500384578, "correct_loss_uncond": -8.978271484375, "incorrect_loss_uncond": -9.219989776611328}, "model_output": [{"sum_logits": -22.677982330322266, "num_tokens": 8, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -31.656253814697266, "logits_per_token": -2.834747791290283, "logits_per_char": -0.7086869478225708, "num_chars": 32}, {"sum_logits": -21.37284278869629, "num_tokens": 8, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -28.951534271240234, "logits_per_token": -2.671605348587036, "logits_per_char": -0.49704285555107647, "num_chars": 43}, {"sum_logits": -36.22539138793945, "num_tokens": 10, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -45.702964782714844, "logits_per_token": -3.6225391387939454, "logits_per_char": -0.7875085084334664, "num_chars": 46}, {"sum_logits": -24.242944717407227, "num_tokens": 9, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -34.846649169921875, "logits_per_token": -2.6936605241563587, "logits_per_char": -0.5270205373349397, "num_chars": 46}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 312, "native_id": "MEA_2011_8_19", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.507447719573975, "incorrect_loss_raw": 8.217807451883951, "correct_loss_per_char": 1.1268619298934937, "incorrect_loss_per_char": 1.3303797907299464, "correct_loss_per_token": 4.507447719573975, "incorrect_loss_per_token": 8.217807451883951, "correct_loss_uncond": -6.66514253616333, "incorrect_loss_uncond": -3.686786333719889}, "model_output": [{"sum_logits": -9.489422798156738, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -12.094038009643555, "logits_per_token": -9.489422798156738, "logits_per_char": -1.1861778497695923, "num_chars": 8}, {"sum_logits": -6.835151672363281, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -11.89634895324707, "logits_per_token": -6.835151672363281, "logits_per_char": -1.1391919453938801, "num_chars": 6}, {"sum_logits": -8.328847885131836, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -11.723394393920898, "logits_per_token": -8.328847885131836, "logits_per_char": -1.6657695770263672, "num_chars": 5}, {"sum_logits": -4.507447719573975, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -11.172590255737305, "logits_per_token": -4.507447719573975, "logits_per_char": -1.1268619298934937, "num_chars": 4}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 313, "native_id": "NYSEDREGENTS_2008_8_27", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.049177169799805, "incorrect_loss_raw": 10.245427449544271, "correct_loss_per_char": 2.209835433959961, "incorrect_loss_per_char": 2.2084335486094155, "correct_loss_per_token": 5.524588584899902, "incorrect_loss_per_token": 5.122713724772136, "correct_loss_uncond": -4.853443145751953, "incorrect_loss_uncond": -5.032412528991699}, "model_output": [{"sum_logits": -9.560883522033691, "num_tokens": 2, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -14.716531753540039, "logits_per_token": -4.780441761016846, "logits_per_char": -2.390220880508423, "num_chars": 4}, {"sum_logits": -8.947006225585938, "num_tokens": 2, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -15.220071792602539, "logits_per_token": -4.473503112792969, "logits_per_char": -1.7894012451171875, "num_chars": 5}, {"sum_logits": -11.049177169799805, "num_tokens": 2, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -15.902620315551758, "logits_per_token": -5.524588584899902, "logits_per_char": -2.209835433959961, "num_chars": 5}, {"sum_logits": -12.228392601013184, "num_tokens": 2, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -15.896916389465332, "logits_per_token": -6.114196300506592, "logits_per_char": -2.4456785202026365, "num_chars": 5}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 314, "native_id": "VASoL_2007_5_22", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.008977890014648, "incorrect_loss_raw": 11.922823905944824, "correct_loss_per_char": 0.35320523205925436, "incorrect_loss_per_char": 0.3536075229195232, "correct_loss_per_token": 1.501122236251831, "incorrect_loss_per_token": 1.490352988243103, "correct_loss_uncond": -21.305124282836914, "incorrect_loss_uncond": -22.753631273905437}, "model_output": [{"sum_logits": -13.910371780395508, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -38.496788024902344, "logits_per_token": -1.7387964725494385, "logits_per_char": -0.4215264175877427, "num_chars": 33}, {"sum_logits": -12.008977890014648, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -33.31410217285156, "logits_per_token": -1.501122236251831, "logits_per_char": -0.35320523205925436, "num_chars": 34}, {"sum_logits": -9.135923385620117, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -31.641372680664062, "logits_per_token": -1.1419904232025146, "logits_per_char": -0.2537756496005588, "num_chars": 36}, {"sum_logits": -12.722176551818848, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -33.891204833984375, "logits_per_token": -1.590272068977356, "logits_per_char": -0.3855205015702681, "num_chars": 33}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 315, "native_id": "NCEOGA_2013_5_19", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.5466086864471436, "incorrect_loss_raw": 8.055569012959799, "correct_loss_per_char": 0.3224189714951949, "incorrect_loss_per_char": 0.31343380534459675, "correct_loss_per_token": 1.7733043432235718, "incorrect_loss_per_token": 1.813779460059272, "correct_loss_uncond": -14.800479173660278, "incorrect_loss_uncond": -16.477707862854004}, "model_output": [{"sum_logits": -3.5466086864471436, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -18.347087860107422, "logits_per_token": -1.7733043432235718, "logits_per_char": -0.3224189714951949, "num_chars": 11}, {"sum_logits": -11.192671775817871, "num_tokens": 6, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -28.199365615844727, "logits_per_token": -1.8654452959696453, "logits_per_char": -0.3997382777077811, "num_chars": 28}, {"sum_logits": -4.24066162109375, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -18.564794540405273, "logits_per_token": -2.120330810546875, "logits_per_char": -0.24945068359375, "num_chars": 17}, {"sum_logits": -8.733373641967773, "num_tokens": 6, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -26.835670471191406, "logits_per_token": -1.4555622736612956, "logits_per_char": -0.2911124547322591, "num_chars": 30}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 316, "native_id": "Mercury_7037555", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.168197631835938, "incorrect_loss_raw": 19.52783139546712, "correct_loss_per_char": 0.503656470257303, "incorrect_loss_per_char": 0.47824921044430474, "correct_loss_per_token": 3.8613662719726562, "incorrect_loss_per_token": 3.026914898554484, "correct_loss_uncond": -17.923538208007812, "incorrect_loss_uncond": -17.39892514546712}, "model_output": [{"sum_logits": -23.43743896484375, "num_tokens": 5, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -32.94674301147461, "logits_per_token": -4.68748779296875, "logits_per_char": -0.7324199676513672, "num_chars": 32}, {"sum_logits": -15.008909225463867, "num_tokens": 8, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -35.386775970458984, "logits_per_token": -1.8761136531829834, "logits_per_char": -0.34904440059218295, "num_chars": 43}, {"sum_logits": -23.168197631835938, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -41.09173583984375, "logits_per_token": -3.8613662719726562, "logits_per_char": -0.503656470257303, "num_chars": 46}, {"sum_logits": -20.13714599609375, "num_tokens": 8, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -42.44675064086914, "logits_per_token": -2.5171432495117188, "logits_per_char": -0.35328326308936403, "num_chars": 57}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 317, "native_id": "Mercury_402132", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.7191162109375, "incorrect_loss_raw": 28.18543243408203, "correct_loss_per_char": 0.5248799226721939, "incorrect_loss_per_char": 0.5556237166276886, "correct_loss_per_token": 2.8576795789930554, "incorrect_loss_per_token": 2.9005109302581302, "correct_loss_uncond": -16.99292755126953, "incorrect_loss_uncond": -23.608160654703777}, "model_output": [{"sum_logits": -21.925893783569336, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -46.332603454589844, "logits_per_token": -3.132270540509905, "logits_per_char": -0.47664986486020294, "num_chars": 46}, {"sum_logits": -25.7191162109375, "num_tokens": 9, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -42.71204376220703, "logits_per_token": -2.8576795789930554, "logits_per_char": -0.5248799226721939, "num_chars": 49}, {"sum_logits": -21.00371742248535, "num_tokens": 10, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -50.61278533935547, "logits_per_token": -2.100371742248535, "logits_per_char": -0.4468876047337309, "num_chars": 47}, {"sum_logits": -41.626686096191406, "num_tokens": 12, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -58.43539047241211, "logits_per_token": -3.4688905080159507, "logits_per_char": -0.7433336802891323, "num_chars": 56}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 318, "native_id": "MCAS_2006_8_24", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 27.007537841796875, "incorrect_loss_raw": 17.24016221364339, "correct_loss_per_char": 0.36010050455729165, "incorrect_loss_per_char": 0.23606181164011839, "correct_loss_per_token": 1.8005025227864584, "incorrect_loss_per_token": 1.0141271890378467, "correct_loss_uncond": -12.767135620117188, "incorrect_loss_uncond": -18.390603065490723}, "model_output": [{"sum_logits": -27.007537841796875, "num_tokens": 15, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -39.77467346191406, "logits_per_token": -1.8005025227864584, "logits_per_char": -0.36010050455729165, "num_chars": 75}, {"sum_logits": -18.932287216186523, "num_tokens": 17, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -35.130218505859375, "logits_per_token": -1.1136639538933248, "logits_per_char": -0.2743809741476308, "num_chars": 69}, {"sum_logits": -15.308808326721191, "num_tokens": 17, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -37.07881546020508, "logits_per_token": -0.9005181368659524, "logits_per_char": -0.20970970310576975, "num_chars": 73}, {"sum_logits": -17.47939109802246, "num_tokens": 17, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -34.68326187133789, "logits_per_token": -1.0281994763542623, "logits_per_char": -0.22409475766695464, "num_chars": 78}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 319, "native_id": "Mercury_7128923", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.07785701751709, "incorrect_loss_raw": 5.235362887382507, "correct_loss_per_char": 1.015571403503418, "incorrect_loss_per_char": 0.9346044076813592, "correct_loss_per_token": 5.07785701751709, "incorrect_loss_per_token": 5.235362887382507, "correct_loss_uncond": -7.724581718444824, "incorrect_loss_uncond": -6.541147351264954}, "model_output": [{"sum_logits": -5.583953380584717, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -13.236173629760742, "logits_per_token": -5.583953380584717, "logits_per_char": -1.1167906761169433, "num_chars": 5}, {"sum_logits": -5.07785701751709, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -12.802438735961914, "logits_per_token": -5.07785701751709, "logits_per_char": -1.015571403503418, "num_chars": 5}, {"sum_logits": -8.707015991210938, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -12.204362869262695, "logits_per_token": -8.707015991210938, "logits_per_char": -1.4511693318684895, "num_chars": 6}, {"sum_logits": -1.4151192903518677, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": true, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -1.4151192903518677, "logits_per_char": -0.23585321505864462, "num_chars": 6}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 320, "native_id": "Mercury_416379", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.03786277770996, "incorrect_loss_raw": 20.871618906656902, "correct_loss_per_char": 0.4292475496019636, "incorrect_loss_per_char": 0.4358530710444744, "correct_loss_per_token": 2.6708736419677734, "incorrect_loss_per_token": 2.9466923204048605, "correct_loss_uncond": -12.284463882446289, "incorrect_loss_uncond": -15.672555287679037}, "model_output": [{"sum_logits": -24.03786277770996, "num_tokens": 9, "num_tokens_all": 251, "is_greedy": false, "sum_logits_uncond": -36.32232666015625, "logits_per_token": -2.6708736419677734, "logits_per_char": -0.4292475496019636, "num_chars": 56}, {"sum_logits": -22.189022064208984, "num_tokens": 9, "num_tokens_all": 251, "is_greedy": false, "sum_logits_uncond": -40.78929901123047, "logits_per_token": -2.4654468960232205, "logits_per_char": -0.4109078160038701, "num_chars": 54}, {"sum_logits": -15.246379852294922, "num_tokens": 7, "num_tokens_all": 249, "is_greedy": false, "sum_logits_uncond": -32.38340759277344, "logits_per_token": -2.1780542646135603, "logits_per_char": -0.324391060687126, "num_chars": 47}, {"sum_logits": -25.179454803466797, "num_tokens": 6, "num_tokens_all": 248, "is_greedy": false, "sum_logits_uncond": -36.459815979003906, "logits_per_token": -4.1965758005778, "logits_per_char": -0.5722603364424272, "num_chars": 44}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 321, "native_id": "Mercury_7168053", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.6681809425354, "incorrect_loss_raw": 6.385137557983398, "correct_loss_per_char": 0.6061982675032183, "incorrect_loss_per_char": 0.5722700887256198, "correct_loss_per_token": 3.3340904712677, "incorrect_loss_per_token": 3.578719907336765, "correct_loss_uncond": -10.128282070159912, "incorrect_loss_uncond": -9.27037493387858}, "model_output": [{"sum_logits": -4.830707550048828, "num_tokens": 1, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -15.18421745300293, "logits_per_token": -4.830707550048828, "logits_per_char": -0.6038384437561035, "num_chars": 8}, {"sum_logits": -7.541402339935303, "num_tokens": 3, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -15.631242752075195, "logits_per_token": -2.513800779978434, "logits_per_char": -0.6284501949946085, "num_chars": 12}, {"sum_logits": -6.6681809425354, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -16.796463012695312, "logits_per_token": -3.3340904712677, "logits_per_char": -0.6061982675032183, "num_chars": 11}, {"sum_logits": -6.7833027839660645, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -16.151077270507812, "logits_per_token": -3.3916513919830322, "logits_per_char": -0.48452162742614746, "num_chars": 14}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 322, "native_id": "AKDE&ED_2008_8_1", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.624336242675781, "incorrect_loss_raw": 19.626341183980305, "correct_loss_per_char": 0.2601171957479941, "incorrect_loss_per_char": 0.422307801736574, "correct_loss_per_token": 1.9248672485351563, "incorrect_loss_per_token": 3.0640765283473583, "correct_loss_uncond": -19.73826026916504, "incorrect_loss_uncond": -17.239349047342937}, "model_output": [{"sum_logits": -9.624336242675781, "num_tokens": 5, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -29.36259651184082, "logits_per_token": -1.9248672485351563, "logits_per_char": -0.2601171957479941, "num_chars": 37}, {"sum_logits": -14.230317115783691, "num_tokens": 4, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -31.060152053833008, "logits_per_token": -3.557579278945923, "logits_per_char": -0.36487992604573566, "num_chars": 39}, {"sum_logits": -21.221012115478516, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -38.44062423706055, "logits_per_token": -3.031573159354074, "logits_per_char": -0.45151089607401096, "num_chars": 47}, {"sum_logits": -23.42769432067871, "num_tokens": 9, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -41.09629440307617, "logits_per_token": -2.603077146742079, "logits_per_char": -0.45053258308997524, "num_chars": 52}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 323, "native_id": "Mercury_SC_415476", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.510135650634766, "incorrect_loss_raw": 21.46112887064616, "correct_loss_per_char": 0.7363926569620768, "incorrect_loss_per_char": 0.6596355892758977, "correct_loss_per_token": 3.7871622358049666, "incorrect_loss_per_token": 3.5169536943788877, "correct_loss_uncond": -16.616920471191406, "incorrect_loss_uncond": -14.503431955973307}, "model_output": [{"sum_logits": -18.729873657226562, "num_tokens": 9, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -37.52471160888672, "logits_per_token": -2.0810970730251737, "logits_per_char": -0.5062128015466638, "num_chars": 37}, {"sum_logits": -26.510135650634766, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -43.12705612182617, "logits_per_token": -3.7871622358049666, "logits_per_char": -0.7363926569620768, "num_chars": 36}, {"sum_logits": -25.825355529785156, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -38.561553955078125, "logits_per_token": -5.165071105957031, "logits_per_char": -0.8330759848317792, "num_chars": 31}, {"sum_logits": -19.828157424926758, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -31.807416915893555, "logits_per_token": -3.3046929041544595, "logits_per_char": -0.6396179814492503, "num_chars": 31}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 324, "native_id": "Mercury_7106960", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.132795572280884, "incorrect_loss_raw": 4.390058676401774, "correct_loss_per_char": 0.4265591144561768, "incorrect_loss_per_char": 0.7880877706739637, "correct_loss_per_token": 2.132795572280884, "incorrect_loss_per_token": 4.390058676401774, "correct_loss_uncond": -10.330869436264038, "incorrect_loss_uncond": -7.192471027374268}, "model_output": [{"sum_logits": -2.132795572280884, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": true, "sum_logits_uncond": -12.463665008544922, "logits_per_token": -2.132795572280884, "logits_per_char": -0.4265591144561768, "num_chars": 5}, {"sum_logits": -3.7783260345458984, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -12.056524276733398, "logits_per_token": -3.7783260345458984, "logits_per_char": -0.6297210057576498, "num_chars": 6}, {"sum_logits": -4.314830780029297, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -4.314830780029297, "logits_per_char": -0.7191384633382162, "num_chars": 6}, {"sum_logits": -5.077019214630127, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -12.802070617675781, "logits_per_token": -5.077019214630127, "logits_per_char": -1.0154038429260255, "num_chars": 5}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 325, "native_id": "Mercury_7160563", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 35.7067756652832, "incorrect_loss_raw": 26.42552312215169, "correct_loss_per_char": 0.7001328561820236, "incorrect_loss_per_char": 0.5558314217461481, "correct_loss_per_token": 2.9755646387736, "incorrect_loss_per_token": 3.251196310255262, "correct_loss_uncond": -9.25558090209961, "incorrect_loss_uncond": -10.53195063273112}, "model_output": [{"sum_logits": -16.517169952392578, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -30.24736785888672, "logits_per_token": -3.3034339904785157, "logits_per_char": -0.4588102764553494, "num_chars": 36}, {"sum_logits": -29.28491973876953, "num_tokens": 8, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -37.538475036621094, "logits_per_token": -3.6606149673461914, "logits_per_char": -0.6507759941948785, "num_chars": 45}, {"sum_logits": -35.7067756652832, "num_tokens": 12, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -44.96235656738281, "logits_per_token": -2.9755646387736, "logits_per_char": -0.7001328561820236, "num_chars": 51}, {"sum_logits": -33.47447967529297, "num_tokens": 12, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -43.086578369140625, "logits_per_token": -2.7895399729410806, "logits_per_char": -0.5579079945882162, "num_chars": 60}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 326, "native_id": "Mercury_7068583", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.304582595825195, "incorrect_loss_raw": 10.116572380065918, "correct_loss_per_char": 0.31203427034265857, "incorrect_loss_per_char": 0.6893757251130433, "correct_loss_per_token": 2.6522912979125977, "incorrect_loss_per_token": 5.058286190032959, "correct_loss_uncond": -14.566953659057617, "incorrect_loss_uncond": -9.118470191955566}, "model_output": [{"sum_logits": -11.13259506225586, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -17.85094451904297, "logits_per_token": -5.56629753112793, "logits_per_char": -0.7951853615897042, "num_chars": 14}, {"sum_logits": -5.304582595825195, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -19.871536254882812, "logits_per_token": -2.6522912979125977, "logits_per_char": -0.31203427034265857, "num_chars": 17}, {"sum_logits": -11.306814193725586, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -21.018354415893555, "logits_per_token": -5.653407096862793, "logits_per_char": -0.8076295852661133, "num_chars": 14}, {"sum_logits": -7.910307884216309, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -18.83582878112793, "logits_per_token": -3.9551539421081543, "logits_per_char": -0.4653122284833123, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 327, "native_id": "Mercury_404638", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.911343097686768, "incorrect_loss_raw": 8.339489777882894, "correct_loss_per_char": 0.5911343097686768, "incorrect_loss_per_char": 0.4766806989561568, "correct_loss_per_token": 2.955671548843384, "incorrect_loss_per_token": 1.9912039544847275, "correct_loss_uncond": -11.55495023727417, "incorrect_loss_uncond": -17.773533980051678}, "model_output": [{"sum_logits": -5.090765953063965, "num_tokens": 5, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -31.565811157226562, "logits_per_token": -1.018153190612793, "logits_per_char": -0.2679350501612613, "num_chars": 19}, {"sum_logits": -12.653318405151367, "num_tokens": 5, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -26.264270782470703, "logits_per_token": -2.5306636810302736, "logits_per_char": -0.6025389716738746, "num_chars": 21}, {"sum_logits": -7.27438497543335, "num_tokens": 3, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -20.508989334106445, "logits_per_token": -2.4247949918111167, "logits_per_char": -0.5595680750333346, "num_chars": 13}, {"sum_logits": -5.911343097686768, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -17.466293334960938, "logits_per_token": -2.955671548843384, "logits_per_char": -0.5911343097686768, "num_chars": 10}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 328, "native_id": "Mercury_SC_407138", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 24.97188949584961, "incorrect_loss_raw": 22.75406010945638, "correct_loss_per_char": 0.9248847961425781, "incorrect_loss_per_char": 0.8303764703202289, "correct_loss_per_token": 3.121486186981201, "incorrect_loss_per_token": 3.407723593333411, "correct_loss_uncond": -10.738208770751953, "incorrect_loss_uncond": -4.350873311360677}, "model_output": [{"sum_logits": -24.97188949584961, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -35.71009826660156, "logits_per_token": -3.121486186981201, "logits_per_char": -0.9248847961425781, "num_chars": 27}, {"sum_logits": -19.800090789794922, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -27.23061752319336, "logits_per_token": -3.300015131632487, "logits_per_char": -0.7333366959183304, "num_chars": 27}, {"sum_logits": -28.04721450805664, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -31.277219772338867, "logits_per_token": -4.006744929722378, "logits_per_char": -1.0016862324305944, "num_chars": 28}, {"sum_logits": -20.414875030517578, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -22.806962966918945, "logits_per_token": -2.9164107186453685, "logits_per_char": -0.7561064826117622, "num_chars": 27}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 329, "native_id": "MCAS_2000_4_10", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.580769062042236, "incorrect_loss_raw": 3.693230946858724, "correct_loss_per_char": 1.395192265510559, "incorrect_loss_per_char": 0.6222357757507808, "correct_loss_per_token": 5.580769062042236, "incorrect_loss_per_token": 3.693230946858724, "correct_loss_uncond": -7.339505672454834, "incorrect_loss_uncond": -11.828562418619791}, "model_output": [{"sum_logits": -3.673950672149658, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -14.512763977050781, "logits_per_token": -3.673950672149658, "logits_per_char": -0.7347901344299317, "num_chars": 5}, {"sum_logits": -4.299673080444336, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -17.063583374023438, "logits_per_token": -4.299673080444336, "logits_per_char": -0.614239011492048, "num_chars": 7}, {"sum_logits": -3.1060690879821777, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -14.989032745361328, "logits_per_token": -3.1060690879821777, "logits_per_char": -0.5176781813303629, "num_chars": 6}, {"sum_logits": -5.580769062042236, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -12.92027473449707, "logits_per_token": -5.580769062042236, "logits_per_char": -1.395192265510559, "num_chars": 4}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 330, "native_id": "Mercury_177748", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.825355529785156, "incorrect_loss_raw": 3.605169693628947, "correct_loss_per_char": 1.117907932826451, "incorrect_loss_per_char": 0.45852788510145964, "correct_loss_per_token": 7.825355529785156, "incorrect_loss_per_token": 3.605169693628947, "correct_loss_uncond": -5.605432510375977, "incorrect_loss_uncond": -9.790710687637329}, "model_output": [{"sum_logits": -2.2862508296966553, "num_tokens": 1, "num_tokens_all": 212, "is_greedy": true, "sum_logits_uncond": -11.46108627319336, "logits_per_token": -2.2862508296966553, "logits_per_char": -0.38104180494944256, "num_chars": 6}, {"sum_logits": -3.3729472160339355, "num_tokens": 1, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -13.935014724731445, "logits_per_token": -3.3729472160339355, "logits_per_char": -0.42161840200424194, "num_chars": 8}, {"sum_logits": -5.15631103515625, "num_tokens": 1, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -14.791540145874023, "logits_per_token": -5.15631103515625, "logits_per_char": -0.5729234483506944, "num_chars": 9}, {"sum_logits": -7.825355529785156, "num_tokens": 1, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -13.430788040161133, "logits_per_token": -7.825355529785156, "logits_per_char": -1.117907932826451, "num_chars": 7}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 331, "native_id": "MCAS_2004_9_21-v1", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 17.091670989990234, "incorrect_loss_raw": 29.23922602335612, "correct_loss_per_char": 0.258964711969549, "incorrect_loss_per_char": 0.5170083075374177, "correct_loss_per_token": 1.4243059158325195, "incorrect_loss_per_token": 2.6069922610049407, "correct_loss_uncond": -17.231708526611328, "incorrect_loss_uncond": -12.589168548583984}, "model_output": [{"sum_logits": -31.40286636352539, "num_tokens": 11, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -42.418846130371094, "logits_per_token": -2.8548060330477627, "logits_per_char": -0.6039012762216421, "num_chars": 52}, {"sum_logits": -23.780841827392578, "num_tokens": 9, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -41.66046905517578, "logits_per_token": -2.6423157585991754, "logits_per_char": -0.4756168365478516, "num_chars": 50}, {"sum_logits": -17.091670989990234, "num_tokens": 12, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -34.32337951660156, "logits_per_token": -1.4243059158325195, "logits_per_char": -0.258964711969549, "num_chars": 66}, {"sum_logits": -32.53396987915039, "num_tokens": 14, "num_tokens_all": 241, "is_greedy": false, "sum_logits_uncond": -41.40586853027344, "logits_per_token": -2.323854991367885, "logits_per_char": -0.4715068098427593, "num_chars": 69}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 332, "native_id": "MDSA_2007_5_16", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.063088417053223, "incorrect_loss_raw": 7.365751425425212, "correct_loss_per_char": 0.28252353668212893, "incorrect_loss_per_char": 0.6232475004498921, "correct_loss_per_token": 3.5315442085266113, "incorrect_loss_per_token": 4.971273024876912, "correct_loss_uncond": -13.899147987365723, "incorrect_loss_uncond": -7.887982209523519}, "model_output": [{"sum_logits": -7.063088417053223, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -20.962236404418945, "logits_per_token": -3.5315442085266113, "logits_per_char": -0.28252353668212893, "num_chars": 25}, {"sum_logits": -8.444947242736816, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -17.04598617553711, "logits_per_token": -4.222473621368408, "logits_per_char": -0.6032105173383441, "num_chars": 14}, {"sum_logits": -5.9219231605529785, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -15.626591682434082, "logits_per_token": -2.9609615802764893, "logits_per_char": -0.49349359671274823, "num_chars": 12}, {"sum_logits": -7.73038387298584, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -13.088623046875, "logits_per_token": -7.73038387298584, "logits_per_char": -0.773038387298584, "num_chars": 10}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 333, "native_id": "Mercury_401763", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.230093002319336, "incorrect_loss_raw": 19.648452758789062, "correct_loss_per_char": 1.602507750193278, "incorrect_loss_per_char": 2.0917662337974265, "correct_loss_per_token": 2.403761625289917, "incorrect_loss_per_token": 3.2113658799065483, "correct_loss_uncond": -9.359659194946289, "incorrect_loss_uncond": -11.749385197957357}, "model_output": [{"sum_logits": -17.691967010498047, "num_tokens": 5, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -26.972869873046875, "logits_per_token": -3.5383934020996093, "logits_per_char": -2.211495876312256, "num_chars": 8}, {"sum_logits": -19.230093002319336, "num_tokens": 8, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -28.589752197265625, "logits_per_token": -2.403761625289917, "logits_per_char": -1.602507750193278, "num_chars": 12}, {"sum_logits": -22.536727905273438, "num_tokens": 6, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -37.482566833496094, "logits_per_token": -3.7561213175455728, "logits_per_char": -2.504080878363715, "num_chars": 9}, {"sum_logits": -18.716663360595703, "num_tokens": 8, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -29.73807716369629, "logits_per_token": -2.339582920074463, "logits_per_char": -1.5597219467163086, "num_chars": 12}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 334, "native_id": "Mercury_7268118", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.26160430908203, "incorrect_loss_raw": 23.0857728322347, "correct_loss_per_char": 0.6325804766486672, "incorrect_loss_per_char": 0.4650720654468105, "correct_loss_per_token": 2.9328731190074575, "incorrect_loss_per_token": 3.065326599847703, "correct_loss_uncond": -9.230613708496094, "incorrect_loss_uncond": -18.85774803161621}, "model_output": [{"sum_logits": -19.130722045898438, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -43.348941802978516, "logits_per_token": -2.3913402557373047, "logits_per_char": -0.3904228988958865, "num_chars": 49}, {"sum_logits": -19.952957153320312, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -43.24854278564453, "logits_per_token": -2.494119644165039, "logits_per_char": -0.42453100326213433, "num_chars": 47}, {"sum_logits": -32.26160430908203, "num_tokens": 11, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -41.492218017578125, "logits_per_token": -2.9328731190074575, "logits_per_char": -0.6325804766486672, "num_chars": 51}, {"sum_logits": -30.17363929748535, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -39.23307800292969, "logits_per_token": -4.310519899640765, "logits_per_char": -0.5802622941824106, "num_chars": 52}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 335, "native_id": "Mercury_403232", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.794546127319336, "incorrect_loss_raw": 5.175024509429932, "correct_loss_per_char": 0.6495455106099447, "incorrect_loss_per_char": 0.44548841678734985, "correct_loss_per_token": 2.598182042439779, "incorrect_loss_per_token": 1.7250081698099773, "correct_loss_uncond": -10.702836990356445, "incorrect_loss_uncond": -11.519854386647543}, "model_output": [{"sum_logits": -5.055032253265381, "num_tokens": 3, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -17.596782684326172, "logits_per_token": -1.6850107510884602, "logits_per_char": -0.42125268777211505, "num_chars": 12}, {"sum_logits": -5.637604236602783, "num_tokens": 3, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -16.10842514038086, "logits_per_token": -1.8792014122009277, "logits_per_char": -0.5125094760547985, "num_chars": 11}, {"sum_logits": -4.832437038421631, "num_tokens": 3, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -16.37942886352539, "logits_per_token": -1.6108123461405437, "logits_per_char": -0.4027030865351359, "num_chars": 12}, {"sum_logits": -7.794546127319336, "num_tokens": 3, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -18.49738311767578, "logits_per_token": -2.598182042439779, "logits_per_char": -0.6495455106099447, "num_chars": 12}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 336, "native_id": "Mercury_415081", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 6.479570388793945, "incorrect_loss_raw": 7.579998016357422, "correct_loss_per_char": 0.8099462985992432, "incorrect_loss_per_char": 0.9854697329657419, "correct_loss_per_token": 1.0799283981323242, "incorrect_loss_per_token": 1.4405262470245361, "correct_loss_uncond": -10.758468627929688, "incorrect_loss_uncond": -10.267597198486328}, "model_output": [{"sum_logits": -6.479570388793945, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -17.238039016723633, "logits_per_token": -1.0799283981323242, "logits_per_char": -0.8099462985992432, "num_chars": 8}, {"sum_logits": -8.471122741699219, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -17.48028564453125, "logits_per_token": -1.4118537902832031, "logits_per_char": -1.0588903427124023, "num_chars": 8}, {"sum_logits": -7.889914512634277, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -18.393613815307617, "logits_per_token": -1.314985752105713, "logits_per_char": -0.9862393140792847, "num_chars": 8}, {"sum_logits": -6.3789567947387695, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -17.668886184692383, "logits_per_token": -1.5947391986846924, "logits_per_char": -0.9112795421055385, "num_chars": 7}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 337, "native_id": "Mercury_7206378", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.891767501831055, "incorrect_loss_raw": 26.549253463745117, "correct_loss_per_char": 0.5827260366300258, "incorrect_loss_per_char": 0.49897423027981264, "correct_loss_per_token": 3.4131096431187222, "incorrect_loss_per_token": 2.828151279025608, "correct_loss_uncond": -19.988367080688477, "incorrect_loss_uncond": -21.65956179300944}, "model_output": [{"sum_logits": -22.1815242767334, "num_tokens": 9, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -44.563323974609375, "logits_per_token": -2.4646138085259333, "logits_per_char": -0.4929227617051866, "num_chars": 45}, {"sum_logits": -23.891767501831055, "num_tokens": 7, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -43.88013458251953, "logits_per_token": -3.4131096431187222, "logits_per_char": -0.5827260366300258, "num_chars": 41}, {"sum_logits": -24.5894775390625, "num_tokens": 9, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -48.33620834350586, "logits_per_token": -2.7321641710069446, "logits_per_char": -0.48214661841299017, "num_chars": 51}, {"sum_logits": -32.87675857543945, "num_tokens": 10, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -51.72691345214844, "logits_per_token": -3.2876758575439453, "logits_per_char": -0.5218533107212612, "num_chars": 63}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 338, "native_id": "CSZ30169", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.483495712280273, "incorrect_loss_raw": 16.49336115519206, "correct_loss_per_char": 0.7216109523066768, "incorrect_loss_per_char": 0.6371997132079441, "correct_loss_per_token": 2.435436964035034, "incorrect_loss_per_token": 2.237517447698684, "correct_loss_uncond": -7.370658874511719, "incorrect_loss_uncond": -9.927766799926758}, "model_output": [{"sum_logits": -19.483495712280273, "num_tokens": 8, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -26.854154586791992, "logits_per_token": -2.435436964035034, "logits_per_char": -0.7216109523066768, "num_chars": 27}, {"sum_logits": -19.93773651123047, "num_tokens": 8, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -30.385107040405273, "logits_per_token": -2.4922170639038086, "logits_per_char": -0.7975094604492188, "num_chars": 25}, {"sum_logits": -15.552412986755371, "num_tokens": 7, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -24.39860725402832, "logits_per_token": -2.2217732838221957, "logits_per_char": -0.5760152958057545, "num_chars": 27}, {"sum_logits": -13.989933967590332, "num_tokens": 7, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -24.47966957092285, "logits_per_token": -1.9985619953700475, "logits_per_char": -0.5380743833688589, "num_chars": 26}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 339, "native_id": "Mercury_7013948", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 14.633138656616211, "incorrect_loss_raw": 15.86557420094808, "correct_loss_per_char": 0.4434284441398852, "incorrect_loss_per_char": 0.559305347896866, "correct_loss_per_token": 3.6582846641540527, "incorrect_loss_per_token": 3.6702157497406005, "correct_loss_uncond": -12.213571548461914, "incorrect_loss_uncond": -11.419262250264486}, "model_output": [{"sum_logits": -15.624696731567383, "num_tokens": 4, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -27.08989715576172, "logits_per_token": -3.9061741828918457, "logits_per_char": -0.5786924715395327, "num_chars": 27}, {"sum_logits": -17.770668029785156, "num_tokens": 5, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -30.22947120666504, "logits_per_token": -3.5541336059570314, "logits_per_char": -0.5732473557995211, "num_chars": 31}, {"sum_logits": -14.2013578414917, "num_tokens": 4, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -24.535140991210938, "logits_per_token": -3.550339460372925, "logits_per_char": -0.5259762163515445, "num_chars": 27}, {"sum_logits": -14.633138656616211, "num_tokens": 4, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -26.846710205078125, "logits_per_token": -3.6582846641540527, "logits_per_char": -0.4434284441398852, "num_chars": 33}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 340, "native_id": "Mercury_SC_402164", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.8068132400512695, "incorrect_loss_raw": 8.960231939951578, "correct_loss_per_char": 0.40056777000427246, "incorrect_loss_per_char": 0.8711020468862771, "correct_loss_per_token": 4.8068132400512695, "incorrect_loss_per_token": 8.960231939951578, "correct_loss_uncond": -9.286526679992676, "incorrect_loss_uncond": -5.032349109649658}, "model_output": [{"sum_logits": -10.092622756958008, "num_tokens": 1, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -12.102094650268555, "logits_per_token": -10.092622756958008, "logits_per_char": -1.1214025285508897, "num_chars": 9}, {"sum_logits": -12.262473106384277, "num_tokens": 1, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -15.053333282470703, "logits_per_token": -12.262473106384277, "logits_per_char": -1.1147702823985706, "num_chars": 11}, {"sum_logits": -4.8068132400512695, "num_tokens": 1, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -14.093339920043945, "logits_per_token": -4.8068132400512695, "logits_per_char": -0.40056777000427246, "num_chars": 12}, {"sum_logits": -4.525599956512451, "num_tokens": 1, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -14.822315216064453, "logits_per_token": -4.525599956512451, "logits_per_char": -0.3771333297093709, "num_chars": 12}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 341, "native_id": "Mercury_400880", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 9.559170722961426, "incorrect_loss_raw": 10.66812515258789, "correct_loss_per_char": 0.47795853614807127, "incorrect_loss_per_char": 0.5958951052497415, "correct_loss_per_token": 2.3897926807403564, "incorrect_loss_per_token": 2.6670312881469727, "correct_loss_uncond": -12.425543785095215, "incorrect_loss_uncond": -14.772596995035807}, "model_output": [{"sum_logits": -9.94207763671875, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -24.906417846679688, "logits_per_token": -2.4855194091796875, "logits_per_char": -0.5848280962775735, "num_chars": 17}, {"sum_logits": -11.304130554199219, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -26.736061096191406, "logits_per_token": -2.8260326385498047, "logits_per_char": -0.6649488561293658, "num_chars": 17}, {"sum_logits": -9.559170722961426, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.98471450805664, "logits_per_token": -2.3897926807403564, "logits_per_char": -0.47795853614807127, "num_chars": 20}, {"sum_logits": -10.758167266845703, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -24.6796875, "logits_per_token": -2.689541816711426, "logits_per_char": -0.5379083633422852, "num_chars": 20}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 342, "native_id": "Mercury_7040793", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.087676048278809, "incorrect_loss_raw": 9.679872671763102, "correct_loss_per_char": 0.32040400254098994, "incorrect_loss_per_char": 0.3627834336630771, "correct_loss_per_token": 2.0292253494262695, "incorrect_loss_per_token": 1.873451110294887, "correct_loss_uncond": -17.413229942321777, "incorrect_loss_uncond": -19.477503299713135}, "model_output": [{"sum_logits": -4.554346561431885, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -23.9061336517334, "logits_per_token": -1.518115520477295, "logits_per_char": -0.26790273890775795, "num_chars": 17}, {"sum_logits": -6.087676048278809, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -23.500905990600586, "logits_per_token": -2.0292253494262695, "logits_per_char": -0.32040400254098994, "num_chars": 19}, {"sum_logits": -10.575983047485352, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.88988494873047, "logits_per_token": -2.1151966094970702, "logits_per_char": -0.42303932189941407, "num_chars": 25}, {"sum_logits": -13.90928840637207, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -33.676109313964844, "logits_per_token": -1.9870412009102958, "logits_per_char": -0.39740824018205917, "num_chars": 35}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 343, "native_id": "MDSA_2010_5_29", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 20.910476684570312, "incorrect_loss_raw": 16.32387701670329, "correct_loss_per_char": 0.240350306719199, "incorrect_loss_per_char": 0.3624977227179437, "correct_loss_per_token": 1.3069047927856445, "incorrect_loss_per_token": 2.1990934875276356, "correct_loss_uncond": -32.78278350830078, "incorrect_loss_uncond": -26.33708922068278}, "model_output": [{"sum_logits": -11.897590637207031, "num_tokens": 4, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -35.51030731201172, "logits_per_token": -2.974397659301758, "logits_per_char": -0.4406515050817419, "num_chars": 27}, {"sum_logits": -12.535250663757324, "num_tokens": 6, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -33.24836349487305, "logits_per_token": -2.089208443959554, "logits_per_char": -0.358150018964495, "num_chars": 35}, {"sum_logits": -24.538789749145508, "num_tokens": 16, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -59.22422790527344, "logits_per_token": -1.5336743593215942, "logits_per_char": -0.28869164410759424, "num_chars": 85}, {"sum_logits": -20.910476684570312, "num_tokens": 16, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -53.693260192871094, "logits_per_token": -1.3069047927856445, "logits_per_char": -0.240350306719199, "num_chars": 87}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 344, "native_id": "LEAP__8_10365", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.38613510131836, "incorrect_loss_raw": 29.37297248840332, "correct_loss_per_char": 0.6330444812774658, "incorrect_loss_per_char": 0.6063166206704694, "correct_loss_per_token": 2.0257423400878904, "incorrect_loss_per_token": 2.277055813864209, "correct_loss_uncond": -24.302608489990234, "incorrect_loss_uncond": -23.261512120564777}, "model_output": [{"sum_logits": -30.38613510131836, "num_tokens": 15, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -54.688743591308594, "logits_per_token": -2.0257423400878904, "logits_per_char": -0.6330444812774658, "num_chars": 48}, {"sum_logits": -27.934551239013672, "num_tokens": 11, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -49.80530548095703, "logits_per_token": -2.539504658092152, "logits_per_char": -0.6983637809753418, "num_chars": 40}, {"sum_logits": -27.238740921020508, "num_tokens": 13, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -54.52492904663086, "logits_per_token": -2.0952877631554236, "logits_per_char": -0.46167357493255096, "num_chars": 59}, {"sum_logits": -32.94562530517578, "num_tokens": 15, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -53.573219299316406, "logits_per_token": -2.196375020345052, "logits_per_char": -0.6589125061035156, "num_chars": 50}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 345, "native_id": "Mercury_SC_401295", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 21.82328224182129, "incorrect_loss_raw": 26.779067993164062, "correct_loss_per_char": 0.4196785046504094, "incorrect_loss_per_char": 0.5142728469332574, "correct_loss_per_token": 1.9839347492564807, "incorrect_loss_per_token": 2.515194621230617, "correct_loss_uncond": -22.766084671020508, "incorrect_loss_uncond": -16.643834431966145}, "model_output": [{"sum_logits": -26.64218521118164, "num_tokens": 10, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -37.88724899291992, "logits_per_token": -2.664218521118164, "logits_per_char": -0.5920485602484808, "num_chars": 45}, {"sum_logits": -21.82328224182129, "num_tokens": 11, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -44.5893669128418, "logits_per_token": -1.9839347492564807, "logits_per_char": -0.4196785046504094, "num_chars": 52}, {"sum_logits": -27.314144134521484, "num_tokens": 11, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -48.61813735961914, "logits_per_token": -2.483104012229226, "logits_per_char": -0.5252720025869516, "num_chars": 52}, {"sum_logits": -26.380874633789062, "num_tokens": 11, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -43.76332092285156, "logits_per_token": -2.3982613303444604, "logits_per_char": -0.4254979779643397, "num_chars": 62}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 346, "native_id": "MCAS_2012_5_23625", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.895137786865234, "incorrect_loss_raw": 11.095920244852701, "correct_loss_per_char": 0.9938316345214844, "incorrect_loss_per_char": 0.6195338161395457, "correct_loss_per_token": 4.223784446716309, "incorrect_loss_per_token": 2.7739800612131753, "correct_loss_uncond": -14.189945220947266, "incorrect_loss_uncond": -17.204819679260254}, "model_output": [{"sum_logits": -10.711038589477539, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -31.21366310119629, "logits_per_token": -2.6777596473693848, "logits_per_char": -0.5950576994154189, "num_chars": 18}, {"sum_logits": -10.416544914245605, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -26.553686141967773, "logits_per_token": -2.6041362285614014, "logits_per_char": -0.5482392060129266, "num_chars": 19}, {"sum_logits": -16.895137786865234, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -31.0850830078125, "logits_per_token": -4.223784446716309, "logits_per_char": -0.9938316345214844, "num_chars": 17}, {"sum_logits": -12.160177230834961, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -27.134870529174805, "logits_per_token": -3.0400443077087402, "logits_per_char": -0.7153045429902918, "num_chars": 17}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 347, "native_id": "Mercury_7268048", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 12.71431827545166, "incorrect_loss_raw": 18.154743194580078, "correct_loss_per_char": 0.42381060918172203, "incorrect_loss_per_char": 0.5712581615255337, "correct_loss_per_token": 2.542863655090332, "incorrect_loss_per_token": 2.813417540656195, "correct_loss_uncond": -6.8033552169799805, "incorrect_loss_uncond": -7.285991032918294}, "model_output": [{"sum_logits": -20.903284072875977, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -28.524932861328125, "logits_per_token": -3.4838806788126626, "logits_per_char": -0.6967761357625325, "num_chars": 30}, {"sum_logits": -18.270090103149414, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -26.0422306060791, "logits_per_token": -3.045015017191569, "logits_per_char": -0.5536390940348307, "num_chars": 33}, {"sum_logits": -12.71431827545166, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -19.51767349243164, "logits_per_token": -2.542863655090332, "logits_per_char": -0.42381060918172203, "num_chars": 30}, {"sum_logits": -15.290855407714844, "num_tokens": 8, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -21.75503921508789, "logits_per_token": -1.9113569259643555, "logits_per_char": -0.4633592547792377, "num_chars": 33}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 348, "native_id": "Mercury_SC_402629", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.132843494415283, "incorrect_loss_raw": 6.463849862416585, "correct_loss_per_char": 0.3916054368019104, "incorrect_loss_per_char": 0.5167140566117582, "correct_loss_per_token": 3.132843494415283, "incorrect_loss_per_token": 3.2319249312082925, "correct_loss_uncond": -9.900203227996826, "incorrect_loss_uncond": -12.080065886179606}, "model_output": [{"sum_logits": -3.132843494415283, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -13.03304672241211, "logits_per_token": -3.132843494415283, "logits_per_char": -0.3916054368019104, "num_chars": 8}, {"sum_logits": -6.669572830200195, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -18.0288028717041, "logits_per_token": -3.3347864151000977, "logits_per_char": -0.6669572830200196, "num_chars": 10}, {"sum_logits": -6.95749044418335, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -15.478137016296387, "logits_per_token": -3.478745222091675, "logits_per_char": -0.5797908703486124, "num_chars": 12}, {"sum_logits": -5.764486312866211, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -22.124807357788086, "logits_per_token": -2.8822431564331055, "logits_per_char": -0.30339401646664266, "num_chars": 19}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 349, "native_id": "NCEOGA_2013_8_42", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.753210067749023, "incorrect_loss_raw": 21.87054506937663, "correct_loss_per_char": 0.617287814617157, "incorrect_loss_per_char": 0.6527710130407639, "correct_loss_per_token": 3.9506420135498046, "incorrect_loss_per_token": 3.4126340411958243, "correct_loss_uncond": -12.848909378051758, "incorrect_loss_uncond": -14.146123886108398}, "model_output": [{"sum_logits": -15.134199142456055, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -27.205402374267578, "logits_per_token": -3.026839828491211, "logits_per_char": -0.5044733047485351, "num_chars": 30}, {"sum_logits": -21.11752700805664, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -41.1475830078125, "logits_per_token": -3.01678957257952, "logits_per_char": -0.6812105486469884, "num_chars": 31}, {"sum_logits": -29.359909057617188, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -39.697021484375, "logits_per_token": -4.194272722516741, "logits_per_char": -0.7726291857267681, "num_chars": 38}, {"sum_logits": -19.753210067749023, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -32.60211944580078, "logits_per_token": -3.9506420135498046, "logits_per_char": -0.617287814617157, "num_chars": 32}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 350, "native_id": "Mercury_412463", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 4.8815507888793945, "incorrect_loss_raw": 4.637479941050212, "correct_loss_per_char": 2.4407753944396973, "incorrect_loss_per_char": 2.318739970525106, "correct_loss_per_token": 4.8815507888793945, "incorrect_loss_per_token": 4.637479941050212, "correct_loss_uncond": -1.4919891357421875, "incorrect_loss_uncond": -0.9461545944213867}, "model_output": [{"sum_logits": -5.275396347045898, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -6.525059700012207, "logits_per_token": -5.275396347045898, "logits_per_char": -2.637698173522949, "num_chars": 2}, {"sum_logits": -4.3232316970825195, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -4.448035717010498, "logits_per_token": -4.3232316970825195, "logits_per_char": -2.1616158485412598, "num_chars": 2}, {"sum_logits": -4.313811779022217, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -5.77780818939209, "logits_per_token": -4.313811779022217, "logits_per_char": -2.1569058895111084, "num_chars": 2}, {"sum_logits": -4.8815507888793945, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -6.373539924621582, "logits_per_token": -4.8815507888793945, "logits_per_char": -2.4407753944396973, "num_chars": 2}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 351, "native_id": "Mercury_409295", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.448278427124023, "incorrect_loss_raw": 29.635616302490234, "correct_loss_per_char": 0.6465517824346368, "incorrect_loss_per_char": 0.6909819042133445, "correct_loss_per_token": 3.1609198252360025, "incorrect_loss_per_token": 3.3173181131059244, "correct_loss_uncond": -6.417497634887695, "incorrect_loss_uncond": -5.843120574951172}, "model_output": [{"sum_logits": -30.586204528808594, "num_tokens": 11, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -39.61305618286133, "logits_per_token": -2.7805640480735083, "logits_per_char": -0.6372125943501791, "num_chars": 48}, {"sum_logits": -28.448278427124023, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -34.86577606201172, "logits_per_token": -3.1609198252360025, "logits_per_char": -0.6465517824346368, "num_chars": 44}, {"sum_logits": -21.776538848876953, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -27.390811920166016, "logits_per_token": -3.1109341212681363, "logits_per_char": -0.5444134712219239, "num_chars": 40}, {"sum_logits": -36.544105529785156, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -39.432342529296875, "logits_per_token": -4.060456169976129, "logits_per_char": -0.8913196470679307, "num_chars": 41}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 352, "native_id": "Mercury_404609", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 16.475631713867188, "incorrect_loss_raw": 14.445858001708984, "correct_loss_per_char": 0.7488923506303267, "incorrect_loss_per_char": 0.9009270969190096, "correct_loss_per_token": 4.118907928466797, "incorrect_loss_per_token": 4.891590701209174, "correct_loss_uncond": -12.842586517333984, "incorrect_loss_uncond": -13.065300623575846}, "model_output": [{"sum_logits": -9.847402572631836, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -19.88345718383789, "logits_per_token": -4.923701286315918, "logits_per_char": -0.9847402572631836, "num_chars": 10}, {"sum_logits": -16.542335510253906, "num_tokens": 3, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -30.51475715637207, "logits_per_token": -5.514111836751302, "logits_per_char": -0.8706492373817846, "num_chars": 19}, {"sum_logits": -16.94783592224121, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -32.13526153564453, "logits_per_token": -4.236958980560303, "logits_per_char": -0.8473917961120605, "num_chars": 20}, {"sum_logits": -16.475631713867188, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -29.318218231201172, "logits_per_token": -4.118907928466797, "logits_per_char": -0.7488923506303267, "num_chars": 22}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 353, "native_id": "Mercury_7230090", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.473735809326172, "incorrect_loss_raw": 8.720409393310547, "correct_loss_per_char": 0.5338382720947266, "incorrect_loss_per_char": 0.5647916516298971, "correct_loss_per_token": 2.4912452697753906, "incorrect_loss_per_token": 2.882251818974813, "correct_loss_uncond": -8.365765571594238, "incorrect_loss_uncond": -9.59383773803711}, "model_output": [{"sum_logits": -8.452166557312012, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -16.549518585205078, "logits_per_token": -2.113041639328003, "logits_per_char": -0.4695648087395562, "num_chars": 18}, {"sum_logits": -9.283267974853516, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -19.608013153076172, "logits_per_token": -2.320816993713379, "logits_per_char": -0.663090569632394, "num_chars": 14}, {"sum_logits": -7.473735809326172, "num_tokens": 3, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -15.83950138092041, "logits_per_token": -2.4912452697753906, "logits_per_char": -0.5338382720947266, "num_chars": 14}, {"sum_logits": -8.425793647766113, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -18.78520965576172, "logits_per_token": -4.212896823883057, "logits_per_char": -0.5617195765177408, "num_chars": 15}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 354, "native_id": "Mercury_7057488", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.5483341217041, "incorrect_loss_raw": 12.463730176289877, "correct_loss_per_char": 0.5476356772489326, "incorrect_loss_per_char": 0.3871332742234337, "correct_loss_per_token": 2.9435417652130127, "incorrect_loss_per_token": 1.75479097366333, "correct_loss_uncond": -16.969274520874023, "incorrect_loss_uncond": -20.158947308858234}, "model_output": [{"sum_logits": -8.965872764587402, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -29.71195411682129, "logits_per_token": -1.4943121274312336, "logits_per_char": -0.3586349105834961, "num_chars": 25}, {"sum_logits": -14.512382507324219, "num_tokens": 10, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -33.53062438964844, "logits_per_token": -1.451238250732422, "logits_per_char": -0.35396054895912726, "num_chars": 41}, {"sum_logits": -13.912935256958008, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -34.62545394897461, "logits_per_token": -2.3188225428263345, "logits_per_char": -0.4488043631276777, "num_chars": 31}, {"sum_logits": -23.5483341217041, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -40.517608642578125, "logits_per_token": -2.9435417652130127, "logits_per_char": -0.5476356772489326, "num_chars": 43}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 355, "native_id": "MDSA_2009_4_1", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.003678321838379, "incorrect_loss_raw": 8.486043135325113, "correct_loss_per_char": 0.29433401893166933, "incorrect_loss_per_char": 0.6688851084974078, "correct_loss_per_token": 1.6678927739461262, "incorrect_loss_per_token": 2.8286810451083717, "correct_loss_uncond": -18.204970359802246, "incorrect_loss_uncond": -10.741663773854574}, "model_output": [{"sum_logits": -10.190649032592773, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -19.40886116027832, "logits_per_token": -3.396883010864258, "logits_per_char": -0.8492207527160645, "num_chars": 12}, {"sum_logits": -9.75441837310791, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -19.790647506713867, "logits_per_token": -3.25147279103597, "logits_per_char": -0.8128681977589926, "num_chars": 12}, {"sum_logits": -5.513062000274658, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -18.483612060546875, "logits_per_token": -1.837687333424886, "logits_per_char": -0.34456637501716614, "num_chars": 16}, {"sum_logits": -5.003678321838379, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -23.208648681640625, "logits_per_token": -1.6678927739461262, "logits_per_char": -0.29433401893166933, "num_chars": 17}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 356, "native_id": "Mercury_7150728", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 24.87773895263672, "incorrect_loss_raw": 37.069920221964516, "correct_loss_per_char": 0.3658491022446576, "incorrect_loss_per_char": 0.6581786093790934, "correct_loss_per_token": 2.7641932169596353, "incorrect_loss_per_token": 3.6126110968087133, "correct_loss_uncond": -12.603992462158203, "incorrect_loss_uncond": -13.258207321166992}, "model_output": [{"sum_logits": -24.87773895263672, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -37.48173141479492, "logits_per_token": -2.7641932169596353, "logits_per_char": -0.3658491022446576, "num_chars": 68}, {"sum_logits": -40.03139114379883, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -50.89847946166992, "logits_per_token": -4.447932349310981, "logits_per_char": -0.8006278228759766, "num_chars": 50}, {"sum_logits": -31.543893814086914, "num_tokens": 13, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -48.770660400390625, "logits_per_token": -2.426453370314378, "logits_per_char": -0.5346422680353714, "num_chars": 59}, {"sum_logits": -39.63447570800781, "num_tokens": 10, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -51.315242767333984, "logits_per_token": -3.9634475708007812, "logits_per_char": -0.6392657372259325, "num_chars": 62}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 357, "native_id": "Mercury_402207", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 14.840887069702148, "incorrect_loss_raw": 22.618107477823894, "correct_loss_per_char": 0.3157635546745138, "incorrect_loss_per_char": 0.4345270904806086, "correct_loss_per_token": 1.4840887069702149, "incorrect_loss_per_token": 1.9768261379665801, "correct_loss_uncond": -23.894216537475586, "incorrect_loss_uncond": -22.00366147359212}, "model_output": [{"sum_logits": -16.557092666625977, "num_tokens": 10, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -38.801795959472656, "logits_per_token": -1.6557092666625977, "logits_per_char": -0.3522785673750208, "num_chars": 47}, {"sum_logits": -14.840887069702148, "num_tokens": 10, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -38.735103607177734, "logits_per_token": -1.4840887069702149, "logits_per_char": -0.3157635546745138, "num_chars": 47}, {"sum_logits": -27.14710235595703, "num_tokens": 12, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -49.227542877197266, "logits_per_token": -2.262258529663086, "logits_per_char": -0.5122094784142837, "num_chars": 53}, {"sum_logits": -24.150127410888672, "num_tokens": 12, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -45.835968017578125, "logits_per_token": -2.012510617574056, "logits_per_char": -0.4390932256525213, "num_chars": 55}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 358, "native_id": "Mercury_411732", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.25924301147461, "incorrect_loss_raw": 19.762353261311848, "correct_loss_per_char": 1.661374500819615, "incorrect_loss_per_char": 1.4797693853413227, "correct_loss_per_token": 3.876540501912435, "incorrect_loss_per_token": 3.2937255435519748, "correct_loss_uncond": -13.131542205810547, "incorrect_loss_uncond": -12.55552864074707}, "model_output": [{"sum_logits": -16.198556900024414, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -28.959421157836914, "logits_per_token": -2.6997594833374023, "logits_per_char": -1.2460428384634166, "num_chars": 13}, {"sum_logits": -21.02375030517578, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -32.41817092895508, "logits_per_token": -3.5039583841959634, "logits_per_char": -1.6172115619365985, "num_chars": 13}, {"sum_logits": -22.06475257873535, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -35.576053619384766, "logits_per_token": -3.6774587631225586, "logits_per_char": -1.5760537556239538, "num_chars": 14}, {"sum_logits": -23.25924301147461, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -36.390785217285156, "logits_per_token": -3.876540501912435, "logits_per_char": -1.661374500819615, "num_chars": 14}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 359, "native_id": "Mercury_7270113", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 23.44861602783203, "incorrect_loss_raw": 28.346415837605793, "correct_loss_per_char": 0.5719174640934642, "incorrect_loss_per_char": 0.7169897682657544, "correct_loss_per_token": 2.931077003479004, "incorrect_loss_per_token": 3.242366396503531, "correct_loss_uncond": -16.60494613647461, "incorrect_loss_uncond": -15.340378443400065}, "model_output": [{"sum_logits": -25.61685562133789, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -42.22565460205078, "logits_per_token": -3.2021069526672363, "logits_per_char": -0.6741277795088919, "num_chars": 38}, {"sum_logits": -37.805477142333984, "num_tokens": 11, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -53.14061737060547, "logits_per_token": -3.4368615583939985, "logits_per_char": -0.8592153895984996, "num_chars": 44}, {"sum_logits": -23.44861602783203, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -40.05356216430664, "logits_per_token": -2.931077003479004, "logits_per_char": -0.5719174640934642, "num_chars": 41}, {"sum_logits": -21.616914749145508, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -35.69411087036133, "logits_per_token": -3.088130678449358, "logits_per_char": -0.6176261356898717, "num_chars": 35}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 360, "native_id": "AKDE&ED_2008_8_3", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 39.15321731567383, "incorrect_loss_raw": 38.11628723144531, "correct_loss_per_char": 0.6525536219278971, "incorrect_loss_per_char": 0.6352714538574219, "correct_loss_per_token": 2.796658379690988, "incorrect_loss_per_token": 2.7225919451032365, "correct_loss_uncond": -8.970165252685547, "incorrect_loss_uncond": -9.159996032714844}, "model_output": [{"sum_logits": -38.94677734375, "num_tokens": 14, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -47.97184753417969, "logits_per_token": -2.7819126674107144, "logits_per_char": -0.6491129557291667, "num_chars": 60}, {"sum_logits": -39.15321731567383, "num_tokens": 14, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -48.123382568359375, "logits_per_token": -2.796658379690988, "logits_per_char": -0.6525536219278971, "num_chars": 60}, {"sum_logits": -37.52471160888672, "num_tokens": 14, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -46.549476623535156, "logits_per_token": -2.6803365434919084, "logits_per_char": -0.625411860148112, "num_chars": 60}, {"sum_logits": -37.87737274169922, "num_tokens": 14, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -47.307525634765625, "logits_per_token": -2.7055266244070872, "logits_per_char": -0.631289545694987, "num_chars": 60}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 361, "native_id": "MCAS_1999_8_1", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.577903747558594, "incorrect_loss_raw": 15.438114802042643, "correct_loss_per_char": 0.4087884592455487, "incorrect_loss_per_char": 0.44080517456617896, "correct_loss_per_token": 2.5111291067940846, "incorrect_loss_per_token": 2.0519965149107433, "correct_loss_uncond": -15.569931030273438, "incorrect_loss_uncond": -16.163962682088215}, "model_output": [{"sum_logits": -17.577903747558594, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -33.14783477783203, "logits_per_token": -2.5111291067940846, "logits_per_char": -0.4087884592455487, "num_chars": 43}, {"sum_logits": -9.637615203857422, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -25.309795379638672, "logits_per_token": -1.2047019004821777, "logits_per_char": -0.27536043439592633, "num_chars": 35}, {"sum_logits": -16.141725540161133, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -34.00348663330078, "logits_per_token": -2.0177156925201416, "logits_per_char": -0.4247822510568719, "num_chars": 38}, {"sum_logits": -20.535003662109375, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -35.492950439453125, "logits_per_token": -2.9335719517299106, "logits_per_char": -0.6222728382457386, "num_chars": 33}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 362, "native_id": "NYSEDREGENTS_2015_4_24", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.291444778442383, "incorrect_loss_raw": 13.983316898345947, "correct_loss_per_char": 0.8158530573691091, "incorrect_loss_per_char": 0.5397568690559484, "correct_loss_per_token": 3.613063539777483, "incorrect_loss_per_token": 3.633301443523831, "correct_loss_uncond": -7.710317611694336, "incorrect_loss_uncond": -7.148462454477946}, "model_output": [{"sum_logits": -17.062049865722656, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -23.395395278930664, "logits_per_token": -2.843674977620443, "logits_per_char": -0.5018249960506663, "num_chars": 34}, {"sum_logits": -7.337016582489014, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -15.757347106933594, "logits_per_token": -3.668508291244507, "logits_per_char": -0.3861587674994218, "num_chars": 19}, {"sum_logits": -25.291444778442383, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.00176239013672, "logits_per_token": -3.613063539777483, "logits_per_char": -0.8158530573691091, "num_chars": 31}, {"sum_logits": -17.550884246826172, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -24.242595672607422, "logits_per_token": -4.387721061706543, "logits_per_char": -0.7312868436177572, "num_chars": 24}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 363, "native_id": "Mercury_7122640", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.228624105453491, "incorrect_loss_raw": 5.7149044672648115, "correct_loss_per_char": 0.26905200878779095, "incorrect_loss_per_char": 0.3900361713908968, "correct_loss_per_token": 1.6143120527267456, "incorrect_loss_per_token": 2.8574522336324057, "correct_loss_uncond": -14.41719651222229, "incorrect_loss_uncond": -11.119852383931478}, "model_output": [{"sum_logits": -3.228624105453491, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -17.64582061767578, "logits_per_token": -1.6143120527267456, "logits_per_char": -0.26905200878779095, "num_chars": 12}, {"sum_logits": -5.523841857910156, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -16.041133880615234, "logits_per_token": -2.761920928955078, "logits_per_char": -0.3945601327078683, "num_chars": 14}, {"sum_logits": -5.515317916870117, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -16.107362747192383, "logits_per_token": -2.7576589584350586, "logits_per_char": -0.39395127977643696, "num_chars": 14}, {"sum_logits": -6.10555362701416, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -18.35577392578125, "logits_per_token": -3.05277681350708, "logits_per_char": -0.381597101688385, "num_chars": 16}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 364, "native_id": "Mercury_402547", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.564271926879883, "incorrect_loss_raw": 9.795419534047445, "correct_loss_per_char": 2.760711987813314, "incorrect_loss_per_char": 2.361649778154161, "correct_loss_per_token": 4.141067981719971, "incorrect_loss_per_token": 4.818756898244222, "correct_loss_uncond": -4.891666412353516, "incorrect_loss_uncond": -2.5862178007761636}, "model_output": [{"sum_logits": -6.487636566162109, "num_tokens": 1, "num_tokens_all": 245, "is_greedy": false, "sum_logits_uncond": -3.6901819705963135, "logits_per_token": -6.487636566162109, "logits_per_char": -3.2438182830810547, "num_chars": 2}, {"sum_logits": -19.90665054321289, "num_tokens": 4, "num_tokens_all": 248, "is_greedy": false, "sum_logits_uncond": -24.415803909301758, "logits_per_token": -4.976662635803223, "logits_per_char": -2.8438072204589844, "num_chars": 7}, {"sum_logits": -16.564271926879883, "num_tokens": 4, "num_tokens_all": 248, "is_greedy": false, "sum_logits_uncond": -21.4559383392334, "logits_per_token": -4.141067981719971, "logits_per_char": -2.760711987813314, "num_chars": 6}, {"sum_logits": -2.991971492767334, "num_tokens": 1, "num_tokens_all": 245, "is_greedy": false, "sum_logits_uncond": -9.038926124572754, "logits_per_token": -2.991971492767334, "logits_per_char": -0.9973238309224447, "num_chars": 3}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 365, "native_id": "Mercury_7133945", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.456610679626465, "incorrect_loss_raw": 7.683685302734375, "correct_loss_per_char": 0.24802775816483932, "incorrect_loss_per_char": 0.3191302329034456, "correct_loss_per_token": 1.3641526699066162, "incorrect_loss_per_token": 1.5367370605468753, "correct_loss_uncond": -12.365092277526855, "incorrect_loss_uncond": -14.203936258951822}, "model_output": [{"sum_logits": -6.393465042114258, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -20.482254028320312, "logits_per_token": -1.2786930084228516, "logits_per_char": -0.19979578256607056, "num_chars": 32}, {"sum_logits": -5.456610679626465, "num_tokens": 4, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -17.82170295715332, "logits_per_token": -1.3641526699066162, "logits_per_char": -0.24802775816483932, "num_chars": 22}, {"sum_logits": -8.60312271118164, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -24.074058532714844, "logits_per_token": -1.7206245422363282, "logits_per_char": -0.37404881352963654, "num_chars": 23}, {"sum_logits": -8.054468154907227, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -21.106552124023438, "logits_per_token": -1.6108936309814452, "logits_per_char": -0.38354610261462985, "num_chars": 21}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 366, "native_id": "Mercury_7199028", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.875705242156982, "incorrect_loss_raw": 5.553735176722209, "correct_loss_per_char": 0.5341550220142711, "incorrect_loss_per_char": 0.5108433002574676, "correct_loss_per_token": 2.937852621078491, "incorrect_loss_per_token": 2.7768675883611045, "correct_loss_uncond": -12.736149311065674, "incorrect_loss_uncond": -13.677294333775839}, "model_output": [{"sum_logits": -3.419154405593872, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -18.91445541381836, "logits_per_token": -1.709577202796936, "logits_per_char": -0.3799060450659858, "num_chars": 9}, {"sum_logits": -5.875705242156982, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -18.611854553222656, "logits_per_token": -2.937852621078491, "logits_per_char": -0.5341550220142711, "num_chars": 11}, {"sum_logits": -6.758264541625977, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -19.560623168945312, "logits_per_token": -3.3791322708129883, "logits_per_char": -0.5631887118021647, "num_chars": 12}, {"sum_logits": -6.483786582946777, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -19.21800994873047, "logits_per_token": -3.2418932914733887, "logits_per_char": -0.5894351439042524, "num_chars": 11}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 367, "native_id": "Mercury_7217298", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.259586334228516, "incorrect_loss_raw": 14.69382413228353, "correct_loss_per_char": 0.5857785877428556, "incorrect_loss_per_char": 0.41814794805314803, "correct_loss_per_token": 3.7099310557047525, "incorrect_loss_per_token": 3.397707176208496, "correct_loss_uncond": -13.106937408447266, "incorrect_loss_uncond": -14.580490112304688}, "model_output": [{"sum_logits": -22.259586334228516, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -35.36652374267578, "logits_per_token": -3.7099310557047525, "logits_per_char": -0.5857785877428556, "num_chars": 38}, {"sum_logits": -16.544931411743164, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -29.874080657958984, "logits_per_token": -3.308986282348633, "logits_per_char": -0.3939269383748372, "num_chars": 42}, {"sum_logits": -14.691176414489746, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -29.63338851928711, "logits_per_token": -3.6727941036224365, "logits_per_char": -0.45909926295280457, "num_chars": 32}, {"sum_logits": -12.845364570617676, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -28.315473556518555, "logits_per_token": -3.211341142654419, "logits_per_char": -0.40141764283180237, "num_chars": 32}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 368, "native_id": "Mercury_7057680", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.244726181030273, "incorrect_loss_raw": 15.512151718139648, "correct_loss_per_char": 0.6929420991377397, "incorrect_loss_per_char": 0.8114393268806347, "correct_loss_per_token": 2.540787696838379, "incorrect_loss_per_token": 3.1024303436279297, "correct_loss_uncond": -9.61775016784668, "incorrect_loss_uncond": -8.752264658610025}, "model_output": [{"sum_logits": -17.02914047241211, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -27.54690933227539, "logits_per_token": -3.405828094482422, "logits_per_char": -0.8514570236206055, "num_chars": 20}, {"sum_logits": -13.739343643188477, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -22.918935775756836, "logits_per_token": -2.7478687286376955, "logits_per_char": -0.5973627670951511, "num_chars": 23}, {"sum_logits": -15.244726181030273, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -24.862476348876953, "logits_per_token": -2.540787696838379, "logits_per_char": -0.6929420991377397, "num_chars": 22}, {"sum_logits": -15.76797103881836, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -22.327404022216797, "logits_per_token": -3.153594207763672, "logits_per_char": -0.9854981899261475, "num_chars": 16}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 369, "native_id": "Mercury_SC_400404", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.21039581298828, "incorrect_loss_raw": 25.641420364379883, "correct_loss_per_char": 1.0065748691558838, "incorrect_loss_per_char": 0.9543902143771515, "correct_loss_per_token": 4.601485116141183, "incorrect_loss_per_token": 4.97152762942844, "correct_loss_uncond": -7.695209503173828, "incorrect_loss_uncond": -5.188442866007487}, "model_output": [{"sum_logits": -25.12647247314453, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -26.555763244628906, "logits_per_token": -6.281618118286133, "logits_per_char": -1.1421123851429333, "num_chars": 22}, {"sum_logits": -28.865259170532227, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -37.29894256591797, "logits_per_token": -4.810876528422038, "logits_per_char": -0.9020393490791321, "num_chars": 32}, {"sum_logits": -22.93252944946289, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -28.634883880615234, "logits_per_token": -3.8220882415771484, "logits_per_char": -0.819018908909389, "num_chars": 28}, {"sum_logits": -32.21039581298828, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -39.90560531616211, "logits_per_token": -4.601485116141183, "logits_per_char": -1.0065748691558838, "num_chars": 32}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 370, "native_id": "Mercury_SC_408030", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.002885818481445, "incorrect_loss_raw": 14.32059383392334, "correct_loss_per_char": 0.5313401818275452, "incorrect_loss_per_char": 0.33012412651675666, "correct_loss_per_token": 2.1253607273101807, "incorrect_loss_per_token": 1.7303850638172615, "correct_loss_uncond": -14.525468826293945, "incorrect_loss_uncond": -15.22204621632894}, "model_output": [{"sum_logits": -17.002885818481445, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -31.52835464477539, "logits_per_token": -2.1253607273101807, "logits_per_char": -0.5313401818275452, "num_chars": 32}, {"sum_logits": -13.155153274536133, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -26.535587310791016, "logits_per_token": -1.8793076106480189, "logits_per_char": -0.38691627278047447, "num_chars": 34}, {"sum_logits": -11.8639497756958, "num_tokens": 9, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -29.030946731567383, "logits_per_token": -1.3182166417439778, "logits_per_char": -0.237278995513916, "num_chars": 50}, {"sum_logits": -17.942678451538086, "num_tokens": 9, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -33.06138610839844, "logits_per_token": -1.9936309390597873, "logits_per_char": -0.3661771112558793, "num_chars": 49}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 371, "native_id": "Mercury_415083", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.005951404571533, "incorrect_loss_raw": 5.5448629061381025, "correct_loss_per_char": 1.0009919007619221, "incorrect_loss_per_char": 0.8435385227203369, "correct_loss_per_token": 1.5014878511428833, "incorrect_loss_per_token": 1.3862157265345256, "correct_loss_uncond": -11.886491298675537, "incorrect_loss_uncond": -11.800110975901285}, "model_output": [{"sum_logits": -6.478321552276611, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.0352725982666, "logits_per_token": -1.6195803880691528, "logits_per_char": -1.0797202587127686, "num_chars": 6}, {"sum_logits": -6.005951404571533, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.89244270324707, "logits_per_token": -1.5014878511428833, "logits_per_char": -1.0009919007619221, "num_chars": 6}, {"sum_logits": -4.990553855895996, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.083110809326172, "logits_per_token": -1.247638463973999, "logits_per_char": -0.7129362651279995, "num_chars": 7}, {"sum_logits": -5.165713310241699, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.91653823852539, "logits_per_token": -1.2914283275604248, "logits_per_char": -0.7379590443202427, "num_chars": 7}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 372, "native_id": "Mercury_409114", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.427994728088379, "incorrect_loss_raw": 4.506483634312947, "correct_loss_per_char": 0.19252150991688605, "incorrect_loss_per_char": 0.19003918205482373, "correct_loss_per_token": 1.1069986820220947, "incorrect_loss_per_token": 1.1266209085782368, "correct_loss_uncond": -15.71314525604248, "incorrect_loss_uncond": -15.525174538294474}, "model_output": [{"sum_logits": -3.757514715194702, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": true, "sum_logits_uncond": -19.333175659179688, "logits_per_token": -0.9393786787986755, "logits_per_char": -0.1633702050084653, "num_chars": 23}, {"sum_logits": -4.427994728088379, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -20.14113998413086, "logits_per_token": -1.1069986820220947, "logits_per_char": -0.19252150991688605, "num_chars": 23}, {"sum_logits": -3.7965946197509766, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -18.62643814086914, "logits_per_token": -0.9491486549377441, "logits_per_char": -0.15819144248962402, "num_chars": 24}, {"sum_logits": -5.965341567993164, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -22.135360717773438, "logits_per_token": -1.491335391998291, "logits_per_char": -0.24855589866638184, "num_chars": 24}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 373, "native_id": "Mercury_SC_415006", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 23.195646286010742, "incorrect_loss_raw": 30.40289052327474, "correct_loss_per_char": 0.6627327510288783, "incorrect_loss_per_char": 0.8956208113063303, "correct_loss_per_token": 2.1086951169100674, "incorrect_loss_per_token": 3.5229104183338307, "correct_loss_uncond": -6.840953826904297, "incorrect_loss_uncond": -4.4080149332682295}, "model_output": [{"sum_logits": -23.195646286010742, "num_tokens": 11, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -30.03660011291504, "logits_per_token": -2.1086951169100674, "logits_per_char": -0.6627327510288783, "num_chars": 35}, {"sum_logits": -31.279277801513672, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -37.69084930419922, "logits_per_token": -3.909909725189209, "logits_per_char": -0.9199787588680491, "num_chars": 34}, {"sum_logits": -28.389068603515625, "num_tokens": 9, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -36.08937454223633, "logits_per_token": -3.1543409559461804, "logits_per_char": -0.8111162458147322, "num_chars": 35}, {"sum_logits": -31.540325164794922, "num_tokens": 9, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -30.65249252319336, "logits_per_token": -3.5044805738661022, "logits_per_char": -0.9557674292362097, "num_chars": 33}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 374, "native_id": "MSA_2012_5_15", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 15.338035583496094, "incorrect_loss_raw": 17.199066162109375, "correct_loss_per_char": 0.4382295880998884, "incorrect_loss_per_char": 0.501521688081971, "correct_loss_per_token": 1.5338035583496095, "incorrect_loss_per_token": 1.7199066162109373, "correct_loss_uncond": -25.933273315429688, "incorrect_loss_uncond": -23.85149637858073}, "model_output": [{"sum_logits": -20.191781997680664, "num_tokens": 10, "num_tokens_all": 278, "is_greedy": false, "sum_logits_uncond": -41.820945739746094, "logits_per_token": -2.0191781997680662, "logits_per_char": -0.5938759411082548, "num_chars": 34}, {"sum_logits": -15.935895919799805, "num_tokens": 10, "num_tokens_all": 278, "is_greedy": false, "sum_logits_uncond": -37.980247497558594, "logits_per_token": -1.5935895919799805, "logits_per_char": -0.4687028211705825, "num_chars": 34}, {"sum_logits": -15.338035583496094, "num_tokens": 10, "num_tokens_all": 278, "is_greedy": false, "sum_logits_uncond": -41.27130889892578, "logits_per_token": -1.5338035583496095, "logits_per_char": -0.4382295880998884, "num_chars": 35}, {"sum_logits": -15.469520568847656, "num_tokens": 10, "num_tokens_all": 278, "is_greedy": false, "sum_logits_uncond": -43.350494384765625, "logits_per_token": -1.5469520568847657, "logits_per_char": -0.4419863019670759, "num_chars": 35}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 375, "native_id": "Mercury_SC_402612", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 29.731178283691406, "incorrect_loss_raw": 16.873423258463543, "correct_loss_per_char": 0.6067587404834981, "incorrect_loss_per_char": 0.5113351692844557, "correct_loss_per_token": 2.9731178283691406, "incorrect_loss_per_token": 2.410489036923363, "correct_loss_uncond": -21.562957763671875, "incorrect_loss_uncond": -16.071112314860027}, "model_output": [{"sum_logits": -17.500873565673828, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -34.22612380981445, "logits_per_token": -2.5001247950962613, "logits_per_char": -0.5469022989273071, "num_chars": 32}, {"sum_logits": -14.589725494384766, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -30.35991668701172, "logits_per_token": -2.084246499197824, "logits_per_char": -0.44211289376923535, "num_chars": 33}, {"sum_logits": -18.52967071533203, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -34.24756622314453, "logits_per_token": -2.6470958164760043, "logits_per_char": -0.5449903151568245, "num_chars": 34}, {"sum_logits": -29.731178283691406, "num_tokens": 10, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -51.29413604736328, "logits_per_token": -2.9731178283691406, "logits_per_char": -0.6067587404834981, "num_chars": 49}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 376, "native_id": "Mercury_SC_405937", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.705018997192383, "incorrect_loss_raw": 19.58194923400879, "correct_loss_per_char": 0.3346558274893925, "incorrect_loss_per_char": 0.5304962208929943, "correct_loss_per_token": 2.4262547492980957, "incorrect_loss_per_token": 3.652374606662326, "correct_loss_uncond": -14.860471725463867, "incorrect_loss_uncond": -14.506953557332357}, "model_output": [{"sum_logits": -23.761371612548828, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -38.94708251953125, "logits_per_token": -3.960228602091471, "logits_per_char": -0.6988638709573185, "num_chars": 34}, {"sum_logits": -9.705018997192383, "num_tokens": 4, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -24.56549072265625, "logits_per_token": -2.4262547492980957, "logits_per_char": -0.3346558274893925, "num_chars": 29}, {"sum_logits": -11.61461067199707, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -26.84561538696289, "logits_per_token": -2.322922134399414, "logits_per_char": -0.32262807422214085, "num_chars": 36}, {"sum_logits": -23.36986541748047, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -36.4740104675293, "logits_per_token": -4.673973083496094, "logits_per_char": -0.5699967174995236, "num_chars": 41}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 377, "native_id": "Mercury_SC_416459", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.835460662841797, "incorrect_loss_raw": 11.597065289815268, "correct_loss_per_char": 1.2835460662841798, "incorrect_loss_per_char": 1.3040221271060761, "correct_loss_per_token": 6.417730331420898, "incorrect_loss_per_token": 3.9982912010616727, "correct_loss_uncond": -5.781118392944336, "incorrect_loss_uncond": -5.701898256937663}, "model_output": [{"sum_logits": -8.913409233093262, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -16.281841278076172, "logits_per_token": -2.9711364110310874, "logits_per_char": -1.2733441761561803, "num_chars": 7}, {"sum_logits": -15.660624504089355, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -18.686491012573242, "logits_per_token": -3.915156126022339, "logits_per_char": -1.9575780630111694, "num_chars": 8}, {"sum_logits": -12.835460662841797, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -18.616579055786133, "logits_per_token": -6.417730331420898, "logits_per_char": -1.2835460662841798, "num_chars": 10}, {"sum_logits": -10.217162132263184, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -16.928558349609375, "logits_per_token": -5.108581066131592, "logits_per_char": -0.6811441421508789, "num_chars": 15}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 378, "native_id": "NAEP_2000_8_S21+4", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.889007568359375, "incorrect_loss_raw": 9.316971937815348, "correct_loss_per_char": 0.3563963692143278, "incorrect_loss_per_char": 0.3496372694921012, "correct_loss_per_token": 1.574083964029948, "incorrect_loss_per_token": 2.1147607962290444, "correct_loss_uncond": -14.704116821289062, "incorrect_loss_uncond": -12.73229455947876}, "model_output": [{"sum_logits": -6.977842807769775, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -15.279742240905762, "logits_per_token": -3.4889214038848877, "logits_per_char": -0.465189520517985, "num_chars": 15}, {"sum_logits": -7.580536842346191, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -17.50788116455078, "logits_per_token": -1.5161073684692383, "logits_per_char": -0.34456985647028143, "num_chars": 22}, {"sum_logits": -18.889007568359375, "num_tokens": 12, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -33.59312438964844, "logits_per_token": -1.574083964029948, "logits_per_char": -0.3563963692143278, "num_chars": 53}, {"sum_logits": -13.392536163330078, "num_tokens": 10, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -33.36017608642578, "logits_per_token": -1.3392536163330078, "logits_per_char": -0.2391524314880371, "num_chars": 56}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 379, "native_id": "Mercury_7072380", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.1527481079101562, "incorrect_loss_raw": 5.911665916442871, "correct_loss_per_char": 0.3587913513183594, "incorrect_loss_per_char": 0.7927702949160622, "correct_loss_per_token": 1.0763740539550781, "incorrect_loss_per_token": 2.9558329582214355, "correct_loss_uncond": -14.073585510253906, "incorrect_loss_uncond": -10.800695101420084}, "model_output": [{"sum_logits": -5.609650135040283, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -17.414684295654297, "logits_per_token": -2.8048250675201416, "logits_per_char": -0.7012062668800354, "num_chars": 8}, {"sum_logits": -2.1527481079101562, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -16.226333618164062, "logits_per_token": -1.0763740539550781, "logits_per_char": -0.3587913513183594, "num_chars": 6}, {"sum_logits": -9.040425300598145, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -16.362590789794922, "logits_per_token": -4.520212650299072, "logits_per_char": -1.2914893286568778, "num_chars": 7}, {"sum_logits": -3.0849223136901855, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -16.35980796813965, "logits_per_token": -1.5424611568450928, "logits_per_char": -0.3856152892112732, "num_chars": 8}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 380, "native_id": "Mercury_SC_401373", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 22.487323760986328, "incorrect_loss_raw": 30.40784517923991, "correct_loss_per_char": 0.548471311243569, "incorrect_loss_per_char": 0.7440296394304503, "correct_loss_per_token": 2.498591528998481, "incorrect_loss_per_token": 3.457810569692541, "correct_loss_uncond": -14.55563735961914, "incorrect_loss_uncond": -8.735469182332357}, "model_output": [{"sum_logits": -22.487323760986328, "num_tokens": 9, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -37.04296112060547, "logits_per_token": -2.498591528998481, "logits_per_char": -0.548471311243569, "num_chars": 41}, {"sum_logits": -38.26746368408203, "num_tokens": 9, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -45.48908996582031, "logits_per_token": -4.251940409342448, "logits_per_char": -1.0070385180021588, "num_chars": 38}, {"sum_logits": -17.098798751831055, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -28.863895416259766, "logits_per_token": -2.137349843978882, "logits_per_char": -0.4621296959954339, "num_chars": 37}, {"sum_logits": -35.85727310180664, "num_tokens": 9, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -43.07695770263672, "logits_per_token": -3.9841414557562933, "logits_per_char": -0.7629207042937584, "num_chars": 47}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 381, "native_id": "Mercury_SC_400579", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.643848419189453, "incorrect_loss_raw": 17.64303207397461, "correct_loss_per_char": 0.46575393676757815, "incorrect_loss_per_char": 0.846113770406907, "correct_loss_per_token": 2.9109621047973633, "incorrect_loss_per_token": 4.7002440558539496, "correct_loss_uncond": -15.298931121826172, "incorrect_loss_uncond": -8.333280563354492}, "model_output": [{"sum_logits": -10.421497344970703, "num_tokens": 3, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -18.374168395996094, "logits_per_token": -3.473832448323568, "logits_per_char": -0.8016536419208233, "num_chars": 13}, {"sum_logits": -22.383739471435547, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -31.332853317260742, "logits_per_token": -5.595934867858887, "logits_per_char": -1.0658923557826452, "num_chars": 21}, {"sum_logits": -11.643848419189453, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -26.942779541015625, "logits_per_token": -2.9109621047973633, "logits_per_char": -0.46575393676757815, "num_chars": 25}, {"sum_logits": -20.123859405517578, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -28.22191619873047, "logits_per_token": -5.0309648513793945, "logits_per_char": -0.6707953135172526, "num_chars": 30}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 382, "native_id": "MCAS_2003_5_14", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.486371994018555, "incorrect_loss_raw": 13.853337605794271, "correct_loss_per_char": 0.2552527109781901, "incorrect_loss_per_char": 0.36456151594195446, "correct_loss_per_token": 1.1486371994018554, "incorrect_loss_per_token": 1.6558790383515536, "correct_loss_uncond": -20.314252853393555, "incorrect_loss_uncond": -16.009907404581707}, "model_output": [{"sum_logits": -12.742486000061035, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -28.522083282470703, "logits_per_token": -1.5928107500076294, "logits_per_char": -0.3353285789489746, "num_chars": 38}, {"sum_logits": -16.370243072509766, "num_tokens": 9, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -32.356903076171875, "logits_per_token": -1.8189158969455295, "logits_per_char": -0.43079587032920436, "num_chars": 38}, {"sum_logits": -11.486371994018555, "num_tokens": 10, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -31.80062484741211, "logits_per_token": -1.1486371994018554, "logits_per_char": -0.2552527109781901, "num_chars": 45}, {"sum_logits": -12.447283744812012, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -28.71074867248535, "logits_per_token": -1.5559104681015015, "logits_per_char": -0.32756009854768453, "num_chars": 38}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 383, "native_id": "MSA_2015_8_30", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 14.801443099975586, "incorrect_loss_raw": 19.59969647725423, "correct_loss_per_char": 0.42289837428501675, "incorrect_loss_per_char": 0.4982275429893943, "correct_loss_per_token": 1.6446047888861761, "incorrect_loss_per_token": 1.9088059974439215, "correct_loss_uncond": -20.111940383911133, "incorrect_loss_uncond": -15.57935651143392}, "model_output": [{"sum_logits": -14.801443099975586, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -34.91338348388672, "logits_per_token": -1.6446047888861761, "logits_per_char": -0.42289837428501675, "num_chars": 35}, {"sum_logits": -19.630760192871094, "num_tokens": 10, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -35.67007064819336, "logits_per_token": -1.9630760192871093, "logits_per_char": -0.5773752997903263, "num_chars": 34}, {"sum_logits": -16.884004592895508, "num_tokens": 11, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -33.92431640625, "logits_per_token": -1.5349095084450461, "logits_per_char": -0.4221001148223877, "num_chars": 40}, {"sum_logits": -22.284324645996094, "num_tokens": 10, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -35.942771911621094, "logits_per_token": -2.228432464599609, "logits_per_char": -0.49520721435546877, "num_chars": 45}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 384, "native_id": "Mercury_SC_415416", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 31.785568237304688, "incorrect_loss_raw": 30.524027506510418, "correct_loss_per_char": 0.7752577618854802, "incorrect_loss_per_char": 0.6705219395426686, "correct_loss_per_token": 3.531729804144965, "incorrect_loss_per_token": 3.0246686790928696, "correct_loss_uncond": -5.193321228027344, "incorrect_loss_uncond": -4.360976537068685}, "model_output": [{"sum_logits": -29.18094253540039, "num_tokens": 8, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -31.55232048034668, "logits_per_token": -3.647617816925049, "logits_per_char": -0.788674122578389, "num_chars": 37}, {"sum_logits": -32.41043472290039, "num_tokens": 12, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -37.91991424560547, "logits_per_token": -2.700869560241699, "logits_per_char": -0.6232775908250076, "num_chars": 52}, {"sum_logits": -31.785568237304688, "num_tokens": 9, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -36.97888946533203, "logits_per_token": -3.531729804144965, "logits_per_char": -0.7752577618854802, "num_chars": 41}, {"sum_logits": -29.98070526123047, "num_tokens": 11, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -35.182777404785156, "logits_per_token": -2.725518660111861, "logits_per_char": -0.5996141052246093, "num_chars": 50}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 385, "native_id": "NYSEDREGENTS_2012_8_42", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 0.9049593210220337, "incorrect_loss_raw": 3.4717239141464233, "correct_loss_per_char": 0.10055103566911486, "incorrect_loss_per_char": 0.2473195294539134, "correct_loss_per_token": 0.9049593210220337, "incorrect_loss_per_token": 1.573288639386495, "correct_loss_uncond": -10.879258513450623, "incorrect_loss_uncond": -10.679189324378967}, "model_output": [{"sum_logits": -1.2481420040130615, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -12.415567398071289, "logits_per_token": -1.2481420040130615, "logits_per_char": -0.13868244489034018, "num_chars": 9}, {"sum_logits": -0.9049593210220337, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": true, "sum_logits_uncond": -11.784217834472656, "logits_per_token": -0.9049593210220337, "logits_per_char": -0.10055103566911486, "num_chars": 9}, {"sum_logits": -5.695305824279785, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -15.886259078979492, "logits_per_token": -1.8984352747599285, "logits_per_char": -0.3559566140174866, "num_chars": 16}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 386, "native_id": "NCEOGA_2013_5_9", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.167764663696289, "incorrect_loss_raw": 10.59757693608602, "correct_loss_per_char": 0.5648758146497939, "incorrect_loss_per_char": 0.6058931013078211, "correct_loss_per_token": 3.389254887898763, "incorrect_loss_per_token": 5.29878846804301, "correct_loss_uncond": -11.94175910949707, "incorrect_loss_uncond": -9.727135817209879}, "model_output": [{"sum_logits": -8.226197242736816, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -18.270023345947266, "logits_per_token": -4.113098621368408, "logits_per_char": -0.45701095792982316, "num_chars": 18}, {"sum_logits": -10.167764663696289, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -22.10952377319336, "logits_per_token": -3.389254887898763, "logits_per_char": -0.5648758146497939, "num_chars": 18}, {"sum_logits": -15.733443260192871, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -20.9197940826416, "logits_per_token": -7.8667216300964355, "logits_per_char": -0.9254966623642865, "num_chars": 17}, {"sum_logits": -7.833090305328369, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -21.784320831298828, "logits_per_token": -3.9165451526641846, "logits_per_char": -0.4351716836293538, "num_chars": 18}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 387, "native_id": "MEAP_2005_8_45", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.97951316833496, "incorrect_loss_raw": 13.180996894836426, "correct_loss_per_char": 0.32652909939105695, "incorrect_loss_per_char": 0.3026265538037121, "correct_loss_per_token": 1.5435921062122693, "incorrect_loss_per_token": 1.6009470268532082, "correct_loss_uncond": -26.18501853942871, "incorrect_loss_uncond": -22.549817721048992}, "model_output": [{"sum_logits": -12.889026641845703, "num_tokens": 7, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -29.209980010986328, "logits_per_token": -1.8412895202636719, "logits_per_char": -0.3483520714012352, "num_chars": 37}, {"sum_logits": -11.901739120483398, "num_tokens": 9, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -38.83316421508789, "logits_per_token": -1.3224154578314886, "logits_per_char": -0.26448309156629773, "num_chars": 45}, {"sum_logits": -14.752224922180176, "num_tokens": 9, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -39.14929962158203, "logits_per_token": -1.639136102464464, "logits_per_char": -0.29504449844360353, "num_chars": 50}, {"sum_logits": -16.97951316833496, "num_tokens": 11, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -43.16453170776367, "logits_per_token": -1.5435921062122693, "logits_per_char": -0.32652909939105695, "num_chars": 52}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 388, "native_id": "Mercury_SC_400594", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.28050422668457, "incorrect_loss_raw": 11.380703608194986, "correct_loss_per_char": 0.3437223787660952, "incorrect_loss_per_char": 0.6472564403762683, "correct_loss_per_token": 1.856100845336914, "incorrect_loss_per_token": 2.652348327636719, "correct_loss_uncond": -14.234685897827148, "incorrect_loss_uncond": -10.91989835103353}, "model_output": [{"sum_logits": -11.478328704833984, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -22.07386016845703, "logits_per_token": -2.869582176208496, "logits_per_char": -0.819880621773856, "num_chars": 14}, {"sum_logits": -11.094127655029297, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -19.714859008789062, "logits_per_token": -2.773531913757324, "logits_per_char": -0.693382978439331, "num_chars": 16}, {"sum_logits": -11.56965446472168, "num_tokens": 5, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -25.113086700439453, "logits_per_token": -2.313930892944336, "logits_per_char": -0.4285057209156178, "num_chars": 27}, {"sum_logits": -9.28050422668457, "num_tokens": 5, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -23.51519012451172, "logits_per_token": -1.856100845336914, "logits_per_char": -0.3437223787660952, "num_chars": 27}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 389, "native_id": "NCEOGA_2013_8_43", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 20.99532127380371, "incorrect_loss_raw": 20.146446863810223, "correct_loss_per_char": 0.29570875033526356, "incorrect_loss_per_char": 0.33788766582018304, "correct_loss_per_token": 1.4996658052716936, "incorrect_loss_per_token": 1.806740729557483, "correct_loss_uncond": -18.924699783325195, "incorrect_loss_uncond": -13.059354146321615}, "model_output": [{"sum_logits": -20.99532127380371, "num_tokens": 14, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -39.920021057128906, "logits_per_token": -1.4996658052716936, "logits_per_char": -0.29570875033526356, "num_chars": 71}, {"sum_logits": -18.57408332824707, "num_tokens": 9, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -32.69567108154297, "logits_per_token": -2.0637870364718967, "logits_per_char": -0.37906292506626676, "num_chars": 49}, {"sum_logits": -20.644460678100586, "num_tokens": 13, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -31.75812339782715, "logits_per_token": -1.5880354367769682, "logits_per_char": -0.30812627877762067, "num_chars": 67}, {"sum_logits": -21.220796585083008, "num_tokens": 12, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -35.16360855102539, "logits_per_token": -1.768399715423584, "logits_per_char": -0.3264737936166617, "num_chars": 65}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 390, "native_id": "MCAS_2006_8_13", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.234033584594727, "incorrect_loss_raw": 23.30948766072591, "correct_loss_per_char": 0.38085083961486815, "incorrect_loss_per_char": 0.6179714285450064, "correct_loss_per_token": 1.5234033584594726, "incorrect_loss_per_token": 2.58994307341399, "correct_loss_uncond": -22.06068992614746, "incorrect_loss_uncond": -14.22769037882487}, "model_output": [{"sum_logits": -23.52313995361328, "num_tokens": 9, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -41.70105743408203, "logits_per_token": -2.6136822170681424, "logits_per_char": -0.588078498840332, "num_chars": 40}, {"sum_logits": -21.619462966918945, "num_tokens": 9, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -35.72724533081055, "logits_per_token": -2.402162551879883, "logits_per_char": -0.5147491182599749, "num_chars": 42}, {"sum_logits": -15.234033584594727, "num_tokens": 10, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -37.29472351074219, "logits_per_token": -1.5234033584594726, "logits_per_char": -0.38085083961486815, "num_chars": 40}, {"sum_logits": -24.785860061645508, "num_tokens": 9, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -35.183231353759766, "logits_per_token": -2.7539844512939453, "logits_per_char": -0.7510866685347124, "num_chars": 33}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 391, "native_id": "Mercury_7168823", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 33.487796783447266, "incorrect_loss_raw": 20.408263206481934, "correct_loss_per_char": 0.8371949195861816, "incorrect_loss_per_char": 0.5086878072129505, "correct_loss_per_token": 3.7208663092719183, "incorrect_loss_per_token": 2.62967924844651, "correct_loss_uncond": -10.203113555908203, "incorrect_loss_uncond": -12.419308026631674}, "model_output": [{"sum_logits": -19.763172149658203, "num_tokens": 8, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -30.458871841430664, "logits_per_token": -2.4703965187072754, "logits_per_char": -0.5067480038373898, "num_chars": 39}, {"sum_logits": -13.212586402893066, "num_tokens": 7, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -21.852622985839844, "logits_per_token": -1.8875123432704382, "logits_per_char": -0.3303146600723267, "num_chars": 40}, {"sum_logits": -28.24903106689453, "num_tokens": 8, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -46.17121887207031, "logits_per_token": -3.5311288833618164, "logits_per_char": -0.6890007577291349, "num_chars": 41}, {"sum_logits": -33.487796783447266, "num_tokens": 9, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -43.69091033935547, "logits_per_token": -3.7208663092719183, "logits_per_char": -0.8371949195861816, "num_chars": 40}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 392, "native_id": "Mercury_7158935", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.942039489746094, "incorrect_loss_raw": 14.387092590332031, "correct_loss_per_char": 0.557118808521944, "incorrect_loss_per_char": 0.40324691419318803, "correct_loss_per_token": 3.157006581624349, "incorrect_loss_per_token": 2.161023525964646, "correct_loss_uncond": -18.002330780029297, "incorrect_loss_uncond": -20.536911646525066}, "model_output": [{"sum_logits": -13.321297645568848, "num_tokens": 6, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -31.89040184020996, "logits_per_token": -2.2202162742614746, "logits_per_char": -0.403675686229359, "num_chars": 33}, {"sum_logits": -13.969874382019043, "num_tokens": 7, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -36.45710754394531, "logits_per_token": -1.9956963402884347, "logits_per_char": -0.39913926805768696, "num_chars": 35}, {"sum_logits": -18.942039489746094, "num_tokens": 6, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -36.94437026977539, "logits_per_token": -3.157006581624349, "logits_per_char": -0.557118808521944, "num_chars": 34}, {"sum_logits": -15.870105743408203, "num_tokens": 7, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -36.424503326416016, "logits_per_token": -2.267157963344029, "logits_per_char": -0.406925788292518, "num_chars": 39}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 393, "native_id": "Mercury_7172708", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.311514377593994, "incorrect_loss_raw": 10.777803897857666, "correct_loss_per_char": 0.45696964859962463, "incorrect_loss_per_char": 0.6189705232779185, "correct_loss_per_token": 1.8278785943984985, "incorrect_loss_per_token": 2.6944509744644165, "correct_loss_uncond": -12.106339931488037, "incorrect_loss_uncond": -10.848689556121826}, "model_output": [{"sum_logits": -7.309375286102295, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -18.639373779296875, "logits_per_token": -1.8273438215255737, "logits_per_char": -0.45683595538139343, "num_chars": 16}, {"sum_logits": -7.311514377593994, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -19.41785430908203, "logits_per_token": -1.8278785943984985, "logits_per_char": -0.45696964859962463, "num_chars": 16}, {"sum_logits": -11.909903526306152, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -21.42069435119629, "logits_per_token": -2.977475881576538, "logits_per_char": -0.7443689703941345, "num_chars": 16}, {"sum_logits": -13.11413288116455, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -24.819412231445312, "logits_per_token": -3.2785332202911377, "logits_per_char": -0.6557066440582275, "num_chars": 20}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 394, "native_id": "ACTAAP_2010_5_1", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.039993286132812, "incorrect_loss_raw": 27.050582885742188, "correct_loss_per_char": 0.4517645742378983, "incorrect_loss_per_char": 0.448106083560992, "correct_loss_per_token": 1.9199994405110676, "incorrect_loss_per_token": 2.185320018332599, "correct_loss_uncond": -25.962886810302734, "incorrect_loss_uncond": -19.49600346883138}, "model_output": [{"sum_logits": -23.039993286132812, "num_tokens": 12, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -49.00288009643555, "logits_per_token": -1.9199994405110676, "logits_per_char": -0.4517645742378983, "num_chars": 51}, {"sum_logits": -32.88064956665039, "num_tokens": 11, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -49.16105651855469, "logits_per_token": -2.989149960604581, "logits_per_char": -0.6203896144651017, "num_chars": 53}, {"sum_logits": -21.635149002075195, "num_tokens": 13, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -47.628150939941406, "logits_per_token": -1.6642422309288611, "logits_per_char": -0.3434150635250031, "num_chars": 63}, {"sum_logits": -26.635950088500977, "num_tokens": 14, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -42.85055160522461, "logits_per_token": -1.9025678634643555, "logits_per_char": -0.3805135726928711, "num_chars": 70}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 395, "native_id": "Mercury_7093048", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.992913246154785, "incorrect_loss_raw": 12.690537452697754, "correct_loss_per_char": 0.40714493504276983, "incorrect_loss_per_char": 0.7505108833312987, "correct_loss_per_token": 2.198582649230957, "incorrect_loss_per_token": 4.074874507056342, "correct_loss_uncond": -17.25459575653076, "incorrect_loss_uncond": -9.429495811462402}, "model_output": [{"sum_logits": -12.811755180358887, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.06949234008789, "logits_per_token": -6.405877590179443, "logits_per_char": -1.2811755180358886, "num_chars": 10}, {"sum_logits": -9.652618408203125, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -25.01309585571289, "logits_per_token": -3.2175394694010415, "logits_per_char": -0.48263092041015626, "num_chars": 20}, {"sum_logits": -10.992913246154785, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -28.247509002685547, "logits_per_token": -2.198582649230957, "logits_per_char": -0.40714493504276983, "num_chars": 27}, {"sum_logits": -15.60723876953125, "num_tokens": 6, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -25.277511596679688, "logits_per_token": -2.6012064615885415, "logits_per_char": -0.48772621154785156, "num_chars": 32}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 396, "native_id": "Mercury_7081603", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.061731338500977, "incorrect_loss_raw": 23.016281127929688, "correct_loss_per_char": 0.46002838259837664, "incorrect_loss_per_char": 0.48662132306687794, "correct_loss_per_token": 2.5510664853182705, "incorrect_loss_per_token": 2.1441574102113727, "correct_loss_uncond": -4.685094833374023, "incorrect_loss_uncond": -6.136069615681966}, "model_output": [{"sum_logits": -18.737539291381836, "num_tokens": 9, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -22.144977569580078, "logits_per_token": -2.0819488101535373, "logits_per_char": -0.5678042209509647, "num_chars": 33}, {"sum_logits": -20.818334579467773, "num_tokens": 10, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -28.8082332611084, "logits_per_token": -2.0818334579467774, "logits_per_char": -0.41636669158935546, "num_chars": 50}, {"sum_logits": -28.061731338500977, "num_tokens": 11, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -32.746826171875, "logits_per_token": -2.5510664853182705, "logits_per_char": -0.46002838259837664, "num_chars": 61}, {"sum_logits": -29.492969512939453, "num_tokens": 13, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -36.503841400146484, "logits_per_token": -2.268689962533804, "logits_per_char": -0.47569305666031375, "num_chars": 62}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 397, "native_id": "Mercury_SC_LBS11003", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 26.00657081604004, "incorrect_loss_raw": 26.0122807820638, "correct_loss_per_char": 0.5533312939582987, "incorrect_loss_per_char": 0.635138527387805, "correct_loss_per_token": 2.8896189795600042, "incorrect_loss_per_token": 3.3827045531499955, "correct_loss_uncond": -14.324468612670898, "incorrect_loss_uncond": -11.124094009399414}, "model_output": [{"sum_logits": -34.87183380126953, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -46.437713623046875, "logits_per_token": -4.358979225158691, "logits_per_char": -0.7419539106653091, "num_chars": 47}, {"sum_logits": -21.1285400390625, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -33.114803314208984, "logits_per_token": -2.6410675048828125, "logits_per_char": -0.515330244855183, "num_chars": 41}, {"sum_logits": -22.036468505859375, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -31.85660743713379, "logits_per_token": -3.148066929408482, "logits_per_char": -0.6481314266429228, "num_chars": 34}, {"sum_logits": -26.00657081604004, "num_tokens": 9, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -40.33103942871094, "logits_per_token": -2.8896189795600042, "logits_per_char": -0.5533312939582987, "num_chars": 47}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 398, "native_id": "MCAS_2005_8_2", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.666328430175781, "incorrect_loss_raw": 9.109538714090982, "correct_loss_per_char": 0.18517981635199654, "incorrect_loss_per_char": 0.16603441382128178, "correct_loss_per_token": 0.9721940358479818, "incorrect_loss_per_token": 0.8244471012017666, "correct_loss_uncond": -20.170305252075195, "incorrect_loss_uncond": -20.084173520406086}, "model_output": [{"sum_logits": -8.415702819824219, "num_tokens": 10, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -28.776294708251953, "logits_per_token": -0.8415702819824219, "logits_per_char": -0.17174903713926978, "num_chars": 49}, {"sum_logits": -7.6670331954956055, "num_tokens": 10, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -27.31925392150879, "logits_per_token": -0.7667033195495605, "logits_per_char": -0.1533406639099121, "num_chars": 50}, {"sum_logits": -11.666328430175781, "num_tokens": 12, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -31.836633682250977, "logits_per_token": -0.9721940358479818, "logits_per_char": -0.18517981635199654, "num_chars": 63}, {"sum_logits": -11.245880126953125, "num_tokens": 13, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -31.48558807373047, "logits_per_token": -0.8650677020733173, "logits_per_char": -0.17301354041466346, "num_chars": 65}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 399, "native_id": "ACTAAP_2010_7_14", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.410430908203125, "incorrect_loss_raw": 30.291287740071613, "correct_loss_per_char": 0.46200783469460227, "incorrect_loss_per_char": 0.5750587794969609, "correct_loss_per_token": 1.9546485314002404, "incorrect_loss_per_token": 2.6898135079277883, "correct_loss_uncond": -8.870185852050781, "incorrect_loss_uncond": -7.917245229085286}, "model_output": [{"sum_logits": -32.838531494140625, "num_tokens": 11, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -38.69389343261719, "logits_per_token": -2.985321044921875, "logits_per_char": -0.6438927743949142, "num_chars": 51}, {"sum_logits": -32.71512222290039, "num_tokens": 11, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -39.05622863769531, "logits_per_token": -2.974102020263672, "logits_per_char": -0.6291369658250076, "num_chars": 52}, {"sum_logits": -25.410430908203125, "num_tokens": 13, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -34.280616760253906, "logits_per_token": -1.9546485314002404, "logits_per_char": -0.46200783469460227, "num_chars": 55}, {"sum_logits": -25.320209503173828, "num_tokens": 12, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -36.8754768371582, "logits_per_token": -2.110017458597819, "logits_per_char": -0.4521465982709612, "num_chars": 56}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 400, "native_id": "NYSEDREGENTS_2008_4_15", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 1.385011911392212, "incorrect_loss_raw": 1.9914895296096802, "correct_loss_per_char": 0.23083531856536865, "incorrect_loss_per_char": 0.4265514612197876, "correct_loss_per_token": 1.385011911392212, "incorrect_loss_per_token": 1.9914895296096802, "correct_loss_uncond": -10.631229162216187, "incorrect_loss_uncond": -10.646589159965515}, "model_output": [{"sum_logits": -2.6516551971435547, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -12.094873428344727, "logits_per_token": -2.6516551971435547, "logits_per_char": -0.6629137992858887, "num_chars": 4}, {"sum_logits": -1.3313238620758057, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": true, "sum_logits_uncond": -13.181283950805664, "logits_per_token": -1.3313238620758057, "logits_per_char": -0.19018912315368652, "num_chars": 7}, {"sum_logits": -1.385011911392212, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -12.016241073608398, "logits_per_token": -1.385011911392212, "logits_per_char": -0.23083531856536865, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 401, "native_id": "Mercury_7107240", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.842609405517578, "incorrect_loss_raw": 29.423010508219402, "correct_loss_per_char": 0.5280869801839193, "incorrect_loss_per_char": 0.5612119063352927, "correct_loss_per_token": 3.1685218811035156, "incorrect_loss_per_token": 3.178873975052793, "correct_loss_uncond": -11.701534271240234, "incorrect_loss_uncond": -10.569405237833658}, "model_output": [{"sum_logits": -15.842609405517578, "num_tokens": 5, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -27.544143676757812, "logits_per_token": -3.1685218811035156, "logits_per_char": -0.5280869801839193, "num_chars": 30}, {"sum_logits": -18.061359405517578, "num_tokens": 6, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -25.558805465698242, "logits_per_token": -3.010226567586263, "logits_per_char": -0.45153398513793946, "num_chars": 40}, {"sum_logits": -32.92980194091797, "num_tokens": 9, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -46.672325134277344, "logits_per_token": -3.6588668823242188, "logits_per_char": -0.6585960388183594, "num_chars": 50}, {"sum_logits": -37.277870178222656, "num_tokens": 13, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -47.746116638183594, "logits_per_token": -2.8675284752478967, "logits_per_char": -0.5735056950495794, "num_chars": 65}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 402, "native_id": "Mercury_7218628", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 22.85421371459961, "incorrect_loss_raw": 29.987797419230144, "correct_loss_per_char": 0.5314933421999909, "incorrect_loss_per_char": 0.6592155358730218, "correct_loss_per_token": 2.5393570793999567, "incorrect_loss_per_token": 2.9948532220089077, "correct_loss_uncond": -6.271142959594727, "incorrect_loss_uncond": -5.243466059366862}, "model_output": [{"sum_logits": -29.504776000976562, "num_tokens": 10, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -31.02548599243164, "logits_per_token": -2.9504776000976562, "logits_per_char": -0.5900955200195312, "num_chars": 50}, {"sum_logits": -26.623289108276367, "num_tokens": 9, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -33.88129806518555, "logits_per_token": -2.9581432342529297, "logits_per_char": -0.682648438673753, "num_chars": 39}, {"sum_logits": -22.85421371459961, "num_tokens": 9, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -29.125356674194336, "logits_per_token": -2.5393570793999567, "logits_per_char": -0.5314933421999909, "num_chars": 43}, {"sum_logits": -33.8353271484375, "num_tokens": 11, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -40.78700637817383, "logits_per_token": -3.0759388316761362, "logits_per_char": -0.7049026489257812, "num_chars": 48}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 403, "native_id": "MSA_2013_5_23", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.274757385253906, "incorrect_loss_raw": 14.786112149556478, "correct_loss_per_char": 0.5377607954309341, "incorrect_loss_per_char": 0.41653135420305004, "correct_loss_per_token": 3.1593446731567383, "incorrect_loss_per_token": 2.5302070799328034, "correct_loss_uncond": -13.710243225097656, "incorrect_loss_uncond": -13.95181941986084}, "model_output": [{"sum_logits": -14.374048233032227, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -27.495620727539062, "logits_per_token": -2.395674705505371, "logits_per_char": -0.4491890072822571, "num_chars": 32}, {"sum_logits": -15.950843811035156, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -29.2581787109375, "logits_per_token": -3.190168762207031, "logits_per_char": -0.43110388678473394, "num_chars": 37}, {"sum_logits": -14.03344440460205, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -29.45999526977539, "logits_per_token": -2.0047777720860074, "logits_per_char": -0.3693011685421592, "num_chars": 38}, {"sum_logits": -25.274757385253906, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -38.98500061035156, "logits_per_token": -3.1593446731567383, "logits_per_char": -0.5377607954309341, "num_chars": 47}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 404, "native_id": "Mercury_7081725", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.44321060180664, "incorrect_loss_raw": 22.19443702697754, "correct_loss_per_char": 0.47680044174194336, "incorrect_loss_per_char": 0.6740222027989663, "correct_loss_per_token": 5.72160530090332, "incorrect_loss_per_token": 5.31229419708252, "correct_loss_uncond": -8.508766174316406, "incorrect_loss_uncond": -9.463674545288086}, "model_output": [{"sum_logits": -11.44321060180664, "num_tokens": 2, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -19.951976776123047, "logits_per_token": -5.72160530090332, "logits_per_char": -0.47680044174194336, "num_chars": 24}, {"sum_logits": -22.631961822509766, "num_tokens": 4, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -33.80984115600586, "logits_per_token": -5.657990455627441, "logits_per_char": -0.7543987274169922, "num_chars": 30}, {"sum_logits": -14.178903579711914, "num_tokens": 5, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -27.730159759521484, "logits_per_token": -2.835780715942383, "logits_per_char": -0.4170265758738798, "num_chars": 34}, {"sum_logits": -29.772445678710938, "num_tokens": 4, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -33.43433380126953, "logits_per_token": -7.443111419677734, "logits_per_char": -0.8506413051060268, "num_chars": 35}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 405, "native_id": "Mercury_SC_413542", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.75276184082031, "incorrect_loss_raw": 31.54936154683431, "correct_loss_per_char": 0.7616921358330305, "incorrect_loss_per_char": 0.7248691453114868, "correct_loss_per_token": 3.639195760091146, "incorrect_loss_per_token": 3.9436701933542886, "correct_loss_uncond": -9.557075500488281, "incorrect_loss_uncond": -6.784035364786784}, "model_output": [{"sum_logits": -32.75276184082031, "num_tokens": 9, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -42.309837341308594, "logits_per_token": -3.639195760091146, "logits_per_char": -0.7616921358330305, "num_chars": 43}, {"sum_logits": -28.35394859313965, "num_tokens": 8, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -34.07965087890625, "logits_per_token": -3.544243574142456, "logits_per_char": -0.6750940141223726, "num_chars": 42}, {"sum_logits": -44.289588928222656, "num_tokens": 8, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -47.490360260009766, "logits_per_token": -5.536198616027832, "logits_per_char": -0.962817150613536, "num_chars": 46}, {"sum_logits": -22.004547119140625, "num_tokens": 8, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -33.430179595947266, "logits_per_token": -2.750568389892578, "logits_per_char": -0.5366962711985518, "num_chars": 41}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 406, "native_id": "Mercury_SC_407302", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.675825119018555, "incorrect_loss_raw": 19.652772903442383, "correct_loss_per_char": 0.5047520302437447, "incorrect_loss_per_char": 0.5283355228440682, "correct_loss_per_token": 2.6679750170026506, "incorrect_loss_per_token": 2.801741027832031, "correct_loss_uncond": -26.105188369750977, "incorrect_loss_uncond": -18.093425114949543}, "model_output": [{"sum_logits": -13.936277389526367, "num_tokens": 5, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -34.99331283569336, "logits_per_token": -2.7872554779052736, "logits_per_char": -0.5360106688279372, "num_chars": 26}, {"sum_logits": -22.393569946289062, "num_tokens": 6, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -36.051815032958984, "logits_per_token": -3.7322616577148438, "logits_per_char": -0.6220436096191406, "num_chars": 36}, {"sum_logits": -18.675825119018555, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -44.78101348876953, "logits_per_token": -2.6679750170026506, "logits_per_char": -0.5047520302437447, "num_chars": 37}, {"sum_logits": -22.62847137451172, "num_tokens": 12, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -42.19346618652344, "logits_per_token": -1.8857059478759766, "logits_per_char": -0.42695229008512675, "num_chars": 53}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 407, "native_id": "Mercury_175053", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.473712921142578, "incorrect_loss_raw": 5.487359523773193, "correct_loss_per_char": 0.6052652086530413, "incorrect_loss_per_char": 0.3799852005073003, "correct_loss_per_token": 2.824570973714193, "incorrect_loss_per_token": 1.8291198412577312, "correct_loss_uncond": -10.193809509277344, "incorrect_loss_uncond": -12.215862115224203}, "model_output": [{"sum_logits": -4.02160120010376, "num_tokens": 3, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -17.675376892089844, "logits_per_token": -1.34053373336792, "logits_per_char": -0.251350075006485, "num_chars": 16}, {"sum_logits": -4.371585845947266, "num_tokens": 3, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -17.403579711914062, "logits_per_token": -1.4571952819824219, "logits_per_char": -0.31225613185337614, "num_chars": 14}, {"sum_logits": -8.068891525268555, "num_tokens": 3, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -18.03070831298828, "logits_per_token": -2.6896305084228516, "logits_per_char": -0.5763493946620396, "num_chars": 14}, {"sum_logits": -8.473712921142578, "num_tokens": 3, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -18.667522430419922, "logits_per_token": -2.824570973714193, "logits_per_char": -0.6052652086530413, "num_chars": 14}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 408, "native_id": "Mercury_7161315", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.517162322998047, "incorrect_loss_raw": 19.18329429626465, "correct_loss_per_char": 0.5925569032367907, "incorrect_loss_per_char": 0.5253705434631883, "correct_loss_per_token": 2.814645290374756, "incorrect_loss_per_token": 2.7215754077548073, "correct_loss_uncond": -9.317054748535156, "incorrect_loss_uncond": -11.026430130004883}, "model_output": [{"sum_logits": -12.505393981933594, "num_tokens": 6, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -27.723073959350586, "logits_per_token": -2.0842323303222656, "logits_per_char": -0.41684646606445314, "num_chars": 30}, {"sum_logits": -22.517162322998047, "num_tokens": 8, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -31.834217071533203, "logits_per_token": -2.814645290374756, "logits_per_char": -0.5925569032367907, "num_chars": 38}, {"sum_logits": -25.19623565673828, "num_tokens": 7, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -30.581274032592773, "logits_per_token": -3.599462236676897, "logits_per_char": -0.66305883307206, "num_chars": 38}, {"sum_logits": -19.84825325012207, "num_tokens": 8, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -32.324825286865234, "logits_per_token": -2.481031656265259, "logits_per_char": -0.49620633125305175, "num_chars": 40}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 409, "native_id": "Mercury_189070", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 10.507967948913574, "incorrect_loss_raw": 16.83034388224284, "correct_loss_per_char": 0.2443713476491529, "incorrect_loss_per_char": 0.3441300283899871, "correct_loss_per_token": 1.1675519943237305, "incorrect_loss_per_token": 1.504760333469936, "correct_loss_uncond": -19.33432102203369, "incorrect_loss_uncond": -17.14526112874349}, "model_output": [{"sum_logits": -10.507967948913574, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -29.842288970947266, "logits_per_token": -1.1675519943237305, "logits_per_char": -0.2443713476491529, "num_chars": 43}, {"sum_logits": -10.432580947875977, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -29.099594116210938, "logits_per_token": -1.1591756608751085, "logits_per_char": -0.2219698074016165, "num_chars": 47}, {"sum_logits": -12.443443298339844, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -29.19861602783203, "logits_per_token": -1.3826048109266493, "logits_per_char": -0.2893824022869731, "num_chars": 43}, {"sum_logits": -27.615007400512695, "num_tokens": 14, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -43.628604888916016, "logits_per_token": -1.9725005286080497, "logits_per_char": -0.5210378754813716, "num_chars": 53}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 410, "native_id": "Mercury_7189123", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 20.05063247680664, "incorrect_loss_raw": 17.899864832560223, "correct_loss_per_char": 0.32869889306240396, "incorrect_loss_per_char": 0.3778418301609754, "correct_loss_per_token": 1.8227847706187854, "incorrect_loss_per_token": 1.8921705520514287, "correct_loss_uncond": -18.651599884033203, "incorrect_loss_uncond": -22.687341690063477}, "model_output": [{"sum_logits": -18.569551467895508, "num_tokens": 8, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -45.414249420166016, "logits_per_token": -2.3211939334869385, "logits_per_char": -0.476142345330654, "num_chars": 39}, {"sum_logits": -20.05063247680664, "num_tokens": 11, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -38.702232360839844, "logits_per_token": -1.8227847706187854, "logits_per_char": -0.32869889306240396, "num_chars": 61}, {"sum_logits": -17.78451919555664, "num_tokens": 10, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -32.75313186645508, "logits_per_token": -1.778451919555664, "logits_per_char": -0.3420099845299354, "num_chars": 52}, {"sum_logits": -17.345523834228516, "num_tokens": 11, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -43.59423828125, "logits_per_token": -1.5768658031116833, "logits_per_char": -0.31537316062233667, "num_chars": 55}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 411, "native_id": "Mercury_SC_402171", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.8936710357666, "incorrect_loss_raw": 14.037589391072592, "correct_loss_per_char": 0.5398191724504743, "incorrect_loss_per_char": 0.6841307146089792, "correct_loss_per_token": 2.0992967817518444, "incorrect_loss_per_token": 3.020088418324788, "correct_loss_uncond": -19.9998722076416, "incorrect_loss_uncond": -13.600423812866211}, "model_output": [{"sum_logits": -12.754232406616211, "num_tokens": 4, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -19.63114356994629, "logits_per_token": -3.1885581016540527, "logits_per_char": -0.8502821604410807, "num_chars": 15}, {"sum_logits": -17.815994262695312, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -34.489017486572266, "logits_per_token": -3.5631988525390623, "logits_per_char": -0.774608446204144, "num_chars": 23}, {"sum_logits": -11.54254150390625, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -28.79387855529785, "logits_per_token": -2.30850830078125, "logits_per_char": -0.42750153718171297, "num_chars": 27}, {"sum_logits": -18.8936710357666, "num_tokens": 9, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -38.8935432434082, "logits_per_token": -2.0992967817518444, "logits_per_char": -0.5398191724504743, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 412, "native_id": "Mercury_7217368", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 6.583172798156738, "incorrect_loss_raw": 7.875348885854085, "correct_loss_per_char": 0.6583172798156738, "incorrect_loss_per_char": 0.8707987925018927, "correct_loss_per_token": 3.291586399078369, "incorrect_loss_per_token": 4.959737141927083, "correct_loss_uncond": -9.792370796203613, "incorrect_loss_uncond": -7.828847408294678}, "model_output": [{"sum_logits": -6.132376194000244, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -15.020330429077148, "logits_per_token": -6.132376194000244, "logits_per_char": -1.0220626990000408, "num_chars": 6}, {"sum_logits": -6.583172798156738, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -16.37554359436035, "logits_per_token": -3.291586399078369, "logits_per_char": -0.6583172798156738, "num_chars": 10}, {"sum_logits": -9.147500991821289, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -15.49135971069336, "logits_per_token": -4.5737504959106445, "logits_per_char": -0.8315909992564808, "num_chars": 11}, {"sum_logits": -8.346169471740723, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -16.60089874267578, "logits_per_token": -4.173084735870361, "logits_per_char": -0.7587426792491566, "num_chars": 11}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 413, "native_id": "Mercury_LBS10933", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.253443717956543, "incorrect_loss_raw": 4.935968001683553, "correct_loss_per_char": 0.5253443717956543, "incorrect_loss_per_char": 0.5650859574476877, "correct_loss_per_token": 1.3133609294891357, "incorrect_loss_per_token": 1.9488835732142131, "correct_loss_uncond": -9.256409645080566, "incorrect_loss_uncond": -9.795748472213745}, "model_output": [{"sum_logits": -5.253443717956543, "num_tokens": 4, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -14.50985336303711, "logits_per_token": -1.3133609294891357, "logits_per_char": -0.5253443717956543, "num_chars": 10}, {"sum_logits": -4.666006565093994, "num_tokens": 2, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -14.735342025756836, "logits_per_token": -2.333003282546997, "logits_per_char": -0.5832508206367493, "num_chars": 8}, {"sum_logits": -6.229205131530762, "num_tokens": 4, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -15.381747245788574, "logits_per_token": -1.5573012828826904, "logits_per_char": -0.6229205131530762, "num_chars": 10}, {"sum_logits": -3.9126923084259033, "num_tokens": 2, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -14.078060150146484, "logits_per_token": -1.9563461542129517, "logits_per_char": -0.4890865385532379, "num_chars": 8}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 414, "native_id": "Mercury_7223160", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.42304801940918, "incorrect_loss_raw": 9.689228216807047, "correct_loss_per_char": 0.4979202167407886, "incorrect_loss_per_char": 0.3105013543812593, "correct_loss_per_token": 2.63186400277274, "incorrect_loss_per_token": 1.7573159641689724, "correct_loss_uncond": -17.777162551879883, "incorrect_loss_uncond": -16.284300327301025}, "model_output": [{"sum_logits": -6.425478458404541, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -21.95387840270996, "logits_per_token": -1.2850956916809082, "logits_per_char": -0.27936862862628437, "num_chars": 23}, {"sum_logits": -6.394535064697266, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -21.74803352355957, "logits_per_token": -1.278907012939453, "logits_per_char": -0.2459436563345102, "num_chars": 26}, {"sum_logits": -18.42304801940918, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -36.20021057128906, "logits_per_token": -2.63186400277274, "logits_per_char": -0.4979202167407886, "num_chars": 37}, {"sum_logits": -16.247671127319336, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -34.21867370605469, "logits_per_token": -2.707945187886556, "logits_per_char": -0.4061917781829834, "num_chars": 40}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 415, "native_id": "Mercury_SC_401324", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 26.854328155517578, "incorrect_loss_raw": 24.110352834065754, "correct_loss_per_char": 0.5370865631103515, "incorrect_loss_per_char": 0.6599353118839427, "correct_loss_per_token": 2.4413025595925073, "incorrect_loss_per_token": 2.9596454824720113, "correct_loss_uncond": -18.298542022705078, "incorrect_loss_uncond": -12.219778696695963}, "model_output": [{"sum_logits": -22.314847946166992, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -28.31409454345703, "logits_per_token": -3.1878354208809987, "logits_per_char": -0.6973389983177185, "num_chars": 32}, {"sum_logits": -27.579198837280273, "num_tokens": 8, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -41.49897766113281, "logits_per_token": -3.447399854660034, "logits_per_char": -0.707158944545648, "num_chars": 39}, {"sum_logits": -22.43701171875, "num_tokens": 10, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -39.17732238769531, "logits_per_token": -2.243701171875, "logits_per_char": -0.5753079927884616, "num_chars": 39}, {"sum_logits": -26.854328155517578, "num_tokens": 11, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -45.152870178222656, "logits_per_token": -2.4413025595925073, "logits_per_char": -0.5370865631103515, "num_chars": 50}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 416, "native_id": "LEAP_2001_8_10379", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 31.56415557861328, "incorrect_loss_raw": 31.00299326578776, "correct_loss_per_char": 0.4445655715297645, "incorrect_loss_per_char": 0.40140533961916125, "correct_loss_per_token": 2.8694686889648438, "incorrect_loss_per_token": 2.027885255990205, "correct_loss_uncond": -25.779136657714844, "incorrect_loss_uncond": -21.808802286783855}, "model_output": [{"sum_logits": -31.56415557861328, "num_tokens": 11, "num_tokens_all": 261, "is_greedy": false, "sum_logits_uncond": -57.343292236328125, "logits_per_token": -2.8694686889648438, "logits_per_char": -0.4445655715297645, "num_chars": 71}, {"sum_logits": -25.046239852905273, "num_tokens": 12, "num_tokens_all": 262, "is_greedy": false, "sum_logits_uncond": -44.81977081298828, "logits_per_token": -2.087186654408773, "logits_per_char": -0.41059409594926677, "num_chars": 61}, {"sum_logits": -36.1731071472168, "num_tokens": 18, "num_tokens_all": 268, "is_greedy": false, "sum_logits_uncond": -62.58860778808594, "logits_per_token": -2.0096170637342663, "logits_per_char": -0.3807695489180715, "num_chars": 95}, {"sum_logits": -31.78963279724121, "num_tokens": 16, "num_tokens_all": 266, "is_greedy": false, "sum_logits_uncond": -51.027008056640625, "logits_per_token": -1.9868520498275757, "logits_per_char": -0.41285237399014557, "num_chars": 77}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 417, "native_id": "VASoL_2009_5_30", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 29.289321899414062, "incorrect_loss_raw": 23.32718849182129, "correct_loss_per_char": 1.0460472106933594, "incorrect_loss_per_char": 1.02919629381237, "correct_loss_per_token": 4.881553649902344, "incorrect_loss_per_token": 4.638340112898085, "correct_loss_uncond": -4.533771514892578, "incorrect_loss_uncond": -2.5940539042154946}, "model_output": [{"sum_logits": -17.40520668029785, "num_tokens": 4, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -21.01182746887207, "logits_per_token": -4.351301670074463, "logits_per_char": -0.8288193657284691, "num_chars": 21}, {"sum_logits": -24.029766082763672, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -26.969388961791992, "logits_per_token": -4.8059532165527346, "logits_per_char": -1.2014883041381836, "num_chars": 20}, {"sum_logits": -28.546592712402344, "num_tokens": 6, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -29.78251075744629, "logits_per_token": -4.757765452067058, "logits_per_char": -1.057281211570457, "num_chars": 27}, {"sum_logits": -29.289321899414062, "num_tokens": 6, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -33.82309341430664, "logits_per_token": -4.881553649902344, "logits_per_char": -1.0460472106933594, "num_chars": 28}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 418, "native_id": "Mercury_416404", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.222604751586914, "incorrect_loss_raw": 19.086037317911785, "correct_loss_per_char": 0.4570302783318286, "incorrect_loss_per_char": 0.4633354068937756, "correct_loss_per_token": 2.6914005279541016, "incorrect_loss_per_token": 2.3955089049364524, "correct_loss_uncond": -14.529104232788086, "incorrect_loss_uncond": -13.368338902791342}, "model_output": [{"sum_logits": -18.193405151367188, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -25.850482940673828, "logits_per_token": -2.599057878766741, "logits_per_char": -0.5198115757533482, "num_chars": 35}, {"sum_logits": -17.780101776123047, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.08799743652344, "logits_per_token": -2.222512722015381, "logits_per_char": -0.4445025444030762, "num_chars": 40}, {"sum_logits": -21.284605026245117, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -38.42464828491211, "logits_per_token": -2.364956114027235, "logits_per_char": -0.42569210052490236, "num_chars": 50}, {"sum_logits": -24.222604751586914, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -38.751708984375, "logits_per_token": -2.6914005279541016, "logits_per_char": -0.4570302783318286, "num_chars": 53}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 419, "native_id": "Mercury_7103530", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 18.840206146240234, "incorrect_loss_raw": 22.662755966186523, "correct_loss_per_char": 0.4595172230790301, "incorrect_loss_per_char": 0.6950123600543839, "correct_loss_per_token": 2.6914580208914622, "incorrect_loss_per_token": 3.759205106704954, "correct_loss_uncond": -12.884620666503906, "incorrect_loss_uncond": -9.702228546142578}, "model_output": [{"sum_logits": -16.18178367614746, "num_tokens": 5, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -29.660799026489258, "logits_per_token": -3.236356735229492, "logits_per_char": -0.6223762952364408, "num_chars": 26}, {"sum_logits": -18.840206146240234, "num_tokens": 7, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -31.72482681274414, "logits_per_token": -2.6914580208914622, "logits_per_char": -0.4595172230790301, "num_chars": 41}, {"sum_logits": -24.91252899169922, "num_tokens": 7, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -30.666690826416016, "logits_per_token": -3.5589327130998885, "logits_per_char": -0.7549251209605824, "num_chars": 33}, {"sum_logits": -26.89395523071289, "num_tokens": 6, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -36.76746368408203, "logits_per_token": -4.4823258717854815, "logits_per_char": -0.7077356639661287, "num_chars": 38}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 420, "native_id": "Mercury_7030870", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.966050148010254, "incorrect_loss_raw": 4.613504886627197, "correct_loss_per_char": 1.2415125370025635, "incorrect_loss_per_char": 0.7650233639611139, "correct_loss_per_token": 4.966050148010254, "incorrect_loss_per_token": 4.613504886627197, "correct_loss_uncond": -6.206540107727051, "incorrect_loss_uncond": -7.712380886077881}, "model_output": [{"sum_logits": -4.966050148010254, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -11.172590255737305, "logits_per_token": -4.966050148010254, "logits_per_char": -1.2415125370025635, "num_chars": 4}, {"sum_logits": -6.451544761657715, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -12.306753158569336, "logits_per_token": -6.451544761657715, "logits_per_char": -1.290308952331543, "num_chars": 5}, {"sum_logits": -3.307760715484619, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -12.399616241455078, "logits_per_token": -3.307760715484619, "logits_per_char": -0.5512934525807699, "num_chars": 6}, {"sum_logits": -4.081209182739258, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -12.27128791809082, "logits_per_token": -4.081209182739258, "logits_per_char": -0.4534676869710286, "num_chars": 9}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 421, "native_id": "LEAP__7_10348", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 17.50255584716797, "incorrect_loss_raw": 11.689884503682455, "correct_loss_per_char": 0.3804903445036515, "incorrect_loss_per_char": 0.27130408323827526, "correct_loss_per_token": 1.5911414406516335, "incorrect_loss_per_token": 1.18808703631263, "correct_loss_uncond": -24.666358947753906, "incorrect_loss_uncond": -21.54463799794515}, "model_output": [{"sum_logits": -7.820000648498535, "num_tokens": 9, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -30.31932830810547, "logits_per_token": -0.8688889609442817, "logits_per_char": -0.20051283714098808, "num_chars": 39}, {"sum_logits": -10.797483444213867, "num_tokens": 9, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -30.660423278808594, "logits_per_token": -1.1997203826904297, "logits_per_char": -0.2633532547369236, "num_chars": 41}, {"sum_logits": -17.50255584716797, "num_tokens": 11, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -42.168914794921875, "logits_per_token": -1.5911414406516335, "logits_per_char": -0.3804903445036515, "num_chars": 46}, {"sum_logits": -16.45216941833496, "num_tokens": 11, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -38.72381591796875, "logits_per_token": -1.4956517653031782, "logits_per_char": -0.35004615783691406, "num_chars": 47}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 422, "native_id": "Mercury_SC_406835", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.34395217895508, "incorrect_loss_raw": 22.84044615427653, "correct_loss_per_char": 0.5674377575255277, "incorrect_loss_per_char": 0.47050678671575064, "correct_loss_per_token": 2.6953293482462564, "incorrect_loss_per_token": 2.3435510211520723, "correct_loss_uncond": -9.571514129638672, "incorrect_loss_uncond": -12.17066478729248}, "model_output": [{"sum_logits": -28.604331970214844, "num_tokens": 12, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -41.76768112182617, "logits_per_token": -2.383694330851237, "logits_per_char": -0.4206519407384536, "num_chars": 68}, {"sum_logits": -32.34395217895508, "num_tokens": 12, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -41.91546630859375, "logits_per_token": -2.6953293482462564, "logits_per_char": -0.5674377575255277, "num_chars": 57}, {"sum_logits": -24.62765121459961, "num_tokens": 10, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -36.83582305908203, "logits_per_token": -2.462765121459961, "logits_per_char": -0.5130760669708252, "num_chars": 48}, {"sum_logits": -15.289355278015137, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -26.429828643798828, "logits_per_token": -2.1841936111450195, "logits_per_char": -0.477792352437973, "num_chars": 32}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 423, "native_id": "Mercury_178255", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 29.99679183959961, "incorrect_loss_raw": 22.45242754618327, "correct_loss_per_char": 0.5881723890117571, "incorrect_loss_per_char": 0.5209130125640467, "correct_loss_per_token": 2.4997326532999673, "incorrect_loss_per_token": 2.331942266406435, "correct_loss_uncond": -8.391170501708984, "incorrect_loss_uncond": -15.995187759399414}, "model_output": [{"sum_logits": -18.119182586669922, "num_tokens": 8, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -31.980022430419922, "logits_per_token": -2.2648978233337402, "logits_per_char": -0.5662244558334351, "num_chars": 32}, {"sum_logits": -29.99679183959961, "num_tokens": 12, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -38.387962341308594, "logits_per_token": -2.4997326532999673, "logits_per_char": -0.5881723890117571, "num_chars": 51}, {"sum_logits": -28.02118682861328, "num_tokens": 10, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -48.79277038574219, "logits_per_token": -2.802118682861328, "logits_per_char": -0.5961954644385804, "num_chars": 47}, {"sum_logits": -21.2169132232666, "num_tokens": 11, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -34.57005310058594, "logits_per_token": -1.9288102930242366, "logits_per_char": -0.40031911742012455, "num_chars": 53}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 424, "native_id": "MDSA_2012_8_16", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.018512725830078, "incorrect_loss_raw": 22.251454035441082, "correct_loss_per_char": 0.4569241896919582, "incorrect_loss_per_char": 0.4557251574417984, "correct_loss_per_token": 3.0026446751185825, "incorrect_loss_per_token": 3.331441152663458, "correct_loss_uncond": -19.36682891845703, "incorrect_loss_uncond": -17.305504480997723}, "model_output": [{"sum_logits": -19.23541259765625, "num_tokens": 6, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -33.33555603027344, "logits_per_token": -3.205902099609375, "logits_per_char": -0.42745361328125, "num_chars": 45}, {"sum_logits": -21.018512725830078, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -40.38534164428711, "logits_per_token": -3.0026446751185825, "logits_per_char": -0.4569241896919582, "num_chars": 46}, {"sum_logits": -21.9942569732666, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -42.98847961425781, "logits_per_token": -3.1420367104666576, "logits_per_char": -0.4488623872095225, "num_chars": 49}, {"sum_logits": -25.52469253540039, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -42.346839904785156, "logits_per_token": -3.6463846479143416, "logits_per_char": -0.4908594718346229, "num_chars": 52}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 425, "native_id": "Mercury_409645", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 9.477057456970215, "incorrect_loss_raw": 15.449719111124674, "correct_loss_per_char": 0.37908229827880857, "incorrect_loss_per_char": 0.5784993430076563, "correct_loss_per_token": 3.159019152323405, "incorrect_loss_per_token": 3.367078908284505, "correct_loss_uncond": -14.332337379455566, "incorrect_loss_uncond": -14.184745152791342}, "model_output": [{"sum_logits": -15.27033805847168, "num_tokens": 5, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -31.342206954956055, "logits_per_token": -3.054067611694336, "logits_per_char": -0.5265633813266096, "num_chars": 29}, {"sum_logits": -16.62810516357422, "num_tokens": 4, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -26.06781578063965, "logits_per_token": -4.157026290893555, "logits_per_char": -0.6928377151489258, "num_chars": 24}, {"sum_logits": -14.450714111328125, "num_tokens": 5, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -31.493370056152344, "logits_per_token": -2.890142822265625, "logits_per_char": -0.516096932547433, "num_chars": 28}, {"sum_logits": -9.477057456970215, "num_tokens": 3, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -23.80939483642578, "logits_per_token": -3.159019152323405, "logits_per_char": -0.37908229827880857, "num_chars": 25}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 426, "native_id": "TIMSS_2003_8_pg47", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 59.302005767822266, "incorrect_loss_raw": 37.50832494099935, "correct_loss_per_char": 0.8851045636988398, "incorrect_loss_per_char": 0.8331764472438937, "correct_loss_per_token": 4.235857554844448, "incorrect_loss_per_token": 4.031434408823649, "correct_loss_uncond": -9.592693328857422, "incorrect_loss_uncond": -5.10116449991862}, "model_output": [{"sum_logits": -38.873329162597656, "num_tokens": 10, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -41.217498779296875, "logits_per_token": -3.8873329162597656, "logits_per_char": -0.8098610242207845, "num_chars": 48}, {"sum_logits": -33.6722297668457, "num_tokens": 8, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -37.98273849487305, "logits_per_token": -4.209028720855713, "logits_per_char": -0.935339715745714, "num_chars": 36}, {"sum_logits": -59.302005767822266, "num_tokens": 14, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -68.89469909667969, "logits_per_token": -4.235857554844448, "logits_per_char": -0.8851045636988398, "num_chars": 67}, {"sum_logits": -39.97941589355469, "num_tokens": 10, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -48.628231048583984, "logits_per_token": -3.997941589355469, "logits_per_char": -0.7543286017651828, "num_chars": 53}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 427, "native_id": "NYSEDREGENTS_2010_8_16", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 4.094205856323242, "incorrect_loss_raw": 9.209301630655924, "correct_loss_per_char": 0.3149389120248648, "incorrect_loss_per_char": 0.5113361458373226, "correct_loss_per_token": 2.047102928161621, "incorrect_loss_per_token": 4.604650815327962, "correct_loss_uncond": -13.896596908569336, "incorrect_loss_uncond": -11.237675348917643}, "model_output": [{"sum_logits": -8.136247634887695, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -22.074535369873047, "logits_per_token": -4.068123817443848, "logits_per_char": -0.38744036356608075, "num_chars": 21}, {"sum_logits": -4.094205856323242, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -17.990802764892578, "logits_per_token": -2.047102928161621, "logits_per_char": -0.3149389120248648, "num_chars": 13}, {"sum_logits": -8.097058296203613, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -18.809829711914062, "logits_per_token": -4.048529148101807, "logits_per_char": -0.4762975468355067, "num_chars": 17}, {"sum_logits": -11.394598960876465, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -20.456565856933594, "logits_per_token": -5.697299480438232, "logits_per_char": -0.6702705271103803, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 428, "native_id": "Mercury_7159810", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.53396224975586, "incorrect_loss_raw": 18.641427040100098, "correct_loss_per_char": 0.3757718693126332, "incorrect_loss_per_char": 0.40140040593094245, "correct_loss_per_token": 1.83710691663954, "incorrect_loss_per_token": 1.8609614812966548, "correct_loss_uncond": -14.745428085327148, "incorrect_loss_uncond": -18.791353543599445}, "model_output": [{"sum_logits": -9.486403465270996, "num_tokens": 8, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -23.799013137817383, "logits_per_token": -1.1858004331588745, "logits_per_char": -0.28746677167487866, "num_chars": 33}, {"sum_logits": -16.53396224975586, "num_tokens": 9, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -31.279390335083008, "logits_per_token": -1.83710691663954, "logits_per_char": -0.3757718693126332, "num_chars": 44}, {"sum_logits": -19.300464630126953, "num_tokens": 10, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -37.713539123535156, "logits_per_token": -1.9300464630126952, "logits_per_char": -0.44884801465411517, "num_chars": 43}, {"sum_logits": -27.137413024902344, "num_tokens": 11, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -50.785789489746094, "logits_per_token": -2.467037547718395, "logits_per_char": -0.4678864314638335, "num_chars": 58}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 429, "native_id": "Mercury_7267523", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 2.6496548652648926, "incorrect_loss_raw": 3.0437351862589517, "correct_loss_per_char": 0.26496548652648927, "incorrect_loss_per_char": 0.2927992618273175, "correct_loss_per_token": 1.3248274326324463, "incorrect_loss_per_token": 1.5218675931294758, "correct_loss_uncond": -12.14637041091919, "incorrect_loss_uncond": -12.067908922831217}, "model_output": [{"sum_logits": -3.488889217376709, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -15.690596580505371, "logits_per_token": -1.7444446086883545, "logits_per_char": -0.4361111521720886, "num_chars": 8}, {"sum_logits": -2.6496548652648926, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": true, "sum_logits_uncond": -14.796025276184082, "logits_per_token": -1.3248274326324463, "logits_per_char": -0.26496548652648927, "num_chars": 10}, {"sum_logits": -3.2981791496276855, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -15.377877235412598, "logits_per_token": -1.6490895748138428, "logits_per_char": -0.2748482624689738, "num_chars": 12}, {"sum_logits": -2.344137191772461, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -14.266458511352539, "logits_per_token": -1.1720685958862305, "logits_per_char": -0.16743837084089006, "num_chars": 14}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 430, "native_id": "Mercury_SC_401006", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 5.4526848793029785, "incorrect_loss_raw": 6.847353935241699, "correct_loss_per_char": 0.9087808132171631, "incorrect_loss_per_char": 0.750908863850129, "correct_loss_per_token": 2.7263424396514893, "incorrect_loss_per_token": 3.4236769676208496, "correct_loss_uncond": -10.773648738861084, "incorrect_loss_uncond": -9.447640419006348}, "model_output": [{"sum_logits": -5.4526848793029785, "num_tokens": 2, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -16.226333618164062, "logits_per_token": -2.7263424396514893, "logits_per_char": -0.9087808132171631, "num_chars": 6}, {"sum_logits": -7.189495086669922, "num_tokens": 2, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -16.649169921875, "logits_per_token": -3.594747543334961, "logits_per_char": -0.8986868858337402, "num_chars": 8}, {"sum_logits": -6.799919128417969, "num_tokens": 2, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -16.687835693359375, "logits_per_token": -3.3999595642089844, "logits_per_char": -0.8499898910522461, "num_chars": 8}, {"sum_logits": -6.552647590637207, "num_tokens": 2, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -15.547977447509766, "logits_per_token": -3.2763237953186035, "logits_per_char": -0.5040498146644006, "num_chars": 13}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 431, "native_id": "ACTAAP_2010_7_12", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 25.384300231933594, "incorrect_loss_raw": 27.235890070597332, "correct_loss_per_char": 0.3133864226164641, "incorrect_loss_per_char": 0.4398665376033344, "correct_loss_per_token": 1.6922866821289062, "incorrect_loss_per_token": 2.4543665228018106, "correct_loss_uncond": -25.129165649414062, "incorrect_loss_uncond": -19.626854578653973}, "model_output": [{"sum_logits": -21.84164047241211, "num_tokens": 7, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -36.0146484375, "logits_per_token": -3.12023435320173, "logits_per_char": -0.4457477647431043, "num_chars": 49}, {"sum_logits": -24.5501651763916, "num_tokens": 13, "num_tokens_all": 234, "is_greedy": false, "sum_logits_uncond": -47.68272399902344, "logits_per_token": -1.8884742443378155, "logits_per_char": -0.4091694196065267, "num_chars": 60}, {"sum_logits": -35.31586456298828, "num_tokens": 15, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -56.89086151123047, "logits_per_token": -2.3543909708658854, "logits_per_char": -0.4646824284603721, "num_chars": 76}, {"sum_logits": -25.384300231933594, "num_tokens": 15, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -50.513465881347656, "logits_per_token": -1.6922866821289062, "logits_per_char": -0.3133864226164641, "num_chars": 81}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 432, "native_id": "MEAP_2005_8_13", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.99168586730957, "incorrect_loss_raw": 18.387177149454754, "correct_loss_per_char": 0.4235878960560944, "incorrect_loss_per_char": 0.38418666840755406, "correct_loss_per_token": 3.5702408381870816, "incorrect_loss_per_token": 1.9618198534454965, "correct_loss_uncond": -15.565572738647461, "incorrect_loss_uncond": -16.35382143656413}, "model_output": [{"sum_logits": -16.154388427734375, "num_tokens": 9, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -30.58385467529297, "logits_per_token": -1.7949320475260417, "logits_per_char": -0.3940094738471799, "num_chars": 41}, {"sum_logits": -15.969758987426758, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -34.32600784301758, "logits_per_token": -1.9962198734283447, "logits_per_char": -0.3471686736397121, "num_chars": 46}, {"sum_logits": -23.037384033203125, "num_tokens": 11, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -39.313133239746094, "logits_per_token": -2.094307639382102, "logits_per_char": -0.4113818577357701, "num_chars": 56}, {"sum_logits": -24.99168586730957, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -40.55725860595703, "logits_per_token": -3.5702408381870816, "logits_per_char": -0.4235878960560944, "num_chars": 59}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 433, "native_id": "Mercury_7164623", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.113829612731934, "incorrect_loss_raw": 3.4545766512552896, "correct_loss_per_char": 0.25711435079574585, "incorrect_loss_per_char": 0.23698529579021313, "correct_loss_per_token": 2.056914806365967, "incorrect_loss_per_token": 1.7272883256276448, "correct_loss_uncond": -13.190762519836426, "incorrect_loss_uncond": -12.511428356170654}, "model_output": [{"sum_logits": -4.3035969734191895, "num_tokens": 2, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -16.05345916748047, "logits_per_token": -2.1517984867095947, "logits_per_char": -0.3586330811182658, "num_chars": 12}, {"sum_logits": -1.4083876609802246, "num_tokens": 2, "num_tokens_all": 205, "is_greedy": true, "sum_logits_uncond": -15.03551197052002, "logits_per_token": -0.7041938304901123, "logits_per_char": -0.09389251073201497, "num_chars": 15}, {"sum_logits": -4.113829612731934, "num_tokens": 2, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -17.30459213256836, "logits_per_token": -2.056914806365967, "logits_per_char": -0.25711435079574585, "num_chars": 16}, {"sum_logits": -4.651745319366455, "num_tokens": 2, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -16.809043884277344, "logits_per_token": -2.3258726596832275, "logits_per_char": -0.2584302955203586, "num_chars": 18}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 434, "native_id": "Mercury_417127", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 14.267864227294922, "incorrect_loss_raw": 25.681841532389324, "correct_loss_per_char": 0.5284394158257378, "incorrect_loss_per_char": 0.6956210719095911, "correct_loss_per_token": 2.3779773712158203, "incorrect_loss_per_token": 3.8175584550887813, "correct_loss_uncond": -15.121292114257812, "incorrect_loss_uncond": -7.581125895182292}, "model_output": [{"sum_logits": -14.267864227294922, "num_tokens": 6, "num_tokens_all": 258, "is_greedy": false, "sum_logits_uncond": -29.389156341552734, "logits_per_token": -2.3779773712158203, "logits_per_char": -0.5284394158257378, "num_chars": 27}, {"sum_logits": -18.73921775817871, "num_tokens": 6, "num_tokens_all": 258, "is_greedy": false, "sum_logits_uncond": -29.413230895996094, "logits_per_token": -3.1232029596964517, "logits_per_char": -0.6461799226958176, "num_chars": 29}, {"sum_logits": -19.770170211791992, "num_tokens": 7, "num_tokens_all": 259, "is_greedy": false, "sum_logits_uncond": -26.59969711303711, "logits_per_token": -2.8243100302559987, "logits_per_char": -0.5648620060511997, "num_chars": 35}, {"sum_logits": -38.536136627197266, "num_tokens": 7, "num_tokens_all": 259, "is_greedy": false, "sum_logits_uncond": -43.77597427368164, "logits_per_token": -5.505162375313895, "logits_per_char": -0.875821286981756, "num_chars": 44}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 435, "native_id": "Mercury_411224", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.780457019805908, "incorrect_loss_raw": 6.700610160827637, "correct_loss_per_char": 0.9634095033009847, "incorrect_loss_per_char": 0.8950510289933947, "correct_loss_per_token": 1.445114254951477, "incorrect_loss_per_token": 1.6751525402069092, "correct_loss_uncond": -9.238987445831299, "incorrect_loss_uncond": -14.069956143697103}, "model_output": [{"sum_logits": -4.138182640075684, "num_tokens": 4, "num_tokens_all": 257, "is_greedy": false, "sum_logits_uncond": -15.115961074829102, "logits_per_token": -1.034545660018921, "logits_per_char": -0.6896971066792806, "num_chars": 6}, {"sum_logits": -5.780457019805908, "num_tokens": 4, "num_tokens_all": 257, "is_greedy": false, "sum_logits_uncond": -15.019444465637207, "logits_per_token": -1.445114254951477, "logits_per_char": -0.9634095033009847, "num_chars": 6}, {"sum_logits": -6.149578094482422, "num_tokens": 4, "num_tokens_all": 257, "is_greedy": false, "sum_logits_uncond": -22.859739303588867, "logits_per_token": -1.5373945236206055, "logits_per_char": -0.7686972618103027, "num_chars": 8}, {"sum_logits": -9.814069747924805, "num_tokens": 4, "num_tokens_all": 257, "is_greedy": false, "sum_logits_uncond": -24.33599853515625, "logits_per_token": -2.453517436981201, "logits_per_char": -1.2267587184906006, "num_chars": 8}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 436, "native_id": "TIMSS_2011_8_pg15", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.26010513305664, "incorrect_loss_raw": 21.66098976135254, "correct_loss_per_char": 0.3533350021120102, "incorrect_loss_per_char": 0.3367615851435197, "correct_loss_per_token": 1.7123157794658954, "incorrect_loss_per_token": 1.666229981642503, "correct_loss_uncond": -20.092174530029297, "incorrect_loss_uncond": -18.352839787801106}, "model_output": [{"sum_logits": -21.52362060546875, "num_tokens": 13, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -38.967044830322266, "logits_per_token": -1.6556631234975963, "logits_per_char": -0.3311326246995192, "num_chars": 65}, {"sum_logits": -22.306623458862305, "num_tokens": 13, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -38.412078857421875, "logits_per_token": -1.7158941122201772, "logits_per_char": -0.3379791433160955, "num_chars": 66}, {"sum_logits": -21.152725219726562, "num_tokens": 13, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -42.6623649597168, "logits_per_token": -1.6271327092097356, "logits_per_char": -0.34117298741494456, "num_chars": 62}, {"sum_logits": -22.26010513305664, "num_tokens": 13, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -42.35227966308594, "logits_per_token": -1.7123157794658954, "logits_per_char": -0.3533350021120102, "num_chars": 63}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 437, "native_id": "NYSEDREGENTS_2012_8_19", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 10.803892135620117, "incorrect_loss_raw": 12.842899322509766, "correct_loss_per_char": 0.7202594757080079, "incorrect_loss_per_char": 0.7452209192163805, "correct_loss_per_token": 5.401946067810059, "incorrect_loss_per_token": 5.232621987660726, "correct_loss_uncond": -10.425344467163086, "incorrect_loss_uncond": -7.115018208821614}, "model_output": [{"sum_logits": -14.265932083129883, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -21.05502700805664, "logits_per_token": -3.5664830207824707, "logits_per_char": -0.7132966041564941, "num_chars": 20}, {"sum_logits": -12.130940437316895, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -20.406627655029297, "logits_per_token": -6.065470218658447, "logits_per_char": -0.8087293624877929, "num_chars": 15}, {"sum_logits": -10.803892135620117, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.229236602783203, "logits_per_token": -5.401946067810059, "logits_per_char": -0.7202594757080079, "num_chars": 15}, {"sum_logits": -12.13182544708252, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -18.412097930908203, "logits_per_token": -6.06591272354126, "logits_per_char": -0.7136367910048541, "num_chars": 17}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 438, "native_id": "Mercury_7222460", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.267799377441406, "incorrect_loss_raw": 31.279032389322918, "correct_loss_per_char": 0.6305791536966959, "incorrect_loss_per_char": 0.5465528133350028, "correct_loss_per_token": 3.783474922180176, "incorrect_loss_per_token": 4.0964844567435135, "correct_loss_uncond": -5.160491943359375, "incorrect_loss_uncond": -12.013010660807291}, "model_output": [{"sum_logits": -30.267799377441406, "num_tokens": 8, "num_tokens_all": 243, "is_greedy": false, "sum_logits_uncond": -35.42829132080078, "logits_per_token": -3.783474922180176, "logits_per_char": -0.6305791536966959, "num_chars": 48}, {"sum_logits": -43.99704360961914, "num_tokens": 8, "num_tokens_all": 243, "is_greedy": false, "sum_logits_uncond": -49.54597091674805, "logits_per_token": -5.499630451202393, "logits_per_char": -0.7457126035528668, "num_chars": 59}, {"sum_logits": -18.490345001220703, "num_tokens": 8, "num_tokens_all": 243, "is_greedy": false, "sum_logits_uncond": -38.07234191894531, "logits_per_token": -2.311293125152588, "logits_per_char": -0.3133956779867916, "num_chars": 59}, {"sum_logits": -31.349708557128906, "num_tokens": 7, "num_tokens_all": 242, "is_greedy": false, "sum_logits_uncond": -42.257816314697266, "logits_per_token": -4.478529793875558, "logits_per_char": -0.5805501584653501, "num_chars": 54}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 439, "native_id": "Mercury_7007420", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.069609642028809, "incorrect_loss_raw": 18.137896696726482, "correct_loss_per_char": 0.47092530131340027, "incorrect_loss_per_char": 0.5203858109261367, "correct_loss_per_token": 2.152801377432687, "incorrect_loss_per_token": 2.5911280995323547, "correct_loss_uncond": -25.13723087310791, "incorrect_loss_uncond": -17.73861010869344}, "model_output": [{"sum_logits": -23.012510299682617, "num_tokens": 7, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -37.9057502746582, "logits_per_token": -3.287501471383231, "logits_per_char": -0.6973487969600793, "num_chars": 33}, {"sum_logits": -7.592257976531982, "num_tokens": 7, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -27.9036865234375, "logits_per_token": -1.0846082823617118, "logits_per_char": -0.23725806176662445, "num_chars": 32}, {"sum_logits": -15.069609642028809, "num_tokens": 7, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -40.20684051513672, "logits_per_token": -2.152801377432687, "logits_per_char": -0.47092530131340027, "num_chars": 32}, {"sum_logits": -23.808921813964844, "num_tokens": 7, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -41.82008361816406, "logits_per_token": -3.4012745448521207, "logits_per_char": -0.6265505740517064, "num_chars": 38}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 440, "native_id": "Mercury_SC_405710", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.497032165527344, "incorrect_loss_raw": 26.305599212646484, "correct_loss_per_char": 0.7570580618722098, "incorrect_loss_per_char": 0.9249016640678285, "correct_loss_per_token": 3.785290309361049, "incorrect_loss_per_token": 5.453041765424941, "correct_loss_uncond": -5.018749237060547, "incorrect_loss_uncond": -6.179037094116211}, "model_output": [{"sum_logits": -23.941116333007812, "num_tokens": 5, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -29.438695907592773, "logits_per_token": -4.7882232666015625, "logits_per_char": -0.9975465138753256, "num_chars": 24}, {"sum_logits": -28.89946174621582, "num_tokens": 4, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -35.34780502319336, "logits_per_token": -7.224865436553955, "logits_per_char": -1.032123633793422, "num_chars": 28}, {"sum_logits": -26.07621955871582, "num_tokens": 6, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -32.66740798950195, "logits_per_token": -4.346036593119304, "logits_per_char": -0.7450348445347377, "num_chars": 35}, {"sum_logits": -26.497032165527344, "num_tokens": 7, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -31.51578140258789, "logits_per_token": -3.785290309361049, "logits_per_char": -0.7570580618722098, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 441, "native_id": "Mercury_SC_401375", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.392179489135742, "incorrect_loss_raw": 3.86160675684611, "correct_loss_per_char": 0.6720163171941583, "incorrect_loss_per_char": 0.2493149993911622, "correct_loss_per_token": 7.392179489135742, "incorrect_loss_per_token": 2.9746317068735757, "correct_loss_uncond": -9.112604141235352, "incorrect_loss_uncond": -12.70729112625122}, "model_output": [{"sum_logits": -3.3651812076568604, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -14.643623352050781, "logits_per_token": -3.3651812076568604, "logits_per_char": -0.24037008626120432, "num_chars": 14}, {"sum_logits": -2.8977887630462646, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -15.710887908935547, "logits_per_token": -2.8977887630462646, "logits_per_char": -0.24148239692052206, "num_chars": 12}, {"sum_logits": -7.392179489135742, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -16.504783630371094, "logits_per_token": -7.392179489135742, "logits_per_char": -0.6720163171941583, "num_chars": 11}, {"sum_logits": -5.321850299835205, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -19.352182388305664, "logits_per_token": -2.6609251499176025, "logits_per_char": -0.26609251499176023, "num_chars": 20}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 442, "native_id": "VASoL_2010_3_22", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 21.882373809814453, "incorrect_loss_raw": 26.011517842610676, "correct_loss_per_char": 0.5914155083733637, "incorrect_loss_per_char": 0.7151802872513627, "correct_loss_per_token": 2.4313748677571616, "incorrect_loss_per_token": 3.014815577754268, "correct_loss_uncond": -21.496578216552734, "incorrect_loss_uncond": -15.467161814371744}, "model_output": [{"sum_logits": -21.882373809814453, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -43.37895202636719, "logits_per_token": -2.4313748677571616, "logits_per_char": -0.5914155083733637, "num_chars": 37}, {"sum_logits": -26.923736572265625, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -42.4836540222168, "logits_per_token": -3.365467071533203, "logits_per_char": -0.7478815714518229, "num_chars": 36}, {"sum_logits": -21.692764282226562, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -39.500057220458984, "logits_per_token": -2.410307142469618, "logits_per_char": -0.6025767856174045, "num_chars": 36}, {"sum_logits": -29.418052673339844, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -42.452327728271484, "logits_per_token": -3.2686725192599826, "logits_per_char": -0.7950825046848606, "num_chars": 37}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 443, "native_id": "Mercury_SC_408358", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 24.863243103027344, "incorrect_loss_raw": 18.851826985677082, "correct_loss_per_char": 0.3825114323542668, "incorrect_loss_per_char": 0.3089455934127217, "correct_loss_per_token": 2.2602948275479404, "incorrect_loss_per_token": 1.6804525077481927, "correct_loss_uncond": -15.098796844482422, "incorrect_loss_uncond": -12.755093892415365}, "model_output": [{"sum_logits": -24.863243103027344, "num_tokens": 11, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -39.962039947509766, "logits_per_token": -2.2602948275479404, "logits_per_char": -0.3825114323542668, "num_chars": 65}, {"sum_logits": -17.39595603942871, "num_tokens": 11, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -30.65325164794922, "logits_per_token": -1.5814505490389736, "logits_per_char": -0.30519221121804757, "num_chars": 57}, {"sum_logits": -19.761972427368164, "num_tokens": 13, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -30.76813507080078, "logits_per_token": -1.5201517251821666, "logits_per_char": -0.30878081917762756, "num_chars": 64}, {"sum_logits": -19.397552490234375, "num_tokens": 10, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -33.399375915527344, "logits_per_token": -1.9397552490234375, "logits_per_char": -0.31286374984248994, "num_chars": 62}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 444, "native_id": "NYSEDREGENTS_2013_8_42", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.451204299926758, "incorrect_loss_raw": 5.200283606847127, "correct_loss_per_char": 0.2876003583272298, "incorrect_loss_per_char": 0.3529268794013682, "correct_loss_per_token": 1.725602149963379, "incorrect_loss_per_token": 1.9947225968043008, "correct_loss_uncond": -14.605188369750977, "incorrect_loss_uncond": -14.790016253789267}, "model_output": [{"sum_logits": -5.6993865966796875, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -20.21866226196289, "logits_per_token": -2.8496932983398438, "logits_per_char": -0.4749488830566406, "num_chars": 12}, {"sum_logits": -3.451204299926758, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": true, "sum_logits_uncond": -18.056392669677734, "logits_per_token": -1.725602149963379, "logits_per_char": -0.2876003583272298, "num_chars": 12}, {"sum_logits": -6.054192066192627, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -22.55550765991211, "logits_per_token": -1.2108384132385255, "logits_per_char": -0.26322574200837506, "num_chars": 23}, {"sum_logits": -3.8472721576690674, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -17.19672966003418, "logits_per_token": -1.9236360788345337, "logits_per_char": -0.32060601313908893, "num_chars": 12}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 445, "native_id": "Mercury_SC_400661", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 18.511974334716797, "incorrect_loss_raw": 14.891372044881185, "correct_loss_per_char": 0.41137720743815104, "incorrect_loss_per_char": 0.7130508371852936, "correct_loss_per_token": 1.682906757701527, "incorrect_loss_per_token": 2.6559948656294083, "correct_loss_uncond": -23.063838958740234, "incorrect_loss_uncond": -11.609832763671875}, "model_output": [{"sum_logits": -15.14971923828125, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.614299774169922, "logits_per_token": -3.7874298095703125, "logits_per_char": -1.082122802734375, "num_chars": 14}, {"sum_logits": -11.760124206542969, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -26.121793746948242, "logits_per_token": -1.9600207010904949, "logits_per_char": -0.5345511002974077, "num_chars": 22}, {"sum_logits": -17.764272689819336, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -31.767520904541016, "logits_per_token": -2.220534086227417, "logits_per_char": -0.5224786085240981, "num_chars": 34}, {"sum_logits": -18.511974334716797, "num_tokens": 11, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -41.57581329345703, "logits_per_token": -1.682906757701527, "logits_per_char": -0.41137720743815104, "num_chars": 45}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 446, "native_id": "Mercury_SC_415422", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.2930850982666, "incorrect_loss_raw": 19.566239674886067, "correct_loss_per_char": 0.44586170196533204, "incorrect_loss_per_char": 0.503199640784182, "correct_loss_per_token": 2.0266440998424184, "incorrect_loss_per_token": 2.3365245042023837, "correct_loss_uncond": -13.09046745300293, "incorrect_loss_uncond": -11.370381673177084}, "model_output": [{"sum_logits": -8.774885177612305, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -18.920976638793945, "logits_per_token": -1.462480862935384, "logits_per_char": -0.3988584171641957, "num_chars": 22}, {"sum_logits": -24.178787231445312, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -31.80023765563965, "logits_per_token": -2.686531914605035, "logits_per_char": -0.525625809379246, "num_chars": 46}, {"sum_logits": -22.2930850982666, "num_tokens": 11, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -35.38355255126953, "logits_per_token": -2.0266440998424184, "logits_per_char": -0.44586170196533204, "num_chars": 50}, {"sum_logits": -25.745046615600586, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -42.08864974975586, "logits_per_token": -2.860560735066732, "logits_per_char": -0.5851146958091042, "num_chars": 44}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 447, "native_id": "Mercury_SC_400162", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.60728645324707, "incorrect_loss_raw": 17.641140619913738, "correct_loss_per_char": 0.2791391171907124, "incorrect_loss_per_char": 0.8345760570199229, "correct_loss_per_token": 1.3259108066558838, "incorrect_loss_per_token": 3.821302466922336, "correct_loss_uncond": -21.41533851623535, "incorrect_loss_uncond": -10.467577616373697}, "model_output": [{"sum_logits": -23.124820709228516, "num_tokens": 4, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -25.187959671020508, "logits_per_token": -5.781205177307129, "logits_per_char": -1.3602835711310892, "num_chars": 17}, {"sum_logits": -21.20337677001953, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -30.229061126708984, "logits_per_token": -3.5338961283365884, "logits_per_char": -0.7853102507414641, "num_chars": 27}, {"sum_logits": -8.595224380493164, "num_tokens": 4, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -28.909133911132812, "logits_per_token": -2.148806095123291, "logits_per_char": -0.35813434918721515, "num_chars": 24}, {"sum_logits": -10.60728645324707, "num_tokens": 8, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -32.02262496948242, "logits_per_token": -1.3259108066558838, "logits_per_char": -0.2791391171907124, "num_chars": 38}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 448, "native_id": "Mercury_7212328", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 22.242420196533203, "incorrect_loss_raw": 27.01508967081706, "correct_loss_per_char": 0.5172655859658885, "incorrect_loss_per_char": 0.7743419547671437, "correct_loss_per_token": 2.7803025245666504, "incorrect_loss_per_token": 4.111797491709392, "correct_loss_uncond": -16.768566131591797, "incorrect_loss_uncond": -5.648070017496745}, "model_output": [{"sum_logits": -34.174293518066406, "num_tokens": 8, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -35.29020690917969, "logits_per_token": -4.271786689758301, "logits_per_char": -0.8993235136333265, "num_chars": 38}, {"sum_logits": -22.242420196533203, "num_tokens": 8, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -39.010986328125, "logits_per_token": -2.7803025245666504, "logits_per_char": -0.5172655859658885, "num_chars": 43}, {"sum_logits": -29.233104705810547, "num_tokens": 8, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -38.69837951660156, "logits_per_token": -3.6541380882263184, "logits_per_char": -0.6219809511874584, "num_chars": 47}, {"sum_logits": -17.63787078857422, "num_tokens": 4, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -24.000892639160156, "logits_per_token": -4.409467697143555, "logits_per_char": -0.8017213994806464, "num_chars": 22}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 449, "native_id": "NCEOGA_2013_8_26", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 43.95810317993164, "incorrect_loss_raw": 25.878694534301758, "correct_loss_per_char": 0.6560910922377857, "incorrect_loss_per_char": 0.5204863093151272, "correct_loss_per_token": 3.9961911981756035, "incorrect_loss_per_token": 3.1299795423235213, "correct_loss_uncond": -13.229820251464844, "incorrect_loss_uncond": -11.527204513549805}, "model_output": [{"sum_logits": -27.66156005859375, "num_tokens": 9, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -41.37835693359375, "logits_per_token": -3.0735066731770835, "logits_per_char": -0.564521633848852, "num_chars": 49}, {"sum_logits": -24.056774139404297, "num_tokens": 7, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -34.85607147216797, "logits_per_token": -3.4366820199148997, "logits_per_char": -0.4454958173963759, "num_chars": 54}, {"sum_logits": -25.917749404907227, "num_tokens": 9, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -35.98326873779297, "logits_per_token": -2.8797499338785806, "logits_per_char": -0.5514414767001538, "num_chars": 47}, {"sum_logits": -43.95810317993164, "num_tokens": 11, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -57.187923431396484, "logits_per_token": -3.9961911981756035, "logits_per_char": -0.6560910922377857, "num_chars": 67}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 450, "native_id": "Mercury_SC_407696", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 25.696624755859375, "incorrect_loss_raw": 21.266709009806316, "correct_loss_per_char": 0.5975959245548692, "incorrect_loss_per_char": 0.47379485999230103, "correct_loss_per_token": 2.8551805284288196, "incorrect_loss_per_token": 2.5093940109172195, "correct_loss_uncond": -21.029083251953125, "incorrect_loss_uncond": -14.155961990356445}, "model_output": [{"sum_logits": -22.334407806396484, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -39.010536193847656, "logits_per_token": -3.190629686628069, "logits_per_char": -0.5726771232409354, "num_chars": 39}, {"sum_logits": -25.696624755859375, "num_tokens": 9, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -46.7257080078125, "logits_per_token": -2.8551805284288196, "logits_per_char": -0.5975959245548692, "num_chars": 43}, {"sum_logits": -17.1882381439209, "num_tokens": 9, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -30.903675079345703, "logits_per_token": -1.9098042382134333, "logits_per_char": -0.3906417759982022, "num_chars": 44}, {"sum_logits": -24.277481079101562, "num_tokens": 10, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -36.35380172729492, "logits_per_token": -2.427748107910156, "logits_per_char": -0.4580656807377653, "num_chars": 53}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 451, "native_id": "Mercury_SC_400052", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.213855743408203, "incorrect_loss_raw": 14.305787722269693, "correct_loss_per_char": 0.9475903828938802, "incorrect_loss_per_char": 0.9254320868296989, "correct_loss_per_token": 3.553463935852051, "incorrect_loss_per_token": 4.887445555792914, "correct_loss_uncond": -9.456779479980469, "incorrect_loss_uncond": -9.80895201365153}, "model_output": [{"sum_logits": -11.433182716369629, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -21.75023078918457, "logits_per_token": -5.7165913581848145, "logits_per_char": -1.0393802469426936, "num_chars": 11}, {"sum_logits": -14.213855743408203, "num_tokens": 4, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -23.670635223388672, "logits_per_token": -3.553463935852051, "logits_per_char": -0.9475903828938802, "num_chars": 15}, {"sum_logits": -18.587778091430664, "num_tokens": 4, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -27.118816375732422, "logits_per_token": -4.646944522857666, "logits_per_char": -0.9783041100752982, "num_chars": 19}, {"sum_logits": -12.896402359008789, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -23.47517204284668, "logits_per_token": -4.298800786336263, "logits_per_char": -0.7586119034711052, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 452, "native_id": "Mercury_7212870", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.656070709228516, "incorrect_loss_raw": 8.426116148630777, "correct_loss_per_char": 1.0596427917480469, "incorrect_loss_per_char": 0.8091650746085426, "correct_loss_per_token": 5.828035354614258, "incorrect_loss_per_token": 4.213058074315389, "correct_loss_uncond": -5.980522155761719, "incorrect_loss_uncond": -9.404335180918375}, "model_output": [{"sum_logits": -11.037358283996582, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -19.24686050415039, "logits_per_token": -5.518679141998291, "logits_per_char": -1.003396207636053, "num_chars": 11}, {"sum_logits": -7.698338985443115, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -16.352739334106445, "logits_per_token": -3.8491694927215576, "logits_per_char": -0.7698338985443115, "num_chars": 10}, {"sum_logits": -6.542651176452637, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -17.891754150390625, "logits_per_token": -3.2713255882263184, "logits_per_char": -0.6542651176452636, "num_chars": 10}, {"sum_logits": -11.656070709228516, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -17.636592864990234, "logits_per_token": -5.828035354614258, "logits_per_char": -1.0596427917480469, "num_chars": 11}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 453, "native_id": "NYSEDREGENTS_2010_8_35", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.416597366333008, "incorrect_loss_raw": 10.366411526997885, "correct_loss_per_char": 0.8012767204871545, "incorrect_loss_per_char": 0.45125535434527086, "correct_loss_per_token": 5.208298683166504, "incorrect_loss_per_token": 2.408757792578803, "correct_loss_uncond": -10.84852409362793, "incorrect_loss_uncond": -14.18945026397705}, "model_output": [{"sum_logits": -6.550637245178223, "num_tokens": 6, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -21.904884338378906, "logits_per_token": -1.0917728741963704, "logits_per_char": -0.17238519066258481, "num_chars": 38}, {"sum_logits": -15.362739562988281, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -31.903472900390625, "logits_per_token": -3.0725479125976562, "logits_per_char": -0.5689903541847512, "num_chars": 27}, {"sum_logits": -10.416597366333008, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -21.265121459960938, "logits_per_token": -5.208298683166504, "logits_per_char": -0.8012767204871545, "num_chars": 13}, {"sum_logits": -9.185857772827148, "num_tokens": 3, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -19.859228134155273, "logits_per_token": -3.061952590942383, "logits_per_char": -0.6123905181884766, "num_chars": 15}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 454, "native_id": "MCAS_2010_8_12005", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 30.11652183532715, "incorrect_loss_raw": 44.83969243367513, "correct_loss_per_char": 0.4705706536769867, "incorrect_loss_per_char": 0.6209856430035781, "correct_loss_per_token": 2.0077681223551433, "incorrect_loss_per_token": 3.518875228034126, "correct_loss_uncond": -17.11849021911621, "incorrect_loss_uncond": -7.720024108886719}, "model_output": [{"sum_logits": -30.11652183532715, "num_tokens": 15, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -47.23501205444336, "logits_per_token": -2.0077681223551433, "logits_per_char": -0.4705706536769867, "num_chars": 64}, {"sum_logits": -35.7143440246582, "num_tokens": 12, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -42.771236419677734, "logits_per_token": -2.9761953353881836, "logits_per_char": -0.5175991887631624, "num_chars": 69}, {"sum_logits": -54.876983642578125, "num_tokens": 14, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -62.635009765625, "logits_per_token": -3.9197845458984375, "logits_per_char": -0.751739501953125, "num_chars": 73}, {"sum_logits": -43.92774963378906, "num_tokens": 12, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -52.27290344238281, "logits_per_token": -3.6606458028157554, "logits_per_char": -0.5936182382944468, "num_chars": 74}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 455, "native_id": "Mercury_7218505", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 24.44109535217285, "incorrect_loss_raw": 38.285160064697266, "correct_loss_per_char": 0.4142558534266585, "incorrect_loss_per_char": 0.6555082876362006, "correct_loss_per_token": 2.221917759288441, "incorrect_loss_per_token": 3.7013905380711414, "correct_loss_uncond": -16.981721878051758, "incorrect_loss_uncond": -10.949995676676432}, "model_output": [{"sum_logits": -41.9514045715332, "num_tokens": 11, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -48.61003494262695, "logits_per_token": -3.813764051957564, "logits_per_char": -0.7915359353119472, "num_chars": 53}, {"sum_logits": -24.44109535217285, "num_tokens": 11, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -41.42281723022461, "logits_per_token": -2.221917759288441, "logits_per_char": -0.4142558534266585, "num_chars": 59}, {"sum_logits": -41.6424560546875, "num_tokens": 10, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -47.11080551147461, "logits_per_token": -4.16424560546875, "logits_per_char": -0.6940409342447916, "num_chars": 60}, {"sum_logits": -31.261619567871094, "num_tokens": 10, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -51.98462677001953, "logits_per_token": -3.1261619567871093, "logits_per_char": -0.480947993351863, "num_chars": 65}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 456, "native_id": "Mercury_SC_400853", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 18.454036712646484, "incorrect_loss_raw": 17.822582880655926, "correct_loss_per_char": 0.43938182649158297, "incorrect_loss_per_char": 0.6948390638977052, "correct_loss_per_token": 2.636290958949498, "incorrect_loss_per_token": 2.9154819912380643, "correct_loss_uncond": -20.09682846069336, "incorrect_loss_uncond": -13.1240603129069}, "model_output": [{"sum_logits": -10.903665542602539, "num_tokens": 4, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -29.00695037841797, "logits_per_token": -2.7259163856506348, "logits_per_char": -0.5738771338211862, "num_chars": 19}, {"sum_logits": -16.800460815429688, "num_tokens": 6, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -26.702489852905273, "logits_per_token": -2.8000768025716147, "logits_per_char": -0.6222392894603588, "num_chars": 27}, {"sum_logits": -25.763622283935547, "num_tokens": 8, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -37.130489349365234, "logits_per_token": -3.2204527854919434, "logits_per_char": -0.8884007684115706, "num_chars": 29}, {"sum_logits": -18.454036712646484, "num_tokens": 7, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -38.550865173339844, "logits_per_token": -2.636290958949498, "logits_per_char": -0.43938182649158297, "num_chars": 42}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 457, "native_id": "Mercury_7210455", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.787662506103516, "incorrect_loss_raw": 25.52222951253255, "correct_loss_per_char": 0.5644639707079121, "incorrect_loss_per_char": 0.5374905604588517, "correct_loss_per_token": 2.6170602278275923, "incorrect_loss_per_token": 2.65193925080476, "correct_loss_uncond": -8.45186996459961, "incorrect_loss_uncond": -7.860439300537109}, "model_output": [{"sum_logits": -25.20821762084961, "num_tokens": 10, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -36.45234680175781, "logits_per_token": -2.520821762084961, "logits_per_char": -0.4942787768794041, "num_chars": 51}, {"sum_logits": -24.435070037841797, "num_tokens": 10, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -31.72579574584961, "logits_per_token": -2.4435070037841795, "logits_per_char": -0.5198951071881234, "num_chars": 47}, {"sum_logits": -28.787662506103516, "num_tokens": 11, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -37.239532470703125, "logits_per_token": -2.6170602278275923, "logits_per_char": -0.5644639707079121, "num_chars": 51}, {"sum_logits": -26.92340087890625, "num_tokens": 9, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -31.969863891601562, "logits_per_token": -2.991488986545139, "logits_per_char": -0.5982977973090278, "num_chars": 45}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 458, "native_id": "Mercury_7174738", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 21.5174503326416, "incorrect_loss_raw": 29.70261510213216, "correct_loss_per_char": 0.32602197473699396, "incorrect_loss_per_char": 0.486494594607254, "correct_loss_per_token": 1.4344966888427735, "incorrect_loss_per_token": 2.1869652316101593, "correct_loss_uncond": -23.087793350219727, "incorrect_loss_uncond": -27.3302485148112}, "model_output": [{"sum_logits": -29.767704010009766, "num_tokens": 13, "num_tokens_all": 252, "is_greedy": false, "sum_logits_uncond": -54.03627014160156, "logits_per_token": -2.2898233853853664, "logits_per_char": -0.5222404212282415, "num_chars": 57}, {"sum_logits": -30.718631744384766, "num_tokens": 13, "num_tokens_all": 252, "is_greedy": false, "sum_logits_uncond": -57.23732376098633, "logits_per_token": -2.362971672644982, "logits_per_char": -0.5035841269571273, "num_chars": 61}, {"sum_logits": -21.5174503326416, "num_tokens": 15, "num_tokens_all": 254, "is_greedy": false, "sum_logits_uncond": -44.60524368286133, "logits_per_token": -1.4344966888427735, "logits_per_char": -0.32602197473699396, "num_chars": 66}, {"sum_logits": -28.621509552001953, "num_tokens": 15, "num_tokens_all": 254, "is_greedy": false, "sum_logits_uncond": -59.82499694824219, "logits_per_token": -1.9081006368001303, "logits_per_char": -0.43365923563639325, "num_chars": 66}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 459, "native_id": "MCAS_2001_5_2", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.62710189819336, "incorrect_loss_raw": 11.516771793365479, "correct_loss_per_char": 0.42992931459008193, "incorrect_loss_per_char": 0.38588734389736395, "correct_loss_per_token": 2.518157414027623, "incorrect_loss_per_token": 1.7640720397707017, "correct_loss_uncond": -19.629440307617188, "incorrect_loss_uncond": -16.75097354253133}, "model_output": [{"sum_logits": -12.153437614440918, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -27.789443969726562, "logits_per_token": -1.7362053734915597, "logits_per_char": -0.4190840556703765, "num_chars": 29}, {"sum_logits": -7.425693035125732, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -28.183494567871094, "logits_per_token": -1.0608132907322474, "logits_per_char": -0.2395384850040559, "num_chars": 31}, {"sum_logits": -14.971184730529785, "num_tokens": 6, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -28.830297470092773, "logits_per_token": -2.4951974550882974, "logits_per_char": -0.4990394910176595, "num_chars": 30}, {"sum_logits": -17.62710189819336, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -37.25654220581055, "logits_per_token": -2.518157414027623, "logits_per_char": -0.42992931459008193, "num_chars": 41}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 460, "native_id": "NYSEDREGENTS_2012_4_9", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.600384712219238, "incorrect_loss_raw": 8.982191721598307, "correct_loss_per_char": 0.5428846223013741, "incorrect_loss_per_char": 0.5717384842486163, "correct_loss_per_token": 3.800192356109619, "incorrect_loss_per_token": 3.8077361848619247, "correct_loss_uncond": -14.318892478942871, "incorrect_loss_uncond": -11.27410888671875}, "model_output": [{"sum_logits": -7.600384712219238, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -21.91927719116211, "logits_per_token": -3.800192356109619, "logits_per_char": -0.5428846223013741, "num_chars": 14}, {"sum_logits": -14.646100997924805, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -22.102397918701172, "logits_per_token": -7.323050498962402, "logits_per_char": -0.9764067331949869, "num_chars": 15}, {"sum_logits": -8.15208911895752, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -17.3248348236084, "logits_per_token": -2.7173630396525064, "logits_per_char": -0.47953465405632467, "num_chars": 17}, {"sum_logits": -4.148385047912598, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -21.3416690826416, "logits_per_token": -1.382795015970866, "logits_per_char": -0.25927406549453735, "num_chars": 16}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 461, "native_id": "Mercury_416593", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.2626497745513916, "incorrect_loss_raw": 4.597017685572307, "correct_loss_per_char": 0.18855414787928262, "incorrect_loss_per_char": 0.39846331042441885, "correct_loss_per_token": 2.2626497745513916, "incorrect_loss_per_token": 2.0288651386896768, "correct_loss_uncond": -15.23570990562439, "incorrect_loss_uncond": -11.391968965530396}, "model_output": [{"sum_logits": -5.982892990112305, "num_tokens": 2, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -15.526158332824707, "logits_per_token": -2.9914464950561523, "logits_per_char": -0.664765887790256, "num_chars": 9}, {"sum_logits": -2.2626497745513916, "num_tokens": 1, "num_tokens_all": 237, "is_greedy": true, "sum_logits_uncond": -17.49835968017578, "logits_per_token": -2.2626497745513916, "logits_per_char": -0.18855414787928262, "num_chars": 12}, {"sum_logits": -2.954573392868042, "num_tokens": 2, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -17.137733459472656, "logits_per_token": -1.477286696434021, "logits_per_char": -0.22727487637446478, "num_chars": 13}, {"sum_logits": -4.853586673736572, "num_tokens": 3, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -15.303068161010742, "logits_per_token": -1.6178622245788574, "logits_per_char": -0.30334916710853577, "num_chars": 16}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 462, "native_id": "Mercury_7205870", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 21.95395278930664, "incorrect_loss_raw": 25.074581782023113, "correct_loss_per_char": 0.7081920254615045, "incorrect_loss_per_char": 0.6525529412099891, "correct_loss_per_token": 4.390790557861328, "incorrect_loss_per_token": 3.7275595506032304, "correct_loss_uncond": -16.352657318115234, "incorrect_loss_uncond": -12.046995162963867}, "model_output": [{"sum_logits": -23.729473114013672, "num_tokens": 5, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -38.20364761352539, "logits_per_token": -4.7458946228027346, "logits_per_char": -0.6779849461146763, "num_chars": 35}, {"sum_logits": -21.95395278930664, "num_tokens": 5, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -38.306610107421875, "logits_per_token": -4.390790557861328, "logits_per_char": -0.7081920254615045, "num_chars": 31}, {"sum_logits": -21.394290924072266, "num_tokens": 8, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -35.988380432128906, "logits_per_token": -2.674286365509033, "logits_per_char": -0.5630076558966386, "num_chars": 38}, {"sum_logits": -30.0999813079834, "num_tokens": 8, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -37.17270278930664, "logits_per_token": -3.762497663497925, "logits_per_char": -0.7166662216186523, "num_chars": 42}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 463, "native_id": "Mercury_SC_401798", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 12.853199005126953, "incorrect_loss_raw": 20.73394775390625, "correct_loss_per_char": 0.2921181592074307, "incorrect_loss_per_char": 0.5614542237702791, "correct_loss_per_token": 1.6066498756408691, "incorrect_loss_per_token": 3.084294495759187, "correct_loss_uncond": -15.911430358886719, "incorrect_loss_uncond": -11.805979410807291}, "model_output": [{"sum_logits": -22.85793685913086, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -32.12934112548828, "logits_per_token": -3.8096561431884766, "logits_per_char": -0.7619312286376954, "num_chars": 30}, {"sum_logits": -19.290279388427734, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -27.255775451660156, "logits_per_token": -3.2150465647379556, "logits_per_char": -0.5213589023899388, "num_chars": 37}, {"sum_logits": -12.853199005126953, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -28.764629364013672, "logits_per_token": -1.6066498756408691, "logits_per_char": -0.2921181592074307, "num_chars": 44}, {"sum_logits": -20.053627014160156, "num_tokens": 9, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -38.23466491699219, "logits_per_token": -2.2281807793511286, "logits_per_char": -0.4010725402832031, "num_chars": 50}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 464, "native_id": "Mercury_7084228", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.473905563354492, "incorrect_loss_raw": 12.348658561706543, "correct_loss_per_char": 0.6727785027545431, "incorrect_loss_per_char": 0.7023833626195004, "correct_loss_per_token": 3.868476390838623, "incorrect_loss_per_token": 3.4967708587646484, "correct_loss_uncond": -17.69037437438965, "incorrect_loss_uncond": -8.496760368347168}, "model_output": [{"sum_logits": -14.745823860168457, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -18.966657638549805, "logits_per_token": -4.915274620056152, "logits_per_char": -0.9830549240112305, "num_chars": 15}, {"sum_logits": -6.910520553588867, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.74396514892578, "logits_per_token": -1.7276301383972168, "logits_per_char": -0.3141145706176758, "num_chars": 22}, {"sum_logits": -15.389631271362305, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -25.825634002685547, "logits_per_token": -3.847407817840576, "logits_per_char": -0.809980593229595, "num_chars": 19}, {"sum_logits": -15.473905563354492, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.16427993774414, "logits_per_token": -3.868476390838623, "logits_per_char": -0.6727785027545431, "num_chars": 23}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 465, "native_id": "Mercury_417460", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 20.69582748413086, "incorrect_loss_raw": 15.446554183959961, "correct_loss_per_char": 1.1497681935628254, "incorrect_loss_per_char": 0.6405822037242781, "correct_loss_per_token": 3.4493045806884766, "incorrect_loss_per_token": 2.3857137538768627, "correct_loss_uncond": -4.289012908935547, "incorrect_loss_uncond": -12.510000228881836}, "model_output": [{"sum_logits": -20.69582748413086, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -24.984840393066406, "logits_per_token": -3.4493045806884766, "logits_per_char": -1.1497681935628254, "num_chars": 18}, {"sum_logits": -8.720237731933594, "num_tokens": 4, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -23.19086456298828, "logits_per_token": -2.1800594329833984, "logits_per_char": -0.43601188659667967, "num_chars": 20}, {"sum_logits": -14.348623275756836, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -24.65164566040039, "logits_per_token": -2.391437212626139, "logits_per_char": -0.6238531859024711, "num_chars": 23}, {"sum_logits": -23.270801544189453, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -36.02715301513672, "logits_per_token": -2.5856446160210504, "logits_per_char": -0.8618815386736834, "num_chars": 27}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 466, "native_id": "Mercury_402539", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 21.14922332763672, "incorrect_loss_raw": 27.42796516418457, "correct_loss_per_char": 0.3411165052844632, "incorrect_loss_per_char": 0.45372073272425517, "correct_loss_per_token": 1.9226566661487927, "incorrect_loss_per_token": 2.6312825735971983, "correct_loss_uncond": -24.272537231445312, "incorrect_loss_uncond": -18.238593419392902}, "model_output": [{"sum_logits": -26.673067092895508, "num_tokens": 11, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -47.45568084716797, "logits_per_token": -2.424824281172319, "logits_per_char": -0.43726339496550015, "num_chars": 61}, {"sum_logits": -21.14922332763672, "num_tokens": 11, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -45.42176055908203, "logits_per_token": -1.9226566661487927, "logits_per_char": -0.3411165052844632, "num_chars": 62}, {"sum_logits": -25.558469772338867, "num_tokens": 12, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -41.10528564453125, "logits_per_token": -2.129872481028239, "logits_per_char": -0.38724954200513434, "num_chars": 66}, {"sum_logits": -30.052358627319336, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -48.4387092590332, "logits_per_token": -3.3391509585910373, "logits_per_char": -0.536649261202131, "num_chars": 56}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 467, "native_id": "Mercury_406800", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.134754180908203, "incorrect_loss_raw": 21.712391535441082, "correct_loss_per_char": 0.49012849137589737, "incorrect_loss_per_char": 0.5721486023262436, "correct_loss_per_token": 2.590679168701172, "incorrect_loss_per_token": 3.3889895938691637, "correct_loss_uncond": -18.492366790771484, "incorrect_loss_uncond": -17.37811342875163}, "model_output": [{"sum_logits": -22.320594787597656, "num_tokens": 8, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -40.539852142333984, "logits_per_token": -2.790074348449707, "logits_per_char": -0.4749062720765459, "num_chars": 47}, {"sum_logits": -18.134754180908203, "num_tokens": 7, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -36.62712097167969, "logits_per_token": -2.590679168701172, "logits_per_char": -0.49012849137589737, "num_chars": 37}, {"sum_logits": -20.76237678527832, "num_tokens": 7, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -38.49036407470703, "logits_per_token": -2.9660538264683316, "logits_per_char": -0.5063994337872761, "num_chars": 41}, {"sum_logits": -22.054203033447266, "num_tokens": 5, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -38.24129867553711, "logits_per_token": -4.410840606689453, "logits_per_char": -0.7351401011149089, "num_chars": 30}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 468, "native_id": "Mercury_SC_408321", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.029281616210938, "incorrect_loss_raw": 13.878218332926432, "correct_loss_per_char": 0.30636893378363717, "incorrect_loss_per_char": 0.4919106233587245, "correct_loss_per_token": 1.5756116594587053, "incorrect_loss_per_token": 2.468911404079861, "correct_loss_uncond": -13.076671600341797, "incorrect_loss_uncond": -9.570542017618815}, "model_output": [{"sum_logits": -11.029281616210938, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -24.105953216552734, "logits_per_token": -1.5756116594587053, "logits_per_char": -0.30636893378363717, "num_chars": 36}, {"sum_logits": -13.891918182373047, "num_tokens": 6, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -24.921234130859375, "logits_per_token": -2.3153196970621743, "logits_per_char": -0.47903166146113957, "num_chars": 29}, {"sum_logits": -13.713985443115234, "num_tokens": 6, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -22.288516998291016, "logits_per_token": -2.2856642405192056, "logits_per_char": -0.45713284810384114, "num_chars": 30}, {"sum_logits": -14.028751373291016, "num_tokens": 5, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -23.13652992248535, "logits_per_token": -2.805750274658203, "logits_per_char": -0.5395673605111929, "num_chars": 26}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 469, "native_id": "Mercury_SC_406836", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.186355590820312, "incorrect_loss_raw": 28.239311854044598, "correct_loss_per_char": 0.6708079020182292, "incorrect_loss_per_char": 0.620876068964737, "correct_loss_per_token": 4.312336512974331, "incorrect_loss_per_token": 2.6528470877445103, "correct_loss_uncond": -13.892650604248047, "incorrect_loss_uncond": -17.179611841837566}, "model_output": [{"sum_logits": -29.413867950439453, "num_tokens": 11, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -48.950050354003906, "logits_per_token": -2.6739879954944956, "logits_per_char": -0.6684969988736239, "num_chars": 44}, {"sum_logits": -27.04388427734375, "num_tokens": 11, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -46.285118103027344, "logits_per_token": -2.458534934303977, "logits_per_char": -0.6289275413335755, "num_chars": 43}, {"sum_logits": -30.186355590820312, "num_tokens": 7, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -44.07900619506836, "logits_per_token": -4.312336512974331, "logits_per_char": -0.6708079020182292, "num_chars": 45}, {"sum_logits": -28.260183334350586, "num_tokens": 10, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -41.021602630615234, "logits_per_token": -2.8260183334350586, "logits_per_char": -0.5652036666870117, "num_chars": 50}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 470, "native_id": "Mercury_SC_410963", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 18.14591407775879, "incorrect_loss_raw": 17.586137135823567, "correct_loss_per_char": 0.4320455732799712, "incorrect_loss_per_char": 0.5304630573597776, "correct_loss_per_token": 1.814591407775879, "incorrect_loss_per_token": 2.1653531392415366, "correct_loss_uncond": -13.692060470581055, "incorrect_loss_uncond": -11.34423573811849}, "model_output": [{"sum_logits": -13.492752075195312, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -26.43431282043457, "logits_per_token": -1.9275360107421875, "logits_per_char": -0.49973155834056715, "num_chars": 27}, {"sum_logits": -14.979007720947266, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -26.456018447875977, "logits_per_token": -2.1398582458496094, "logits_per_char": -0.49930025736490885, "num_chars": 30}, {"sum_logits": -24.286651611328125, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -33.900787353515625, "logits_per_token": -2.4286651611328125, "logits_per_char": -0.5923573563738567, "num_chars": 41}, {"sum_logits": -18.14591407775879, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -31.837974548339844, "logits_per_token": -1.814591407775879, "logits_per_char": -0.4320455732799712, "num_chars": 42}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 471, "native_id": "Mercury_7132405", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.014524459838867, "incorrect_loss_raw": 18.083678881327312, "correct_loss_per_char": 0.4852886199951172, "incorrect_loss_per_char": 0.624911001750401, "correct_loss_per_token": 2.6690874099731445, "incorrect_loss_per_token": 2.8204777240753174, "correct_loss_uncond": -16.404211044311523, "incorrect_loss_uncond": -11.955263455708822}, "model_output": [{"sum_logits": -15.527539253234863, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -26.835817337036133, "logits_per_token": -2.587923208872477, "logits_per_char": -0.5545549733298165, "num_chars": 28}, {"sum_logits": -24.793746948242188, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -36.013160705566406, "logits_per_token": -4.132291158040364, "logits_per_char": -0.9536056518554688, "num_chars": 26}, {"sum_logits": -13.929750442504883, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -27.26784896850586, "logits_per_token": -1.7412188053131104, "logits_per_char": -0.36657238006591797, "num_chars": 38}, {"sum_logits": -16.014524459838867, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -32.41873550415039, "logits_per_token": -2.6690874099731445, "logits_per_char": -0.4852886199951172, "num_chars": 33}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 472, "native_id": "Mercury_SC_408872", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.446263313293457, "incorrect_loss_raw": 17.5938237508138, "correct_loss_per_char": 0.5171639735882099, "incorrect_loss_per_char": 0.696365458368021, "correct_loss_per_token": 1.9208947590419225, "incorrect_loss_per_token": 3.327869923909505, "correct_loss_uncond": -12.016951560974121, "incorrect_loss_uncond": -9.808921178181967}, "model_output": [{"sum_logits": -17.18053436279297, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -27.615020751953125, "logits_per_token": -2.863422393798828, "logits_per_char": -0.7469797549040421, "num_chars": 23}, {"sum_logits": -16.48607063293457, "num_tokens": 5, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -28.19525146484375, "logits_per_token": -3.297214126586914, "logits_per_char": -0.6594428253173829, "num_chars": 25}, {"sum_logits": -13.446263313293457, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -25.463214874267578, "logits_per_token": -1.9208947590419225, "logits_per_char": -0.5171639735882099, "num_chars": 26}, {"sum_logits": -19.114866256713867, "num_tokens": 5, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -26.39796257019043, "logits_per_token": -3.8229732513427734, "logits_per_char": -0.6826737948826381, "num_chars": 28}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 473, "native_id": "VASoL_2008_3_25", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.594388484954834, "incorrect_loss_raw": 8.552146911621094, "correct_loss_per_char": 0.3496492803096771, "incorrect_loss_per_char": 0.4938193383040252, "correct_loss_per_token": 1.8647961616516113, "incorrect_loss_per_token": 2.1380367279052734, "correct_loss_uncond": -16.50644540786743, "incorrect_loss_uncond": -9.743278503417969}, "model_output": [{"sum_logits": -8.078428268432617, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -16.374767303466797, "logits_per_token": -2.0196070671081543, "logits_per_char": -0.5049017667770386, "num_chars": 16}, {"sum_logits": -5.594388484954834, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -22.100833892822266, "logits_per_token": -1.8647961616516113, "logits_per_char": -0.3496492803096771, "num_chars": 16}, {"sum_logits": -7.348724365234375, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -17.025527954101562, "logits_per_token": -1.8371810913085938, "logits_per_char": -0.4082624647352431, "num_chars": 18}, {"sum_logits": -10.229288101196289, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.485980987548828, "logits_per_token": -2.5573220252990723, "logits_per_char": -0.5682937833997939, "num_chars": 18}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 474, "native_id": "WASL_2005_8_12", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 20.4295654296875, "incorrect_loss_raw": 32.722238540649414, "correct_loss_per_char": 0.24613934252635541, "incorrect_loss_per_char": 0.4911259968811601, "correct_loss_per_token": 1.2017391429227942, "incorrect_loss_per_token": 2.3528631422254773, "correct_loss_uncond": -42.03787612915039, "incorrect_loss_uncond": -23.92266019185384}, "model_output": [{"sum_logits": -20.4295654296875, "num_tokens": 17, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -62.46744155883789, "logits_per_token": -1.2017391429227942, "logits_per_char": -0.24613934252635541, "num_chars": 83}, {"sum_logits": -25.514047622680664, "num_tokens": 15, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -57.14888000488281, "logits_per_token": -1.700936508178711, "logits_per_char": -0.3865764791315252, "num_chars": 66}, {"sum_logits": -41.80416488647461, "num_tokens": 15, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -63.04781723022461, "logits_per_token": -2.7869443257649738, "logits_per_char": -0.5726597929654056, "num_chars": 73}, {"sum_logits": -30.84850311279297, "num_tokens": 12, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -49.737998962402344, "logits_per_token": -2.5707085927327475, "logits_per_char": -0.5141417185465494, "num_chars": 60}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 475, "native_id": "AKDE&ED_2012_8_20", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 11.425858497619629, "incorrect_loss_raw": 16.229352951049805, "correct_loss_per_char": 0.5712929248809815, "incorrect_loss_per_char": 0.7315365971059092, "correct_loss_per_token": 2.8564646244049072, "incorrect_loss_per_token": 3.272201659187438, "correct_loss_uncond": -15.083565711975098, "incorrect_loss_uncond": -17.636660893758137}, "model_output": [{"sum_logits": -12.643606185913086, "num_tokens": 3, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -25.72171401977539, "logits_per_token": -4.214535395304362, "logits_per_char": -0.9031147275652204, "num_chars": 14}, {"sum_logits": -11.425858497619629, "num_tokens": 4, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -26.509424209594727, "logits_per_token": -2.8564646244049072, "logits_per_char": -0.5712929248809815, "num_chars": 20}, {"sum_logits": -19.020206451416016, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -27.551010131835938, "logits_per_token": -3.170034408569336, "logits_per_char": -0.7044520907931857, "num_chars": 27}, {"sum_logits": -17.024246215820312, "num_tokens": 7, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -48.3253173828125, "logits_per_token": -2.432035173688616, "logits_per_char": -0.5870429729593212, "num_chars": 29}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 476, "native_id": "Mercury_7056823", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.104754447937012, "incorrect_loss_raw": 11.563414891560873, "correct_loss_per_char": 0.42710594030526966, "incorrect_loss_per_char": 0.7300307392004525, "correct_loss_per_token": 2.776188611984253, "incorrect_loss_per_token": 3.2111564212375217, "correct_loss_uncond": -15.377171516418457, "incorrect_loss_uncond": -12.242077509562174}, "model_output": [{"sum_logits": -10.405549049377441, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -20.78760528564453, "logits_per_token": -3.4685163497924805, "logits_per_char": -0.7432535035269601, "num_chars": 14}, {"sum_logits": -11.104754447937012, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -26.48192596435547, "logits_per_token": -2.776188611984253, "logits_per_char": -0.42710594030526966, "num_chars": 26}, {"sum_logits": -14.474592208862305, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -27.323299407958984, "logits_per_token": -2.894918441772461, "logits_per_char": -0.6293300960374915, "num_chars": 23}, {"sum_logits": -9.810103416442871, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -23.305572509765625, "logits_per_token": -3.2700344721476235, "logits_per_char": -0.8175086180369059, "num_chars": 12}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 477, "native_id": "Mercury_7205800", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 46.342796325683594, "incorrect_loss_raw": 38.060089111328125, "correct_loss_per_char": 0.7241061925888062, "incorrect_loss_per_char": 0.5256638965567482, "correct_loss_per_token": 4.63427963256836, "incorrect_loss_per_token": 3.628791854036376, "correct_loss_uncond": -5.379695892333984, "incorrect_loss_uncond": -19.741814931233723}, "model_output": [{"sum_logits": -38.683082580566406, "num_tokens": 11, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -63.41490173339844, "logits_per_token": -3.5166438709605825, "logits_per_char": -0.5951243473933293, "num_chars": 65}, {"sum_logits": -46.342796325683594, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -51.72249221801758, "logits_per_token": -4.63427963256836, "logits_per_char": -0.7241061925888062, "num_chars": 64}, {"sum_logits": -38.81878662109375, "num_tokens": 9, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -49.301368713378906, "logits_per_token": -4.313198513454861, "logits_per_char": -0.5175838216145834, "num_chars": 75}, {"sum_logits": -36.67839813232422, "num_tokens": 12, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -60.6894416809082, "logits_per_token": -3.056533177693685, "logits_per_char": -0.46428352066233186, "num_chars": 79}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 478, "native_id": "Mercury_SC_402282", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.3117799758911133, "incorrect_loss_raw": 6.5363186200459795, "correct_loss_per_char": 0.2759816646575928, "incorrect_loss_per_char": 0.3644856298000206, "correct_loss_per_token": 1.6558899879455566, "incorrect_loss_per_token": 2.290592670440674, "correct_loss_uncond": -13.031886100769043, "incorrect_loss_uncond": -10.34674628575643}, "model_output": [{"sum_logits": -3.3117799758911133, "num_tokens": 2, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -16.343666076660156, "logits_per_token": -1.6558899879455566, "logits_per_char": -0.2759816646575928, "num_chars": 12}, {"sum_logits": -2.01275634765625, "num_tokens": 2, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -15.160932540893555, "logits_per_token": -1.006378173828125, "logits_per_char": -0.15482741135817307, "num_chars": 13}, {"sum_logits": -13.316509246826172, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -18.368919372558594, "logits_per_token": -4.438836415608724, "logits_per_char": -0.7008689077276933, "num_chars": 19}, {"sum_logits": -4.279690265655518, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -17.119342803955078, "logits_per_token": -1.4265634218851726, "logits_per_char": -0.23776057031419542, "num_chars": 18}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 479, "native_id": "MCAS_1998_8_26", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.558493614196777, "incorrect_loss_raw": 15.917376200358072, "correct_loss_per_char": 0.5984036005460299, "incorrect_loss_per_char": 0.5453579202633158, "correct_loss_per_token": 3.8896234035491943, "incorrect_loss_per_token": 2.702541987101237, "correct_loss_uncond": -17.108803749084473, "incorrect_loss_uncond": -12.901702245076498}, "model_output": [{"sum_logits": -13.026222229003906, "num_tokens": 4, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -26.302642822265625, "logits_per_token": -3.2565555572509766, "logits_per_char": -0.6202962966192336, "num_chars": 21}, {"sum_logits": -22.477935791015625, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -33.56047058105469, "logits_per_token": -2.809741973876953, "logits_per_char": -0.6075117781355575, "num_chars": 37}, {"sum_logits": -15.558493614196777, "num_tokens": 4, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -32.66729736328125, "logits_per_token": -3.8896234035491943, "logits_per_char": -0.5984036005460299, "num_chars": 26}, {"sum_logits": -12.247970581054688, "num_tokens": 6, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -26.5941219329834, "logits_per_token": -2.0413284301757812, "logits_per_char": -0.40826568603515623, "num_chars": 30}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 480, "native_id": "Mercury_7230318", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.471408367156982, "incorrect_loss_raw": 7.9255744616190595, "correct_loss_per_char": 0.22357041835784913, "incorrect_loss_per_char": 0.45804407497896343, "correct_loss_per_token": 2.235704183578491, "incorrect_loss_per_token": 3.9627872308095298, "correct_loss_uncond": -10.272650241851807, "incorrect_loss_uncond": -9.40058978398641}, "model_output": [{"sum_logits": -0.5843377113342285, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": true, "sum_logits_uncond": -14.031314849853516, "logits_per_token": -0.29216885566711426, "logits_per_char": -0.0389558474222819, "num_chars": 15}, {"sum_logits": -8.898970603942871, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -15.87733268737793, "logits_per_token": -4.4494853019714355, "logits_per_char": -0.49438725577460396, "num_chars": 18}, {"sum_logits": -14.293415069580078, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -22.06984519958496, "logits_per_token": -7.146707534790039, "logits_per_char": -0.8407891217400046, "num_chars": 17}, {"sum_logits": -4.471408367156982, "num_tokens": 2, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -14.744058609008789, "logits_per_token": -2.235704183578491, "logits_per_char": -0.22357041835784913, "num_chars": 20}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 481, "native_id": "Mercury_SC_416167", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.421854019165039, "incorrect_loss_raw": 3.847980340321859, "correct_loss_per_char": 0.6843708038330079, "incorrect_loss_per_char": 0.5744411283069187, "correct_loss_per_token": 3.421854019165039, "incorrect_loss_per_token": 3.847980340321859, "correct_loss_uncond": -9.325366973876953, "incorrect_loss_uncond": -9.032578309377035}, "model_output": [{"sum_logits": -4.81600284576416, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -13.092914581298828, "logits_per_token": -4.81600284576416, "logits_per_char": -0.60200035572052, "num_chars": 8}, {"sum_logits": -3.0049822330474854, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -12.46006965637207, "logits_per_token": -3.0049822330474854, "logits_per_char": -0.5008303721745809, "num_chars": 6}, {"sum_logits": -3.421854019165039, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -12.747220993041992, "logits_per_token": -3.421854019165039, "logits_per_char": -0.6843708038330079, "num_chars": 5}, {"sum_logits": -3.7229559421539307, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -13.088691711425781, "logits_per_token": -3.7229559421539307, "logits_per_char": -0.6204926570256551, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 482, "native_id": "Mercury_7027720", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.049827575683594, "incorrect_loss_raw": 18.69350306193034, "correct_loss_per_char": 0.5732081277029855, "incorrect_loss_per_char": 0.6067850921893941, "correct_loss_per_token": 2.674971262613932, "incorrect_loss_per_token": 2.5720245623714706, "correct_loss_uncond": -15.525522232055664, "incorrect_loss_uncond": -12.758125305175781}, "model_output": [{"sum_logits": -15.80679702758789, "num_tokens": 6, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -29.300525665283203, "logits_per_token": -2.6344661712646484, "logits_per_char": -0.6322718811035156, "num_chars": 25}, {"sum_logits": -16.049827575683594, "num_tokens": 6, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -31.575349807739258, "logits_per_token": -2.674971262613932, "logits_per_char": -0.5732081277029855, "num_chars": 28}, {"sum_logits": -19.11264419555664, "num_tokens": 7, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -31.690223693847656, "logits_per_token": -2.730377742222377, "logits_per_char": -0.6590566963985048, "num_chars": 29}, {"sum_logits": -21.161067962646484, "num_tokens": 9, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -33.3641357421875, "logits_per_token": -2.351229773627387, "logits_per_char": -0.5290266990661621, "num_chars": 40}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 483, "native_id": "LEAP__5_10312", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 8.476057052612305, "incorrect_loss_raw": 10.670599460601807, "correct_loss_per_char": 0.353169043858846, "incorrect_loss_per_char": 0.491639268586207, "correct_loss_per_token": 1.412676175435384, "incorrect_loss_per_token": 2.0488946967654758, "correct_loss_uncond": -13.877386093139648, "incorrect_loss_uncond": -11.872782548268637}, "model_output": [{"sum_logits": -7.670267581939697, "num_tokens": 6, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -21.1864013671875, "logits_per_token": -1.278377930323283, "logits_per_char": -0.31959448258082074, "num_chars": 24}, {"sum_logits": -8.476057052612305, "num_tokens": 6, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -22.353443145751953, "logits_per_token": -1.412676175435384, "logits_per_char": -0.353169043858846, "num_chars": 24}, {"sum_logits": -13.744729042053223, "num_tokens": 5, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -23.939842224121094, "logits_per_token": -2.7489458084106446, "logits_per_char": -0.5975969148718793, "num_chars": 23}, {"sum_logits": -10.5968017578125, "num_tokens": 5, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -22.503902435302734, "logits_per_token": -2.1193603515625, "logits_per_char": -0.557726408305921, "num_chars": 19}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 484, "native_id": "Mercury_405161", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 9.667799949645996, "incorrect_loss_raw": 12.275507291158041, "correct_loss_per_char": 0.43944545225663617, "incorrect_loss_per_char": 0.6387127968302945, "correct_loss_per_token": 2.416949987411499, "incorrect_loss_per_token": 2.3617920239766437, "correct_loss_uncond": -18.572470664978027, "incorrect_loss_uncond": -13.822260220845541}, "model_output": [{"sum_logits": -13.515939712524414, "num_tokens": 5, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -26.96503257751465, "logits_per_token": -2.703187942504883, "logits_per_char": -0.7113652480276007, "num_chars": 19}, {"sum_logits": -14.91273307800293, "num_tokens": 5, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -29.20635986328125, "logits_per_token": -2.9825466156005858, "logits_per_char": -0.7848806883159437, "num_chars": 19}, {"sum_logits": -8.397849082946777, "num_tokens": 6, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -22.121910095214844, "logits_per_token": -1.399641513824463, "logits_per_char": -0.4198924541473389, "num_chars": 20}, {"sum_logits": -9.667799949645996, "num_tokens": 4, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -28.240270614624023, "logits_per_token": -2.416949987411499, "logits_per_char": -0.43944545225663617, "num_chars": 22}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 485, "native_id": "Mercury_SC_409245", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 23.494792938232422, "incorrect_loss_raw": 13.885808308919271, "correct_loss_per_char": 0.41218934979355126, "incorrect_loss_per_char": 0.41592776246846014, "correct_loss_per_token": 2.1358902671120386, "incorrect_loss_per_token": 2.165434413486057, "correct_loss_uncond": -16.370201110839844, "incorrect_loss_uncond": -12.999757766723633}, "model_output": [{"sum_logits": -9.354864120483398, "num_tokens": 6, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -22.629432678222656, "logits_per_token": -1.5591440200805664, "logits_per_char": -0.3341022900172642, "num_chars": 28}, {"sum_logits": -13.54532241821289, "num_tokens": 6, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -26.974332809448242, "logits_per_token": -2.257553736368815, "logits_per_char": -0.4670800833866514, "num_chars": 29}, {"sum_logits": -18.757238388061523, "num_tokens": 7, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -31.052932739257812, "logits_per_token": -2.679605484008789, "logits_per_char": -0.44660091400146484, "num_chars": 42}, {"sum_logits": -23.494792938232422, "num_tokens": 11, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -39.864994049072266, "logits_per_token": -2.1358902671120386, "logits_per_char": -0.41218934979355126, "num_chars": 57}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 486, "native_id": "ACTAAP_2011_5_8", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 6.608781814575195, "incorrect_loss_raw": 5.767828464508057, "correct_loss_per_char": 0.5083678318903997, "incorrect_loss_per_char": 0.48667914602491597, "correct_loss_per_token": 3.3043909072875977, "incorrect_loss_per_token": 3.6729880968729653, "correct_loss_uncond": -10.127338409423828, "incorrect_loss_uncond": -9.35164467493693}, "model_output": [{"sum_logits": -3.952345848083496, "num_tokens": 1, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -12.413293838500977, "logits_per_token": -3.952345848083496, "logits_per_char": -0.43914953867594403, "num_chars": 9}, {"sum_logits": -3.9243578910827637, "num_tokens": 1, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -13.756929397583008, "logits_per_token": -3.9243578910827637, "logits_per_char": -0.39243578910827637, "num_chars": 10}, {"sum_logits": -6.608781814575195, "num_tokens": 2, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -16.736120223999023, "logits_per_token": -3.3043909072875977, "logits_per_char": -0.5083678318903997, "num_chars": 13}, {"sum_logits": -9.42678165435791, "num_tokens": 3, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -19.188196182250977, "logits_per_token": -3.1422605514526367, "logits_per_char": -0.6284521102905274, "num_chars": 15}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 487, "native_id": "Mercury_7223370", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.583137035369873, "incorrect_loss_raw": 8.24897050857544, "correct_loss_per_char": 0.5728921294212341, "incorrect_loss_per_char": 1.017289261338572, "correct_loss_per_token": 4.583137035369873, "incorrect_loss_per_token": 5.860954205195109, "correct_loss_uncond": -11.01296854019165, "incorrect_loss_uncond": -6.432167847951253}, "model_output": [{"sum_logits": -9.366683959960938, "num_tokens": 2, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -16.218828201293945, "logits_per_token": -4.683341979980469, "logits_per_char": -1.040742662217882, "num_chars": 9}, {"sum_logits": -10.418813705444336, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -12.891420364379883, "logits_per_token": -10.418813705444336, "logits_per_char": -1.302351713180542, "num_chars": 8}, {"sum_logits": -4.961413860321045, "num_tokens": 2, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -14.93316650390625, "logits_per_token": -2.4807069301605225, "logits_per_char": -0.7087734086172921, "num_chars": 7}, {"sum_logits": -4.583137035369873, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -15.596105575561523, "logits_per_token": -4.583137035369873, "logits_per_char": -0.5728921294212341, "num_chars": 8}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 488, "native_id": "Mercury_SC_400697", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.915231704711914, "incorrect_loss_raw": 24.4540220896403, "correct_loss_per_char": 0.375894037882487, "incorrect_loss_per_char": 0.5402391051470419, "correct_loss_per_token": 1.8794701894124348, "incorrect_loss_per_token": 2.8746358417329336, "correct_loss_uncond": -19.69569969177246, "incorrect_loss_uncond": -13.765334447224935}, "model_output": [{"sum_logits": -17.59218406677246, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -33.16346740722656, "logits_per_token": -2.513169152396066, "logits_per_char": -0.4629522122834858, "num_chars": 38}, {"sum_logits": -16.915231704711914, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -36.610931396484375, "logits_per_token": -1.8794701894124348, "logits_per_char": -0.375894037882487, "num_chars": 45}, {"sum_logits": -21.350006103515625, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -34.33848571777344, "logits_per_token": -2.668750762939453, "logits_per_char": -0.5083334786551339, "num_chars": 42}, {"sum_logits": -34.41987609863281, "num_tokens": 10, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -47.1561164855957, "logits_per_token": -3.4419876098632813, "logits_per_char": -0.6494316245025059, "num_chars": 53}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 489, "native_id": "Mercury_SC_401262", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.40372085571289, "incorrect_loss_raw": 11.332596778869629, "correct_loss_per_char": 0.4728964025324041, "incorrect_loss_per_char": 0.7290234178559393, "correct_loss_per_token": 5.201860427856445, "incorrect_loss_per_token": 5.6662983894348145, "correct_loss_uncond": -8.024063110351562, "incorrect_loss_uncond": -5.116362889607747}, "model_output": [{"sum_logits": -11.543392181396484, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -17.41792869567871, "logits_per_token": -5.771696090698242, "logits_per_char": -0.8879532447228065, "num_chars": 13}, {"sum_logits": -10.583826065063477, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -16.37534523010254, "logits_per_token": -5.291913032531738, "logits_per_char": -0.7055884043375651, "num_chars": 15}, {"sum_logits": -11.870572090148926, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -15.553605079650879, "logits_per_token": -5.935286045074463, "logits_per_char": -0.5935286045074463, "num_chars": 20}, {"sum_logits": -10.40372085571289, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -18.427783966064453, "logits_per_token": -5.201860427856445, "logits_per_char": -0.4728964025324041, "num_chars": 22}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 490, "native_id": "Mercury_7136063", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.8809967041015625, "incorrect_loss_raw": 8.991739273071289, "correct_loss_per_char": 0.37528555733816965, "incorrect_loss_per_char": 0.41443129304634274, "correct_loss_per_token": 2.6269989013671875, "incorrect_loss_per_token": 2.3063462363349068, "correct_loss_uncond": -15.449703216552734, "incorrect_loss_uncond": -14.583120981852213}, "model_output": [{"sum_logits": -9.952590942382812, "num_tokens": 5, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -27.079364776611328, "logits_per_token": -1.9905181884765626, "logits_per_char": -0.36861447934751157, "num_chars": 27}, {"sum_logits": -7.8809967041015625, "num_tokens": 3, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -23.330699920654297, "logits_per_token": -2.6269989013671875, "logits_per_char": -0.37528555733816965, "num_chars": 21}, {"sum_logits": -8.074365615844727, "num_tokens": 3, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -21.078752517700195, "logits_per_token": -2.6914552052815757, "logits_per_char": -0.4037182807922363, "num_chars": 20}, {"sum_logits": -8.948261260986328, "num_tokens": 4, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -22.566463470458984, "logits_per_token": -2.237065315246582, "logits_per_char": -0.47096111899928045, "num_chars": 19}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 491, "native_id": "Mercury_405876", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 20.565608978271484, "incorrect_loss_raw": 11.928536732991537, "correct_loss_per_char": 0.6232002720688329, "incorrect_loss_per_char": 0.4160484151002244, "correct_loss_per_token": 3.4276014963785806, "incorrect_loss_per_token": 1.7998109015207442, "correct_loss_uncond": -13.694332122802734, "incorrect_loss_uncond": -15.873753229777018}, "model_output": [{"sum_logits": -12.062512397766113, "num_tokens": 6, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -25.899490356445312, "logits_per_token": -2.010418732961019, "logits_per_char": -0.5482960180802778, "num_chars": 22}, {"sum_logits": -16.96786880493164, "num_tokens": 7, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -31.824871063232422, "logits_per_token": -2.423981257847377, "logits_per_char": -0.45859104878193624, "num_chars": 37}, {"sum_logits": -6.7552289962768555, "num_tokens": 7, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -25.68250846862793, "logits_per_token": -0.9650327137538365, "logits_per_char": -0.24125817843845912, "num_chars": 28}, {"sum_logits": -20.565608978271484, "num_tokens": 6, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -34.25994110107422, "logits_per_token": -3.4276014963785806, "logits_per_char": -0.6232002720688329, "num_chars": 33}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 492, "native_id": "Mercury_7057890", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.346717834472656, "incorrect_loss_raw": 17.21532980600993, "correct_loss_per_char": 0.7644465764363607, "incorrect_loss_per_char": 0.47839084095597095, "correct_loss_per_token": 3.6693435668945313, "incorrect_loss_per_token": 2.5562127506922163, "correct_loss_uncond": -9.45260238647461, "incorrect_loss_uncond": -5.0173845291137695}, "model_output": [{"sum_logits": -18.346717834472656, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -27.799320220947266, "logits_per_token": -3.6693435668945313, "logits_per_char": -0.7644465764363607, "num_chars": 24}, {"sum_logits": -17.66090202331543, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -23.0601806640625, "logits_per_token": -2.5229860033307756, "logits_per_char": -0.5351788491913767, "num_chars": 33}, {"sum_logits": -21.778217315673828, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -27.63882064819336, "logits_per_token": -3.1111739022391185, "logits_per_char": -0.5185289837065197, "num_chars": 42}, {"sum_logits": -12.206870079040527, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -15.999141693115234, "logits_per_token": -2.0344783465067544, "logits_per_char": -0.3814646899700165, "num_chars": 32}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 493, "native_id": "LEAP_2002_4_10247", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 8.684979438781738, "incorrect_loss_raw": 13.28084945678711, "correct_loss_per_char": 0.3216659051400644, "incorrect_loss_per_char": 0.4363941131453184, "correct_loss_per_token": 1.4474965731302898, "incorrect_loss_per_token": 1.8972642081124442, "correct_loss_uncond": -10.89928913116455, "incorrect_loss_uncond": -7.276936848958333}, "model_output": [{"sum_logits": -8.60682487487793, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -18.457138061523438, "logits_per_token": -1.229546410696847, "logits_per_char": -0.27763951209283644, "num_chars": 31}, {"sum_logits": -8.684979438781738, "num_tokens": 6, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -19.58426856994629, "logits_per_token": -1.4474965731302898, "logits_per_char": -0.3216659051400644, "num_chars": 27}, {"sum_logits": -17.145254135131836, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -22.11202621459961, "logits_per_token": -2.449322019304548, "logits_per_char": -0.5912156598321323, "num_chars": 29}, {"sum_logits": -14.090469360351562, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -21.10419464111328, "logits_per_token": -2.0129241943359375, "logits_per_char": -0.44032716751098633, "num_chars": 32}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 494, "native_id": "Mercury_SC_405481", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.74399185180664, "incorrect_loss_raw": 13.720505714416504, "correct_loss_per_char": 0.40599966049194336, "incorrect_loss_per_char": 0.7574569525542083, "correct_loss_per_token": 1.948798370361328, "incorrect_loss_per_token": 2.7891805436876087, "correct_loss_uncond": -19.336515426635742, "incorrect_loss_uncond": -10.992461840311686}, "model_output": [{"sum_logits": -13.70193862915039, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -20.71100616455078, "logits_per_token": -3.4254846572875977, "logits_per_char": -0.9134625752766927, "num_chars": 15}, {"sum_logits": -10.96381664276123, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -23.56667709350586, "logits_per_token": -2.192763328552246, "logits_per_char": -0.6091009245978461, "num_chars": 18}, {"sum_logits": -9.74399185180664, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -29.080507278442383, "logits_per_token": -1.948798370361328, "logits_per_char": -0.40599966049194336, "num_chars": 24}, {"sum_logits": -16.49576187133789, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -29.86121940612793, "logits_per_token": -2.749293645222982, "logits_per_char": -0.7498073577880859, "num_chars": 22}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 495, "native_id": "Mercury_SC_400401", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.428943634033203, "incorrect_loss_raw": 17.43845208485921, "correct_loss_per_char": 0.8012584188710088, "incorrect_loss_per_char": 0.794529086748759, "correct_loss_per_token": 3.0714906056722007, "incorrect_loss_per_token": 4.092673810323079, "correct_loss_uncond": -15.84200668334961, "incorrect_loss_uncond": -10.039334932963053}, "model_output": [{"sum_logits": -21.23931884765625, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.553386688232422, "logits_per_token": -4.24786376953125, "logits_per_char": -1.11785888671875, "num_chars": 19}, {"sum_logits": -18.428943634033203, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -34.27095031738281, "logits_per_token": -3.0714906056722007, "logits_per_char": -0.8012584188710088, "num_chars": 23}, {"sum_logits": -17.463911056518555, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -31.125526428222656, "logits_per_token": -3.492782211303711, "logits_per_char": -0.6985564422607422, "num_chars": 25}, {"sum_logits": -13.612126350402832, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -22.75444793701172, "logits_per_token": -4.537375450134277, "logits_per_char": -0.5671719312667847, "num_chars": 24}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 496, "native_id": "Mercury_7064260", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.750670433044434, "incorrect_loss_raw": 14.552406946818033, "correct_loss_per_char": 0.17064556242927673, "incorrect_loss_per_char": 0.2366531430635792, "correct_loss_per_token": 1.1945189370049372, "incorrect_loss_per_token": 1.6169341052020039, "correct_loss_uncond": -22.313328742980957, "incorrect_loss_uncond": -19.768771489461262}, "model_output": [{"sum_logits": -15.705787658691406, "num_tokens": 9, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -35.475791931152344, "logits_per_token": -1.7450875176323786, "logits_per_char": -0.25747192883100667, "num_chars": 61}, {"sum_logits": -10.750670433044434, "num_tokens": 9, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -33.06399917602539, "logits_per_token": -1.1945189370049372, "logits_per_char": -0.17064556242927673, "num_chars": 63}, {"sum_logits": -16.936019897460938, "num_tokens": 9, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -34.374691009521484, "logits_per_token": -1.8817799886067708, "logits_per_char": -0.2776396704501793, "num_chars": 61}, {"sum_logits": -11.015413284301758, "num_tokens": 9, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -33.11305236816406, "logits_per_token": -1.223934809366862, "logits_per_char": -0.17484782990955172, "num_chars": 63}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 497, "native_id": "Mercury_7015995", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 0.9761692881584167, "incorrect_loss_raw": 5.369194666544597, "correct_loss_per_char": 0.09761692881584168, "incorrect_loss_per_char": 0.6473638702321936, "correct_loss_per_token": 0.9761692881584167, "incorrect_loss_per_token": 4.512917995452881, "correct_loss_uncond": -12.21393233537674, "incorrect_loss_uncond": -9.87795607248942}, "model_output": [{"sum_logits": -5.137660026550293, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -14.662814140319824, "logits_per_token": -2.5688300132751465, "logits_per_char": -0.5708511140611436, "num_chars": 9}, {"sum_logits": -0.9761692881584167, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": true, "sum_logits_uncond": -13.190101623535156, "logits_per_token": -0.9761692881584167, "logits_per_char": -0.09761692881584168, "num_chars": 10}, {"sum_logits": -3.224456310272217, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -15.469865798950195, "logits_per_token": -3.224456310272217, "logits_per_char": -0.4030570387840271, "num_chars": 8}, {"sum_logits": -7.745467662811279, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -15.608772277832031, "logits_per_token": -7.745467662811279, "logits_per_char": -0.9681834578514099, "num_chars": 8}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 498, "native_id": "Mercury_400887", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.158167839050293, "incorrect_loss_raw": 11.99700673421224, "correct_loss_per_char": 1.4511668341500419, "incorrect_loss_per_char": 1.0796996979486375, "correct_loss_per_token": 2.5395419597625732, "incorrect_loss_per_token": 1.7661658922831218, "correct_loss_uncond": -12.104926109313965, "incorrect_loss_uncond": -22.613914489746094}, "model_output": [{"sum_logits": -13.529855728149414, "num_tokens": 9, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -39.18479919433594, "logits_per_token": -1.5033173031277127, "logits_per_char": -0.9664182662963867, "num_chars": 14}, {"sum_logits": -10.158167839050293, "num_tokens": 4, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -22.263093948364258, "logits_per_token": -2.5395419597625732, "logits_per_char": -1.4511668341500419, "num_chars": 7}, {"sum_logits": -13.10479736328125, "num_tokens": 9, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -42.352500915527344, "logits_per_token": -1.4560885959201388, "logits_per_char": -0.9360569545200893, "num_chars": 14}, {"sum_logits": -9.356367111206055, "num_tokens": 4, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -22.29546356201172, "logits_per_token": -2.3390917778015137, "logits_per_char": -1.3366238730294364, "num_chars": 7}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 499, "native_id": "Mercury_7247678", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 31.35198211669922, "incorrect_loss_raw": 32.7081241607666, "correct_loss_per_char": 0.4898747205734253, "incorrect_loss_per_char": 0.5110644400119781, "correct_loss_per_token": 2.8501801924272017, "incorrect_loss_per_token": 2.9734658327969634, "correct_loss_uncond": -6.957038879394531, "incorrect_loss_uncond": -5.985513687133789}, "model_output": [{"sum_logits": -31.35198211669922, "num_tokens": 11, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -38.30902099609375, "logits_per_token": -2.8501801924272017, "logits_per_char": -0.4898747205734253, "num_chars": 64}, {"sum_logits": -32.459999084472656, "num_tokens": 11, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -39.653480529785156, "logits_per_token": -2.9509090076793325, "logits_per_char": -0.5071874856948853, "num_chars": 64}, {"sum_logits": -33.810455322265625, "num_tokens": 11, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -36.927101135253906, "logits_per_token": -3.073677756569602, "logits_per_char": -0.5282883644104004, "num_chars": 64}, {"sum_logits": -31.853918075561523, "num_tokens": 11, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -39.50033187866211, "logits_per_token": -2.8958107341419566, "logits_per_char": -0.4977174699306488, "num_chars": 64}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 500, "native_id": "MDSA_2007_8_24", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.032322883605957, "incorrect_loss_raw": 15.119120279947916, "correct_loss_per_char": 0.2947514290903129, "incorrect_loss_per_char": 0.293256569118473, "correct_loss_per_token": 1.5032322883605957, "incorrect_loss_per_token": 1.8094233671824138, "correct_loss_uncond": -15.614348411560059, "incorrect_loss_uncond": -13.20791244506836}, "model_output": [{"sum_logits": -9.792291641235352, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -22.457298278808594, "logits_per_token": -1.224036455154419, "logits_per_char": -0.20834663066458195, "num_chars": 47}, {"sum_logits": -17.380800247192383, "num_tokens": 9, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -31.952171325683594, "logits_per_token": -1.9312000274658203, "logits_per_char": -0.3408000048469095, "num_chars": 51}, {"sum_logits": -15.032322883605957, "num_tokens": 10, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -30.646671295166016, "logits_per_token": -1.5032322883605957, "logits_per_char": -0.2947514290903129, "num_chars": 51}, {"sum_logits": -18.184268951416016, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -30.57162857055664, "logits_per_token": -2.273033618927002, "logits_per_char": -0.33062307184392753, "num_chars": 55}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 501, "native_id": "AKDE&ED_2008_8_48", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 15.932893753051758, "incorrect_loss_raw": 17.62364451090495, "correct_loss_per_char": 0.3886071647085795, "incorrect_loss_per_char": 0.488651642321903, "correct_loss_per_token": 2.276127679007394, "incorrect_loss_per_token": 3.49895696185884, "correct_loss_uncond": -14.009664535522461, "incorrect_loss_uncond": -12.960077285766602}, "model_output": [{"sum_logits": -15.932893753051758, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -29.94255828857422, "logits_per_token": -2.276127679007394, "logits_per_char": -0.3886071647085795, "num_chars": 41}, {"sum_logits": -15.264347076416016, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -30.130504608154297, "logits_per_token": -3.052869415283203, "logits_per_char": -0.4625559720126065, "num_chars": 33}, {"sum_logits": -18.27135467529297, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -31.56996726989746, "logits_per_token": -2.610193525041853, "logits_per_char": -0.48082512303402547, "num_chars": 38}, {"sum_logits": -19.33523178100586, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -30.05069351196289, "logits_per_token": -4.833807945251465, "logits_per_char": -0.5225738319190772, "num_chars": 37}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 502, "native_id": "Mercury_401014", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 7.578514099121094, "incorrect_loss_raw": 12.210608959197998, "correct_loss_per_char": 0.2296519423976089, "incorrect_loss_per_char": 0.350497917695479, "correct_loss_per_token": 1.0826448713030135, "incorrect_loss_per_token": 1.684915256500244, "correct_loss_uncond": -16.06998062133789, "incorrect_loss_uncond": -16.75599018732707}, "model_output": [{"sum_logits": -6.438725471496582, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -25.316957473754883, "logits_per_token": -1.2877450942993165, "logits_per_char": -0.24764328736525315, "num_chars": 26}, {"sum_logits": -7.476905345916748, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -25.715045928955078, "logits_per_token": -1.4953810691833496, "logits_per_char": -0.28757328253525954, "num_chars": 26}, {"sum_logits": -7.578514099121094, "num_tokens": 7, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -23.648494720458984, "logits_per_token": -1.0826448713030135, "logits_per_char": -0.2296519423976089, "num_chars": 33}, {"sum_logits": -22.716196060180664, "num_tokens": 10, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -35.867794036865234, "logits_per_token": -2.2716196060180662, "logits_per_char": -0.5162771831859242, "num_chars": 44}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 503, "native_id": "Mercury_7106698", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 4.344621181488037, "incorrect_loss_raw": 5.748876730600993, "correct_loss_per_char": 0.4344621181488037, "incorrect_loss_per_char": 0.5226251573273629, "correct_loss_per_token": 4.344621181488037, "incorrect_loss_per_token": 4.8772501945495605, "correct_loss_uncond": -8.72908639907837, "incorrect_loss_uncond": -9.565123716990152}, "model_output": [{"sum_logits": -4.344621181488037, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -13.073707580566406, "logits_per_token": -4.344621181488037, "logits_per_char": -0.4344621181488037, "num_chars": 10}, {"sum_logits": -6.326333045959473, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -14.626222610473633, "logits_per_token": -6.326333045959473, "logits_per_char": -0.5751211859963157, "num_chars": 11}, {"sum_logits": -5.229759216308594, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -16.679563522338867, "logits_per_token": -2.614879608154297, "logits_per_char": -0.47543265602805396, "num_chars": 11}, {"sum_logits": -5.690537929534912, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -14.636215209960938, "logits_per_token": -5.690537929534912, "logits_per_char": -0.5173216299577192, "num_chars": 11}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 504, "native_id": "Mercury_7143308", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.224071502685547, "incorrect_loss_raw": 18.106959025065105, "correct_loss_per_char": 0.4784464306301541, "incorrect_loss_per_char": 0.40413144846532667, "correct_loss_per_token": 3.4448143005371095, "incorrect_loss_per_token": 2.1909209798883507, "correct_loss_uncond": -12.041006088256836, "incorrect_loss_uncond": -17.95180384318034}, "model_output": [{"sum_logits": -15.648962020874023, "num_tokens": 9, "num_tokens_all": 242, "is_greedy": false, "sum_logits_uncond": -36.15616226196289, "logits_per_token": -1.7387735578748915, "logits_per_char": -0.38168200050912254, "num_chars": 41}, {"sum_logits": -17.224071502685547, "num_tokens": 5, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -29.265077590942383, "logits_per_token": -3.4448143005371095, "logits_per_char": -0.4784464306301541, "num_chars": 36}, {"sum_logits": -17.091997146606445, "num_tokens": 8, "num_tokens_all": 241, "is_greedy": false, "sum_logits_uncond": -29.44628143310547, "logits_per_token": -2.1364996433258057, "logits_per_char": -0.3715651553610097, "num_chars": 46}, {"sum_logits": -21.579917907714844, "num_tokens": 8, "num_tokens_all": 241, "is_greedy": false, "sum_logits_uncond": -42.57384490966797, "logits_per_token": -2.6974897384643555, "logits_per_char": -0.45914718952584777, "num_chars": 47}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 505, "native_id": "MCAS_2005_9_21", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.612039566040039, "incorrect_loss_raw": 11.064455032348633, "correct_loss_per_char": 1.768673261006673, "incorrect_loss_per_char": 2.2293827056884763, "correct_loss_per_token": 5.3060197830200195, "incorrect_loss_per_token": 5.532227516174316, "correct_loss_uncond": -9.763200759887695, "incorrect_loss_uncond": -6.137085914611816}, "model_output": [{"sum_logits": -12.56488037109375, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -20.22576141357422, "logits_per_token": -6.282440185546875, "logits_per_char": -2.094146728515625, "num_chars": 6}, {"sum_logits": -10.612039566040039, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -20.375240325927734, "logits_per_token": -5.3060197830200195, "logits_per_char": -1.768673261006673, "num_chars": 6}, {"sum_logits": -11.262395858764648, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -17.710430145263672, "logits_per_token": -5.631197929382324, "logits_per_char": -2.2524791717529298, "num_chars": 5}, {"sum_logits": -9.3660888671875, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -13.668431282043457, "logits_per_token": -4.68304443359375, "logits_per_char": -2.341522216796875, "num_chars": 4}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 506, "native_id": "Mercury_400443", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 15.324735641479492, "incorrect_loss_raw": 17.840298970540363, "correct_loss_per_char": 0.4032825168810393, "incorrect_loss_per_char": 0.4694815518563254, "correct_loss_per_token": 1.7027484046088324, "incorrect_loss_per_token": 1.9822554411711515, "correct_loss_uncond": -19.023210525512695, "incorrect_loss_uncond": -19.10236994425456}, "model_output": [{"sum_logits": -16.916603088378906, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -35.89115905761719, "logits_per_token": -1.879622565375434, "logits_per_char": -0.44517376548365545, "num_chars": 38}, {"sum_logits": -15.324735641479492, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -34.34794616699219, "logits_per_token": -1.7027484046088324, "logits_per_char": -0.4032825168810393, "num_chars": 38}, {"sum_logits": -17.707988739013672, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -38.244503021240234, "logits_per_token": -1.9675543043348525, "logits_per_char": -0.46599970365825455, "num_chars": 38}, {"sum_logits": -18.896305084228516, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -36.692344665527344, "logits_per_token": -2.0995894538031683, "logits_per_char": -0.4972711864270662, "num_chars": 38}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 507, "native_id": "Mercury_7283430", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 10.578128814697266, "incorrect_loss_raw": 13.717022260030111, "correct_loss_per_char": 0.3022322518484933, "incorrect_loss_per_char": 0.3989315095290639, "correct_loss_per_token": 1.3222661018371582, "incorrect_loss_per_token": 1.5814033084445531, "correct_loss_uncond": -6.618618011474609, "incorrect_loss_uncond": -11.474177996317545}, "model_output": [{"sum_logits": -12.374580383300781, "num_tokens": 8, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -23.381694793701172, "logits_per_token": -1.5468225479125977, "logits_per_char": -0.37498728434244794, "num_chars": 33}, {"sum_logits": -10.578128814697266, "num_tokens": 8, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -17.196746826171875, "logits_per_token": -1.3222661018371582, "logits_per_char": -0.3022322518484933, "num_chars": 35}, {"sum_logits": -13.74576473236084, "num_tokens": 9, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -27.86116600036621, "logits_per_token": -1.5273071924845378, "logits_per_char": -0.4042871980106129, "num_chars": 34}, {"sum_logits": -15.030721664428711, "num_tokens": 9, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -24.330739974975586, "logits_per_token": -1.6700801849365234, "logits_per_char": -0.41752004623413086, "num_chars": 36}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 508, "native_id": "Mercury_7159250", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 6.845694541931152, "incorrect_loss_raw": 7.207592964172363, "correct_loss_per_char": 0.3259854543776739, "incorrect_loss_per_char": 0.3741755494734649, "correct_loss_per_token": 2.2818981806437173, "incorrect_loss_per_token": 2.4475800593694053, "correct_loss_uncond": -23.028611183166504, "incorrect_loss_uncond": -16.34107240041097}, "model_output": [{"sum_logits": -7.7481818199157715, "num_tokens": 2, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -19.63015365600586, "logits_per_token": -3.8740909099578857, "logits_per_char": -0.455775401171516, "num_chars": 17}, {"sum_logits": -6.93823766708374, "num_tokens": 4, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -24.448978424072266, "logits_per_token": -1.734559416770935, "logits_per_char": -0.36517040353072316, "num_chars": 19}, {"sum_logits": -6.845694541931152, "num_tokens": 3, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -29.874305725097656, "logits_per_token": -2.2818981806437173, "logits_per_char": -0.3259854543776739, "num_chars": 21}, {"sum_logits": -6.936359405517578, "num_tokens": 4, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -26.566864013671875, "logits_per_token": -1.7340898513793945, "logits_per_char": -0.3015808437181556, "num_chars": 23}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 509, "native_id": "Mercury_401912", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.131322383880615, "incorrect_loss_raw": 3.640999714533488, "correct_loss_per_char": 2.043774127960205, "incorrect_loss_per_char": 0.8255428989728292, "correct_loss_per_token": 3.0656611919403076, "incorrect_loss_per_token": 1.820499857266744, "correct_loss_uncond": -4.120952129364014, "incorrect_loss_uncond": -6.068959633509318}, "model_output": [{"sum_logits": -6.131322383880615, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -10.252274513244629, "logits_per_token": -3.0656611919403076, "logits_per_char": -2.043774127960205, "num_chars": 3}, {"sum_logits": -3.439789295196533, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -10.773286819458008, "logits_per_token": -1.7198946475982666, "logits_per_char": -0.8599473237991333, "num_chars": 4}, {"sum_logits": -2.4007880687713623, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -9.886903762817383, "logits_per_token": -1.2003940343856812, "logits_per_char": -0.6001970171928406, "num_chars": 4}, {"sum_logits": -5.082421779632568, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -8.469687461853027, "logits_per_token": -2.541210889816284, "logits_per_char": -1.0164843559265138, "num_chars": 5}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 510, "native_id": "Mercury_7219328", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 9.498193740844727, "incorrect_loss_raw": 15.169326464335123, "correct_loss_per_char": 0.5587172788732192, "incorrect_loss_per_char": 0.7846433570961547, "correct_loss_per_token": 2.3745484352111816, "incorrect_loss_per_token": 3.202716554914202, "correct_loss_uncond": -15.592363357543945, "incorrect_loss_uncond": -8.71874205271403}, "model_output": [{"sum_logits": -7.608813285827637, "num_tokens": 7, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -17.985713958740234, "logits_per_token": -1.0869733265468053, "logits_per_char": -0.44757725210750804, "num_chars": 17}, {"sum_logits": -9.498193740844727, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -25.090557098388672, "logits_per_token": -2.3745484352111816, "logits_per_char": -0.5587172788732192, "num_chars": 17}, {"sum_logits": -18.826862335205078, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -27.682157516479492, "logits_per_token": -4.7067155838012695, "logits_per_char": -0.7844525973002116, "num_chars": 24}, {"sum_logits": -19.072303771972656, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -25.996334075927734, "logits_per_token": -3.8144607543945312, "logits_per_char": -1.1219002218807446, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 511, "native_id": "Mercury_7214498", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 3.021331787109375, "incorrect_loss_raw": 3.8324852784474692, "correct_loss_per_char": 0.3776664733886719, "incorrect_loss_per_char": 0.5402727410906837, "correct_loss_per_token": 1.5106658935546875, "incorrect_loss_per_token": 2.5674760739008584, "correct_loss_uncond": -9.770746231079102, "incorrect_loss_uncond": -7.7399609088897705}, "model_output": [{"sum_logits": -3.9470160007476807, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -9.714378356933594, "logits_per_token": -1.9735080003738403, "logits_per_char": -0.6578360001246134, "num_chars": 6}, {"sum_logits": -3.907400608062744, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -11.095708847045898, "logits_per_token": -3.907400608062744, "logits_per_char": -0.5582000868661063, "num_chars": 7}, {"sum_logits": -3.021331787109375, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -12.792078018188477, "logits_per_token": -1.5106658935546875, "logits_per_char": -0.3776664733886719, "num_chars": 8}, {"sum_logits": -3.6430392265319824, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -13.907251358032227, "logits_per_token": -1.8215196132659912, "logits_per_char": -0.40478213628133136, "num_chars": 9}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 512, "native_id": "TAKS_2009_5_14", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 14.32747745513916, "incorrect_loss_raw": 18.179081598917644, "correct_loss_per_char": 0.5116956233978271, "incorrect_loss_per_char": 0.7089354597787251, "correct_loss_per_token": 2.38791290918986, "incorrect_loss_per_token": 3.3860915501912436, "correct_loss_uncond": -8.319926261901855, "incorrect_loss_uncond": -8.114161173502604}, "model_output": [{"sum_logits": -14.32747745513916, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -22.647403717041016, "logits_per_token": -2.38791290918986, "logits_per_char": -0.5116956233978271, "num_chars": 28}, {"sum_logits": -22.475229263305664, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -27.941591262817383, "logits_per_token": -3.7458715438842773, "logits_per_char": -0.7750079056312298, "num_chars": 29}, {"sum_logits": -12.708272933959961, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -24.568096160888672, "logits_per_token": -2.541654586791992, "logits_per_char": -0.5776487697254528, "num_chars": 22}, {"sum_logits": -19.353742599487305, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -26.370040893554688, "logits_per_token": -3.870748519897461, "logits_per_char": -0.7741497039794922, "num_chars": 25}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 513, "native_id": "NYSEDREGENTS_2013_4_17", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 6.0602030754089355, "incorrect_loss_raw": 7.29874849319458, "correct_loss_per_char": 0.8657432964869908, "incorrect_loss_per_char": 1.3734322161901564, "correct_loss_per_token": 3.0301015377044678, "incorrect_loss_per_token": 6.214974880218506, "correct_loss_uncond": -8.521863460540771, "incorrect_loss_uncond": -4.4375379880269366}, "model_output": [{"sum_logits": -7.508966445922852, "num_tokens": 1, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -11.172590255737305, "logits_per_token": -7.508966445922852, "logits_per_char": -1.877241611480713, "num_chars": 4}, {"sum_logits": -7.884637355804443, "num_tokens": 1, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -7.884637355804443, "logits_per_char": -1.3141062259674072, "num_chars": 6}, {"sum_logits": -6.502641677856445, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -14.1472749710083, "logits_per_token": -3.2513208389282227, "logits_per_char": -0.9289488111223493, "num_chars": 7}, {"sum_logits": -6.0602030754089355, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -14.582066535949707, "logits_per_token": -3.0301015377044678, "logits_per_char": -0.8657432964869908, "num_chars": 7}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 514, "native_id": "Mercury_403907", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 12.826484680175781, "incorrect_loss_raw": 21.24547513326009, "correct_loss_per_char": 0.34666174811285894, "incorrect_loss_per_char": 0.5646785901023132, "correct_loss_per_token": 1.6033105850219727, "incorrect_loss_per_token": 2.4882409307691784, "correct_loss_uncond": -15.107646942138672, "incorrect_loss_uncond": -19.013667424519856}, "model_output": [{"sum_logits": -27.56863784790039, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -46.657676696777344, "logits_per_token": -3.446079730987549, "logits_per_char": -0.7068881499461639, "num_chars": 39}, {"sum_logits": -20.31719398498535, "num_tokens": 9, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -39.89264678955078, "logits_per_token": -2.2574659983317056, "logits_per_char": -0.5209536919227014, "num_chars": 39}, {"sum_logits": -12.826484680175781, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -27.934131622314453, "logits_per_token": -1.6033105850219727, "logits_per_char": -0.34666174811285894, "num_chars": 37}, {"sum_logits": -15.850593566894531, "num_tokens": 9, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -34.22710418701172, "logits_per_token": -1.7611770629882812, "logits_per_char": -0.46619392843807445, "num_chars": 34}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 515, "native_id": "Mercury_7081480", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 8.700111389160156, "incorrect_loss_raw": 11.942030111948648, "correct_loss_per_char": 0.24166976081000435, "incorrect_loss_per_char": 0.3512570374384616, "correct_loss_per_token": 1.4500185648600261, "incorrect_loss_per_token": 1.9257310579693507, "correct_loss_uncond": -25.236949920654297, "incorrect_loss_uncond": -23.063754240671795}, "model_output": [{"sum_logits": -8.700111389160156, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -33.93706130981445, "logits_per_token": -1.4500185648600261, "logits_per_char": -0.24166976081000435, "num_chars": 36}, {"sum_logits": -18.888797760009766, "num_tokens": 7, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -38.97303771972656, "logits_per_token": -2.698399680001395, "logits_per_char": -0.49707362526341486, "num_chars": 38}, {"sum_logits": -9.259950637817383, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -34.6551513671875, "logits_per_token": -1.5433251063028972, "logits_per_char": -0.2723514893475701, "num_chars": 34}, {"sum_logits": -7.677341938018799, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -31.389163970947266, "logits_per_token": -1.5354683876037598, "logits_per_char": -0.28434599770439994, "num_chars": 27}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 516, "native_id": "Mercury_416505", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.045940399169922, "incorrect_loss_raw": 16.450143178304035, "correct_loss_per_char": 0.6685808499654134, "incorrect_loss_per_char": 0.5900150097627691, "correct_loss_per_token": 2.674323399861654, "incorrect_loss_per_token": 2.6023628643580845, "correct_loss_uncond": -11.580318450927734, "incorrect_loss_uncond": -12.83004633585612}, "model_output": [{"sum_logits": -18.01386260986328, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -27.558095932006836, "logits_per_token": -3.602772521972656, "logits_per_char": -0.8188119368119673, "num_chars": 22}, {"sum_logits": -16.045940399169922, "num_tokens": 6, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -27.626258850097656, "logits_per_token": -2.674323399861654, "logits_per_char": -0.6685808499654134, "num_chars": 24}, {"sum_logits": -16.085731506347656, "num_tokens": 7, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -31.18636131286621, "logits_per_token": -2.297961643763951, "logits_per_char": -0.5026791095733643, "num_chars": 32}, {"sum_logits": -15.250835418701172, "num_tokens": 8, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -29.096111297607422, "logits_per_token": -1.9063544273376465, "logits_per_char": -0.44855398290297566, "num_chars": 34}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 517, "native_id": "Mercury_7041668", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.706783294677734, "incorrect_loss_raw": 15.834161758422852, "correct_loss_per_char": 0.4074825193823838, "incorrect_loss_per_char": 0.515949123585386, "correct_loss_per_token": 2.784463882446289, "incorrect_loss_per_token": 2.838854747348362, "correct_loss_uncond": -15.747367858886719, "incorrect_loss_uncond": -11.461643854777018}, "model_output": [{"sum_logits": -13.000955581665039, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -22.27662467956543, "logits_per_token": -2.1668259302775064, "logits_per_char": -0.5000367531409631, "num_chars": 26}, {"sum_logits": -17.984500885009766, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -30.284778594970703, "logits_per_token": -3.5969001770019533, "logits_per_char": -0.5620156526565552, "num_chars": 32}, {"sum_logits": -16.51702880859375, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -29.326013565063477, "logits_per_token": -2.752838134765625, "logits_per_char": -0.4857949649586397, "num_chars": 34}, {"sum_logits": -16.706783294677734, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -32.45415115356445, "logits_per_token": -2.784463882446289, "logits_per_char": -0.4074825193823838, "num_chars": 41}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 518, "native_id": "Mercury_SC_401309", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.993061065673828, "incorrect_loss_raw": 5.063428004582723, "correct_loss_per_char": 0.33287073771158854, "incorrect_loss_per_char": 0.4127821941224355, "correct_loss_per_token": 1.248265266418457, "incorrect_loss_per_token": 2.2345965107282004, "correct_loss_uncond": -13.25840950012207, "incorrect_loss_uncond": -12.748145341873169}, "model_output": [{"sum_logits": -9.653462409973145, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -15.328177452087402, "logits_per_token": -4.826731204986572, "logits_per_char": -0.8044552008310953, "num_chars": 12}, {"sum_logits": -1.9714117050170898, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -17.232770919799805, "logits_per_token": -0.9857058525085449, "logits_per_char": -0.17921924591064453, "num_chars": 11}, {"sum_logits": -4.993061065673828, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -18.2514705657959, "logits_per_token": -1.248265266418457, "logits_per_char": -0.33287073771158854, "num_chars": 15}, {"sum_logits": -3.5654098987579346, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -20.87377166748047, "logits_per_token": -0.8913524746894836, "logits_per_char": -0.25467213562556673, "num_chars": 14}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 519, "native_id": "NYSEDREGENTS_2010_4_1", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.3838582038879395, "incorrect_loss_raw": 6.8624467849731445, "correct_loss_per_char": 0.2819881836573283, "incorrect_loss_per_char": 1.2638994322882757, "correct_loss_per_token": 3.3838582038879395, "incorrect_loss_per_token": 6.8624467849731445, "correct_loss_uncond": -10.310630321502686, "incorrect_loss_uncond": -5.1591488520304365}, "model_output": [{"sum_logits": -9.773093223571777, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -11.46108627319336, "logits_per_token": -9.773093223571777, "logits_per_char": -1.6288488705952961, "num_chars": 6}, {"sum_logits": -3.3838582038879395, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -13.694488525390625, "logits_per_token": -3.3838582038879395, "logits_per_char": -0.2819881836573283, "num_chars": 12}, {"sum_logits": -3.610079288482666, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -12.611124038696289, "logits_per_token": -3.610079288482666, "logits_per_char": -0.7220158576965332, "num_chars": 5}, {"sum_logits": -7.20416784286499, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -11.992576599121094, "logits_per_token": -7.20416784286499, "logits_per_char": -1.440833568572998, "num_chars": 5}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 520, "native_id": "ACTAAP_2007_7_36", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.037841320037842, "incorrect_loss_raw": 4.01999568939209, "correct_loss_per_char": 0.839640220006307, "incorrect_loss_per_char": 0.6140745957692464, "correct_loss_per_token": 5.037841320037842, "incorrect_loss_per_token": 3.0383098125457764, "correct_loss_uncond": -7.018682956695557, "incorrect_loss_uncond": -8.901803016662598}, "model_output": [{"sum_logits": -2.3481297492980957, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -11.566265106201172, "logits_per_token": -2.3481297492980957, "logits_per_char": -0.5870324373245239, "num_chars": 4}, {"sum_logits": -3.821742057800293, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -12.463665008544922, "logits_per_token": -3.821742057800293, "logits_per_char": -0.7643484115600586, "num_chars": 5}, {"sum_logits": -5.037841320037842, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -12.056524276733398, "logits_per_token": -5.037841320037842, "logits_per_char": -0.839640220006307, "num_chars": 6}, {"sum_logits": -5.890115261077881, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.735466003417969, "logits_per_token": -2.9450576305389404, "logits_per_char": -0.49084293842315674, "num_chars": 12}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 521, "native_id": "VASoL_2009_3_12", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 14.02048397064209, "incorrect_loss_raw": 16.39530849456787, "correct_loss_per_char": 0.4005852563040597, "incorrect_loss_per_char": 0.5243325722806248, "correct_loss_per_token": 2.0029262815202986, "incorrect_loss_per_token": 2.366748995251126, "correct_loss_uncond": -17.318385124206543, "incorrect_loss_uncond": -14.26311206817627}, "model_output": [{"sum_logits": -18.670757293701172, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -34.93920135498047, "logits_per_token": -3.111792882283529, "logits_per_char": -0.6915095293963397, "num_chars": 27}, {"sum_logits": -9.747252464294434, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -21.785465240478516, "logits_per_token": -1.3924646377563477, "logits_per_char": -0.30460163950920105, "num_chars": 32}, {"sum_logits": -20.767915725708008, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -35.25059509277344, "logits_per_token": -2.595989465713501, "logits_per_char": -0.5768865479363335, "num_chars": 36}, {"sum_logits": -14.02048397064209, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -31.338869094848633, "logits_per_token": -2.0029262815202986, "logits_per_char": -0.4005852563040597, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 522, "native_id": "Mercury_7085295", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 3.521850109100342, "incorrect_loss_raw": 5.456279277801514, "correct_loss_per_char": 0.5869750181833903, "incorrect_loss_per_char": 0.8568110314626542, "correct_loss_per_token": 1.760925054550171, "incorrect_loss_per_token": 2.0412373542785645, "correct_loss_uncond": -11.451645374298096, "incorrect_loss_uncond": -12.292101383209229}, "model_output": [{"sum_logits": -5.740566253662109, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -18.710708618164062, "logits_per_token": -1.9135220845540364, "logits_per_char": -0.9567610422770182, "num_chars": 6}, {"sum_logits": -4.004596710205078, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -15.378349304199219, "logits_per_token": -2.002298355102539, "logits_per_char": -0.6674327850341797, "num_chars": 6}, {"sum_logits": -3.521850109100342, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.973495483398438, "logits_per_token": -1.760925054550171, "logits_per_char": -0.5869750181833903, "num_chars": 6}, {"sum_logits": -6.6236748695373535, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -19.156084060668945, "logits_per_token": -2.2078916231791177, "logits_per_char": -0.9462392670767648, "num_chars": 7}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 523, "native_id": "Mercury_7201968", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.59046173095703, "incorrect_loss_raw": 22.085955301920574, "correct_loss_per_char": 0.5331346193949381, "incorrect_loss_per_char": 0.410705776789464, "correct_loss_per_token": 4.265076955159505, "incorrect_loss_per_token": 2.8805199350629533, "correct_loss_uncond": -12.653213500976562, "incorrect_loss_uncond": -15.42076555887858}, "model_output": [{"sum_logits": -25.59046173095703, "num_tokens": 6, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -38.243675231933594, "logits_per_token": -4.265076955159505, "logits_per_char": -0.5331346193949381, "num_chars": 48}, {"sum_logits": -14.557046890258789, "num_tokens": 8, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -30.18547248840332, "logits_per_token": -1.8196308612823486, "logits_per_char": -0.2911409378051758, "num_chars": 50}, {"sum_logits": -20.12228775024414, "num_tokens": 7, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -33.62840270996094, "logits_per_token": -2.8746125357491628, "logits_per_char": -0.38696707212007964, "num_chars": 52}, {"sum_logits": -31.57853126525879, "num_tokens": 8, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -48.7062873840332, "logits_per_token": -3.9473164081573486, "logits_per_char": -0.5540093204431367, "num_chars": 57}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 524, "native_id": "Mercury_7214008", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 34.62678527832031, "incorrect_loss_raw": 25.275896072387695, "correct_loss_per_char": 0.5410435199737549, "incorrect_loss_per_char": 0.5379215389213449, "correct_loss_per_token": 2.663598867563101, "incorrect_loss_per_token": 2.7540304398639894, "correct_loss_uncond": -7.539176940917969, "incorrect_loss_uncond": -15.662569046020508}, "model_output": [{"sum_logits": -28.699209213256836, "num_tokens": 10, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -43.47930145263672, "logits_per_token": -2.8699209213256838, "logits_per_char": -0.531466837282534, "num_chars": 54}, {"sum_logits": -21.32444190979004, "num_tokens": 7, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -40.948795318603516, "logits_per_token": -3.04634884425572, "logits_per_char": -0.5763362678321632, "num_chars": 37}, {"sum_logits": -34.62678527832031, "num_tokens": 13, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -42.16596221923828, "logits_per_token": -2.663598867563101, "logits_per_char": -0.5410435199737549, "num_chars": 64}, {"sum_logits": -25.80403709411621, "num_tokens": 11, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -38.387298583984375, "logits_per_token": -2.3458215540105645, "logits_per_char": -0.5059615116493374, "num_chars": 51}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 525, "native_id": "Mercury_176855", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 2.7229466438293457, "incorrect_loss_raw": 8.390804767608643, "correct_loss_per_char": 0.3403683304786682, "incorrect_loss_per_char": 0.8117259102638322, "correct_loss_per_token": 2.7229466438293457, "incorrect_loss_per_token": 5.981046915054321, "correct_loss_uncond": -10.167706966400146, "incorrect_loss_uncond": -5.662901719411214}, "model_output": [{"sum_logits": -11.58485221862793, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.0220947265625, "logits_per_token": -5.792426109313965, "logits_per_char": -1.05316838351163, "num_chars": 11}, {"sum_logits": -2.7229466438293457, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -12.890653610229492, "logits_per_token": -2.7229466438293457, "logits_per_char": -0.3403683304786682, "num_chars": 8}, {"sum_logits": -10.7138671875, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -14.885438919067383, "logits_per_token": -10.7138671875, "logits_per_char": -1.1904296875, "num_chars": 9}, {"sum_logits": -2.873694896697998, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -12.253585815429688, "logits_per_token": -1.436847448348999, "logits_per_char": -0.19157965977986655, "num_chars": 15}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 526, "native_id": "Mercury_SC_401678", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 36.90894317626953, "incorrect_loss_raw": 21.129283905029297, "correct_loss_per_char": 1.190611070202243, "incorrect_loss_per_char": 0.6063894820703161, "correct_loss_per_token": 5.272706168038504, "incorrect_loss_per_token": 3.15365491594587, "correct_loss_uncond": -2.98223876953125, "incorrect_loss_uncond": -13.69183095296224}, "model_output": [{"sum_logits": -17.033409118652344, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -27.175670623779297, "logits_per_token": -2.8389015197753906, "logits_per_char": -0.5873589351259428, "num_chars": 29}, {"sum_logits": -36.90894317626953, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -39.89118194580078, "logits_per_token": -5.272706168038504, "logits_per_char": -1.190611070202243, "num_chars": 31}, {"sum_logits": -24.206859588623047, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -36.2166748046875, "logits_per_token": -3.458122798374721, "logits_per_char": -0.6916245596749442, "num_chars": 35}, {"sum_logits": -22.1475830078125, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -41.07099914550781, "logits_per_token": -3.1639404296875, "logits_per_char": -0.540184951410061, "num_chars": 41}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 527, "native_id": "Mercury_417143", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.0308713912963867, "incorrect_loss_raw": 3.3181583086649575, "correct_loss_per_char": 0.33847856521606445, "incorrect_loss_per_char": 0.5228483608790807, "correct_loss_per_token": 2.0308713912963867, "incorrect_loss_per_token": 3.3181583086649575, "correct_loss_uncond": -9.126009941101074, "incorrect_loss_uncond": -8.131213029225668}, "model_output": [{"sum_logits": -3.531843662261963, "num_tokens": 1, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -11.268070220947266, "logits_per_token": -3.531843662261963, "logits_per_char": -0.5045490946088519, "num_chars": 7}, {"sum_logits": -2.563351631164551, "num_tokens": 1, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -11.070838928222656, "logits_per_token": -2.563351631164551, "logits_per_char": -0.5126703262329102, "num_chars": 5}, {"sum_logits": -3.8592796325683594, "num_tokens": 1, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -12.009204864501953, "logits_per_token": -3.8592796325683594, "logits_per_char": -0.5513256617954799, "num_chars": 7}, {"sum_logits": -2.0308713912963867, "num_tokens": 1, "num_tokens_all": 212, "is_greedy": true, "sum_logits_uncond": -11.156881332397461, "logits_per_token": -2.0308713912963867, "logits_per_char": -0.33847856521606445, "num_chars": 6}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 528, "native_id": "NYSEDREGENTS_2013_4_21", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 16.16081428527832, "incorrect_loss_raw": 19.087063471476238, "correct_loss_per_char": 0.461737551007952, "incorrect_loss_per_char": 0.5453446706136068, "correct_loss_per_token": 2.30868775503976, "incorrect_loss_per_token": 2.7267233530680337, "correct_loss_uncond": -18.995786666870117, "incorrect_loss_uncond": -19.32115109761556}, "model_output": [{"sum_logits": -17.77572250366211, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -38.22512435913086, "logits_per_token": -2.5393889290945872, "logits_per_char": -0.5078777858189174, "num_chars": 35}, {"sum_logits": -19.70529556274414, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -39.2296142578125, "logits_per_token": -2.8150422232491628, "logits_per_char": -0.5630084446498326, "num_chars": 35}, {"sum_logits": -19.78017234802246, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -37.76990509033203, "logits_per_token": -2.8257389068603516, "logits_per_char": -0.5651477813720703, "num_chars": 35}, {"sum_logits": -16.16081428527832, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -35.15660095214844, "logits_per_token": -2.30868775503976, "logits_per_char": -0.461737551007952, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 529, "native_id": "Mercury_7032620", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.311140537261963, "incorrect_loss_raw": 9.183895746866861, "correct_loss_per_char": 0.3046308557192485, "incorrect_loss_per_char": 0.30158313293528266, "correct_loss_per_token": 1.218523422876994, "incorrect_loss_per_token": 1.3769599975101532, "correct_loss_uncond": -16.265212535858154, "incorrect_loss_uncond": -18.972909291585285}, "model_output": [{"sum_logits": -8.186836242675781, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -26.1699161529541, "logits_per_token": -1.3644727071126301, "logits_per_char": -0.34111817677815753, "num_chars": 24}, {"sum_logits": -7.311140537261963, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -23.576353073120117, "logits_per_token": -1.218523422876994, "logits_per_char": -0.3046308557192485, "num_chars": 24}, {"sum_logits": -7.048630714416504, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -26.89313507080078, "logits_per_token": -1.0069472449166434, "logits_per_char": -0.2013894489833287, "num_chars": 35}, {"sum_logits": -12.3162202835083, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -31.407363891601562, "logits_per_token": -1.7594600405011858, "logits_per_char": -0.3622417730443618, "num_chars": 34}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 530, "native_id": "NYSEDREGENTS_2008_8_9", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.090174674987793, "incorrect_loss_raw": 9.665074030558268, "correct_loss_per_char": 0.2679039302625154, "incorrect_loss_per_char": 0.5561327475088614, "correct_loss_per_token": 1.6967248916625977, "incorrect_loss_per_token": 4.3317305776807995, "correct_loss_uncond": -15.407710075378418, "incorrect_loss_uncond": -10.749792098999023}, "model_output": [{"sum_logits": -9.01451587677002, "num_tokens": 3, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -21.33083152770996, "logits_per_token": -3.0048386255900064, "logits_per_char": -0.5008064375983344, "num_chars": 18}, {"sum_logits": -5.179731369018555, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -19.05302619934082, "logits_per_token": -2.5898656845092773, "logits_per_char": -0.345315424601237, "num_chars": 15}, {"sum_logits": -14.80097484588623, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -20.860740661621094, "logits_per_token": -7.400487422943115, "logits_per_char": -0.8222763803270128, "num_chars": 18}, {"sum_logits": -5.090174674987793, "num_tokens": 3, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -20.49788475036621, "logits_per_token": -1.6967248916625977, "logits_per_char": -0.2679039302625154, "num_chars": 19}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 531, "native_id": "TAKS_2009_8_27", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.95449447631836, "incorrect_loss_raw": 31.034574508666992, "correct_loss_per_char": 0.40909007338226816, "incorrect_loss_per_char": 0.5127128585480881, "correct_loss_per_token": 2.2685904069380327, "incorrect_loss_per_token": 2.9808448557099108, "correct_loss_uncond": -14.111255645751953, "incorrect_loss_uncond": -11.295942306518555}, "model_output": [{"sum_logits": -33.67823791503906, "num_tokens": 9, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -43.57643508911133, "logits_per_token": -3.74202643500434, "logits_per_char": -0.6476584214430589, "num_chars": 52}, {"sum_logits": -26.63875389099121, "num_tokens": 12, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -36.35916519165039, "logits_per_token": -2.219896157582601, "logits_per_char": -0.3860688969708871, "num_chars": 69}, {"sum_logits": -24.95449447631836, "num_tokens": 11, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -39.06575012207031, "logits_per_token": -2.2685904069380327, "logits_per_char": -0.40909007338226816, "num_chars": 61}, {"sum_logits": -32.7867317199707, "num_tokens": 11, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -47.05595016479492, "logits_per_token": -2.980611974542791, "logits_per_char": -0.5044112572303185, "num_chars": 65}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 532, "native_id": "NCEOGA_2013_8_57", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.14165115356445, "incorrect_loss_raw": 33.0494270324707, "correct_loss_per_char": 0.4464118215772841, "incorrect_loss_per_char": 0.48354612125788715, "correct_loss_per_token": 2.1427767435709635, "incorrect_loss_per_token": 2.197365089699074, "correct_loss_uncond": -14.66689682006836, "incorrect_loss_uncond": -17.52957280476888}, "model_output": [{"sum_logits": -32.75578689575195, "num_tokens": 18, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -48.95471954345703, "logits_per_token": -1.8197659386528864, "logits_per_char": -0.43099719599673625, "num_chars": 76}, {"sum_logits": -29.894325256347656, "num_tokens": 12, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -47.88902282714844, "logits_per_token": -2.4911937713623047, "logits_per_char": -0.49823875427246095, "num_chars": 60}, {"sum_logits": -32.14165115356445, "num_tokens": 15, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -46.80854797363281, "logits_per_token": -2.1427767435709635, "logits_per_char": -0.4464118215772841, "num_chars": 72}, {"sum_logits": -36.4981689453125, "num_tokens": 16, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -54.89325714111328, "logits_per_token": -2.2811355590820312, "logits_per_char": -0.5214024135044643, "num_chars": 70}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 533, "native_id": "Mercury_SC_413143", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.699156761169434, "incorrect_loss_raw": 8.899980545043945, "correct_loss_per_char": 0.39159639676411945, "incorrect_loss_per_char": 0.8549683741160802, "correct_loss_per_token": 1.1747891902923584, "incorrect_loss_per_token": 4.449990272521973, "correct_loss_uncond": -12.44609546661377, "incorrect_loss_uncond": -6.017712910970052}, "model_output": [{"sum_logits": -11.445659637451172, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -14.796025276184082, "logits_per_token": -5.722829818725586, "logits_per_char": -1.144565963745117, "num_chars": 10}, {"sum_logits": -6.173954963684082, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -15.690596580505371, "logits_per_token": -3.086977481842041, "logits_per_char": -0.7717443704605103, "num_chars": 8}, {"sum_logits": -4.699156761169434, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -17.145252227783203, "logits_per_token": -1.1747891902923584, "logits_per_char": -0.39159639676411945, "num_chars": 12}, {"sum_logits": -9.080327033996582, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -14.266458511352539, "logits_per_token": -4.540163516998291, "logits_per_char": -0.648594788142613, "num_chars": 14}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 534, "native_id": "Mercury_401195", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.365697860717773, "incorrect_loss_raw": 5.698397954305013, "correct_loss_per_char": 1.0261212757655553, "incorrect_loss_per_char": 0.4674013689125613, "correct_loss_per_token": 7.182848930358887, "incorrect_loss_per_token": 2.8491989771525064, "correct_loss_uncond": -3.466512680053711, "incorrect_loss_uncond": -14.950444539388021}, "model_output": [{"sum_logits": -6.23402738571167, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -19.24663734436035, "logits_per_token": -3.117013692855835, "logits_per_char": -0.5667297623374246, "num_chars": 11}, {"sum_logits": -4.759924411773682, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -22.886638641357422, "logits_per_token": -2.379962205886841, "logits_per_char": -0.36614803167489857, "num_chars": 13}, {"sum_logits": -6.1012420654296875, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -19.813251495361328, "logits_per_token": -3.0506210327148438, "logits_per_char": -0.4693263127253606, "num_chars": 13}, {"sum_logits": -14.365697860717773, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.832210540771484, "logits_per_token": -7.182848930358887, "logits_per_char": -1.0261212757655553, "num_chars": 14}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 535, "native_id": "CSZ10358", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.9638185501098633, "incorrect_loss_raw": 7.653510570526123, "correct_loss_per_char": 0.3704773187637329, "incorrect_loss_per_char": 0.6470659017562866, "correct_loss_per_token": 2.9638185501098633, "incorrect_loss_per_token": 4.958873987197876, "correct_loss_uncond": -11.054112434387207, "incorrect_loss_uncond": -9.019921779632568}, "model_output": [{"sum_logits": -2.9638185501098633, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -14.01793098449707, "logits_per_token": -2.9638185501098633, "logits_per_char": -0.3704773187637329, "num_chars": 8}, {"sum_logits": -6.515836238861084, "num_tokens": 2, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -15.985856056213379, "logits_per_token": -3.257918119430542, "logits_per_char": -0.5429863532384237, "num_chars": 12}, {"sum_logits": -9.651983261108398, "num_tokens": 2, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -20.008464813232422, "logits_per_token": -4.825991630554199, "logits_per_char": -0.6434655507405599, "num_chars": 15}, {"sum_logits": -6.792712211608887, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -14.025976181030273, "logits_per_token": -6.792712211608887, "logits_per_char": -0.7547458012898763, "num_chars": 9}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 536, "native_id": "MCAS_1999_4_26", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 6.775847911834717, "incorrect_loss_raw": 8.315437316894531, "correct_loss_per_char": 0.28232699632644653, "incorrect_loss_per_char": 0.327382151897137, "correct_loss_per_token": 1.3551695823669434, "incorrect_loss_per_token": 1.6630874633789061, "correct_loss_uncond": -20.224838733673096, "incorrect_loss_uncond": -17.131012598673504}, "model_output": [{"sum_logits": -9.868776321411133, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -28.16634750366211, "logits_per_token": -1.9737552642822265, "logits_per_char": -0.3795683200542743, "num_chars": 26}, {"sum_logits": -8.003584861755371, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -24.400901794433594, "logits_per_token": -1.6007169723510741, "logits_per_char": -0.30783018699059117, "num_chars": 26}, {"sum_logits": -6.775847911834717, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -27.000686645507812, "logits_per_token": -1.3551695823669434, "logits_per_char": -0.28232699632644653, "num_chars": 24}, {"sum_logits": -7.07395076751709, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -23.7721004486084, "logits_per_token": -1.414790153503418, "logits_per_char": -0.2947479486465454, "num_chars": 24}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 537, "native_id": "AKDE&ED_2008_8_36", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.50910758972168, "incorrect_loss_raw": 15.956907272338867, "correct_loss_per_char": 0.46803572870069937, "incorrect_loss_per_char": 0.521402357641514, "correct_loss_per_token": 2.901821517944336, "incorrect_loss_per_token": 3.1913814544677734, "correct_loss_uncond": -16.102148056030273, "incorrect_loss_uncond": -17.295296986897785}, "model_output": [{"sum_logits": -14.50910758972168, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -30.611255645751953, "logits_per_token": -2.901821517944336, "logits_per_char": -0.46803572870069937, "num_chars": 31}, {"sum_logits": -14.454920768737793, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -32.16136169433594, "logits_per_token": -2.8909841537475587, "logits_per_char": -0.4662877667334772, "num_chars": 31}, {"sum_logits": -14.824877738952637, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -30.692548751831055, "logits_per_token": -2.9649755477905275, "logits_per_char": -0.4782218625468592, "num_chars": 31}, {"sum_logits": -18.590923309326172, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -36.90270233154297, "logits_per_token": -3.7181846618652346, "logits_per_char": -0.6196974436442058, "num_chars": 30}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 538, "native_id": "Mercury_7017938", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 11.031474113464355, "incorrect_loss_raw": 12.494290669759115, "correct_loss_per_char": 0.6489102419684915, "incorrect_loss_per_char": 0.5852345926667839, "correct_loss_per_token": 5.515737056732178, "incorrect_loss_per_token": 5.2532255384657125, "correct_loss_uncond": -12.909880638122559, "incorrect_loss_uncond": -9.985980987548828}, "model_output": [{"sum_logits": -8.954774856567383, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -19.645198822021484, "logits_per_token": -4.477387428283691, "logits_per_char": -0.47130393981933594, "num_chars": 19}, {"sum_logits": -11.031474113464355, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -23.941354751586914, "logits_per_token": -5.515737056732178, "logits_per_char": -0.6489102419684915, "num_chars": 17}, {"sum_logits": -10.637540817260742, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -21.891212463378906, "logits_per_token": -5.318770408630371, "logits_per_char": -0.506549562726702, "num_chars": 21}, {"sum_logits": -17.89055633544922, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -25.904403686523438, "logits_per_token": -5.963518778483073, "logits_per_char": -0.7778502754543138, "num_chars": 23}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 539, "native_id": "MDSA_2013_8_32", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 12.975168228149414, "incorrect_loss_raw": 8.397099494934082, "correct_loss_per_char": 0.381622594945571, "incorrect_loss_per_char": 0.3855575409045213, "correct_loss_per_token": 2.1625280380249023, "incorrect_loss_per_token": 2.2614369922214084, "correct_loss_uncond": -21.568971633911133, "incorrect_loss_uncond": -14.700478553771973}, "model_output": [{"sum_logits": -5.724356174468994, "num_tokens": 3, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -20.200183868408203, "logits_per_token": -1.908118724822998, "logits_per_char": -0.3367268337922938, "num_chars": 17}, {"sum_logits": -7.371028423309326, "num_tokens": 3, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -24.333621978759766, "logits_per_token": -2.457009474436442, "logits_per_char": -0.3879488643847014, "num_chars": 19}, {"sum_logits": -12.095913887023926, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -24.758928298950195, "logits_per_token": -2.419182777404785, "logits_per_char": -0.4319969245365688, "num_chars": 28}, {"sum_logits": -12.975168228149414, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -34.54413986206055, "logits_per_token": -2.1625280380249023, "logits_per_char": -0.381622594945571, "num_chars": 34}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 540, "native_id": "Mercury_7038028", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 27.12459945678711, "incorrect_loss_raw": 19.56641133626302, "correct_loss_per_char": 0.6164681694724343, "incorrect_loss_per_char": 0.71320132216869, "correct_loss_per_token": 3.3905749320983887, "incorrect_loss_per_token": 2.951991346147325, "correct_loss_uncond": -7.320110321044922, "incorrect_loss_uncond": -11.912391662597656}, "model_output": [{"sum_logits": -20.050779342651367, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -29.845829010009766, "logits_per_token": -3.341796557108561, "logits_per_char": -1.0025389671325684, "num_chars": 20}, {"sum_logits": -16.394895553588867, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -30.722633361816406, "logits_per_token": -2.732482592264811, "logits_per_char": -0.6072183538366247, "num_chars": 27}, {"sum_logits": -27.12459945678711, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -34.44470977783203, "logits_per_token": -3.3905749320983887, "logits_per_char": -0.6164681694724343, "num_chars": 44}, {"sum_logits": -22.253559112548828, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -33.86794662475586, "logits_per_token": -2.7816948890686035, "logits_per_char": -0.5298466455368769, "num_chars": 42}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 541, "native_id": "Mercury_7057103", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 18.235862731933594, "incorrect_loss_raw": 28.705254872639973, "correct_loss_per_char": 0.7013793358435998, "incorrect_loss_per_char": 0.9727876863025484, "correct_loss_per_token": 2.6051232474190846, "incorrect_loss_per_token": 4.863333650619264, "correct_loss_uncond": -24.173599243164062, "incorrect_loss_uncond": -13.519161224365234}, "model_output": [{"sum_logits": -24.59979248046875, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -40.65984344482422, "logits_per_token": -3.514256068638393, "logits_per_char": -0.8785640171595982, "num_chars": 28}, {"sum_logits": -24.692485809326172, "num_tokens": 5, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -39.855751037597656, "logits_per_token": -4.938497161865234, "logits_per_char": -0.9876994323730469, "num_chars": 25}, {"sum_logits": -18.235862731933594, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -42.409461975097656, "logits_per_token": -2.6051232474190846, "logits_per_char": -0.7013793358435998, "num_chars": 26}, {"sum_logits": -36.823486328125, "num_tokens": 6, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -46.15765380859375, "logits_per_token": -6.137247721354167, "logits_per_char": -1.052099609375, "num_chars": 35}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 542, "native_id": "NYSEDREGENTS_2008_4_26", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 29.542430877685547, "incorrect_loss_raw": 31.76331837972005, "correct_loss_per_char": 0.590848617553711, "incorrect_loss_per_char": 0.7542527481227562, "correct_loss_per_token": 2.68567553433505, "incorrect_loss_per_token": 3.509740751852721, "correct_loss_uncond": -9.931140899658203, "incorrect_loss_uncond": -4.970436096191406}, "model_output": [{"sum_logits": -29.946035385131836, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -34.32032012939453, "logits_per_token": -4.278005055018833, "logits_per_char": -0.8318343162536621, "num_chars": 36}, {"sum_logits": -34.19469451904297, "num_tokens": 10, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -40.08403015136719, "logits_per_token": -3.419469451904297, "logits_per_char": -0.7952254539312318, "num_chars": 43}, {"sum_logits": -29.542430877685547, "num_tokens": 11, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -39.47357177734375, "logits_per_token": -2.68567553433505, "logits_per_char": -0.590848617553711, "num_chars": 50}, {"sum_logits": -31.14922523498535, "num_tokens": 11, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -35.796913146972656, "logits_per_token": -2.8317477486350318, "logits_per_char": -0.6356984741833746, "num_chars": 49}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 543, "native_id": "Mercury_417117", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.5236899852752686, "incorrect_loss_raw": 8.413339614868164, "correct_loss_per_char": 0.44046124815940857, "incorrect_loss_per_char": 0.855419548432835, "correct_loss_per_token": 3.5236899852752686, "incorrect_loss_per_token": 8.413339614868164, "correct_loss_uncond": -9.58694577217102, "incorrect_loss_uncond": -4.892177581787109}, "model_output": [{"sum_logits": -9.605354309082031, "num_tokens": 1, "num_tokens_all": 267, "is_greedy": false, "sum_logits_uncond": -12.360881805419922, "logits_per_token": -9.605354309082031, "logits_per_char": -1.0672615898980036, "num_chars": 9}, {"sum_logits": -7.0916337966918945, "num_tokens": 1, "num_tokens_all": 267, "is_greedy": false, "sum_logits_uncond": -13.425239562988281, "logits_per_token": -7.0916337966918945, "logits_per_char": -0.644693981517445, "num_chars": 11}, {"sum_logits": -3.5236899852752686, "num_tokens": 1, "num_tokens_all": 267, "is_greedy": false, "sum_logits_uncond": -13.110635757446289, "logits_per_token": -3.5236899852752686, "logits_per_char": -0.44046124815940857, "num_chars": 8}, {"sum_logits": -8.543030738830566, "num_tokens": 1, "num_tokens_all": 267, "is_greedy": false, "sum_logits_uncond": -14.130430221557617, "logits_per_token": -8.543030738830566, "logits_per_char": -0.8543030738830566, "num_chars": 10}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 544, "native_id": "MCAS_2016_8_15", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 73.58576202392578, "incorrect_loss_raw": 80.11891428629558, "correct_loss_per_char": 0.5040120686570259, "incorrect_loss_per_char": 0.5714412279589802, "correct_loss_per_token": 2.8302216163048377, "incorrect_loss_per_token": 3.2364885762077127, "correct_loss_uncond": -29.263450622558594, "incorrect_loss_uncond": -23.25354766845703}, "model_output": [{"sum_logits": -64.86454772949219, "num_tokens": 21, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -88.84718322753906, "logits_per_token": -3.0887879871186756, "logits_per_char": -0.5405378977457682, "num_chars": 120}, {"sum_logits": -84.918701171875, "num_tokens": 26, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -105.8927001953125, "logits_per_token": -3.2661038912259617, "logits_per_char": -0.5856462149784483, "num_chars": 145}, {"sum_logits": -73.58576202392578, "num_tokens": 26, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -102.84921264648438, "logits_per_token": -2.8302216163048377, "logits_per_char": -0.5040120686570259, "num_chars": 146}, {"sum_logits": -90.57349395751953, "num_tokens": 27, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -115.37750244140625, "logits_per_token": -3.354573850278501, "logits_per_char": -0.5881395711527242, "num_chars": 154}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 545, "native_id": "Mercury_400780", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.013845443725586, "incorrect_loss_raw": 19.21067746480306, "correct_loss_per_char": 1.5724175316946847, "incorrect_loss_per_char": 1.4388282831771904, "correct_loss_per_token": 3.668974240620931, "incorrect_loss_per_token": 3.201779577467177, "correct_loss_uncond": -12.534318923950195, "incorrect_loss_uncond": -12.912081400553385}, "model_output": [{"sum_logits": -17.428466796875, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -28.959421157836914, "logits_per_token": -2.9047444661458335, "logits_per_char": -1.3406512920673077, "num_chars": 13}, {"sum_logits": -18.955354690551758, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -30.837665557861328, "logits_per_token": -3.1592257817586265, "logits_per_char": -1.45810420696552, "num_chars": 13}, {"sum_logits": -22.013845443725586, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -34.54816436767578, "logits_per_token": -3.668974240620931, "logits_per_char": -1.5724175316946847, "num_chars": 14}, {"sum_logits": -21.248210906982422, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -36.571189880371094, "logits_per_token": -3.5413684844970703, "logits_per_char": -1.5177293504987444, "num_chars": 14}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 546, "native_id": "NYSEDREGENTS_2008_8_32", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.790182113647461, "incorrect_loss_raw": 13.765377680460611, "correct_loss_per_char": 0.36259933754249857, "incorrect_loss_per_char": 0.6906039025280594, "correct_loss_per_token": 1.6316970189412434, "incorrect_loss_per_token": 3.514739799499512, "correct_loss_uncond": -20.271697998046875, "incorrect_loss_uncond": -9.18525759379069}, "model_output": [{"sum_logits": -9.790182113647461, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -30.061880111694336, "logits_per_token": -1.6316970189412434, "logits_per_char": -0.36259933754249857, "num_chars": 27}, {"sum_logits": -10.730372428894043, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -21.488739013671875, "logits_per_token": -3.5767908096313477, "logits_per_char": -0.5365186214447022, "num_chars": 20}, {"sum_logits": -17.085529327392578, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -27.003711700439453, "logits_per_token": -4.2713823318481445, "logits_per_char": -0.9491960737440321, "num_chars": 18}, {"sum_logits": -13.480231285095215, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -20.359455108642578, "logits_per_token": -2.696046257019043, "logits_per_char": -0.5860970123954441, "num_chars": 23}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 547, "native_id": "Mercury_SC_416104", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 3.3801980018615723, "incorrect_loss_raw": 7.583134174346924, "correct_loss_per_char": 0.21126237511634827, "incorrect_loss_per_char": 0.5338631593264066, "correct_loss_per_token": 1.6900990009307861, "incorrect_loss_per_token": 2.864119291305542, "correct_loss_uncond": -13.281724452972412, "incorrect_loss_uncond": -13.542842388153076}, "model_output": [{"sum_logits": -5.0648908615112305, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -19.44003677368164, "logits_per_token": -2.5324454307556152, "logits_per_char": -0.3896069893470177, "num_chars": 13}, {"sum_logits": -6.555138111114502, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -18.70823097229004, "logits_per_token": -3.277569055557251, "logits_per_char": -0.6555138111114502, "num_chars": 10}, {"sum_logits": -3.3801980018615723, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -16.661922454833984, "logits_per_token": -1.6900990009307861, "logits_per_char": -0.21126237511634827, "num_chars": 16}, {"sum_logits": -11.129373550415039, "num_tokens": 4, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -25.22966194152832, "logits_per_token": -2.7823433876037598, "logits_per_char": -0.5564686775207519, "num_chars": 20}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 548, "native_id": "Mercury_416646", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.269805669784546, "incorrect_loss_raw": 7.987069845199585, "correct_loss_per_char": 0.32698056697845457, "incorrect_loss_per_char": 0.8012663622697195, "correct_loss_per_token": 1.634902834892273, "incorrect_loss_per_token": 3.5689048899544606, "correct_loss_uncond": -13.13248085975647, "incorrect_loss_uncond": -11.558210293451944}, "model_output": [{"sum_logits": -13.590429306030273, "num_tokens": 2, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -21.04105567932129, "logits_per_token": -6.795214653015137, "logits_per_char": -1.6988036632537842, "num_chars": 8}, {"sum_logits": -3.269805669784546, "num_tokens": 2, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -16.402286529541016, "logits_per_token": -1.634902834892273, "logits_per_char": -0.32698056697845457, "num_chars": 10}, {"sum_logits": -2.7274396419525146, "num_tokens": 2, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -15.988968849182129, "logits_per_token": -1.3637198209762573, "logits_per_char": -0.22728663682937622, "num_chars": 12}, {"sum_logits": -7.643340587615967, "num_tokens": 3, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -21.605815887451172, "logits_per_token": -2.547780195871989, "logits_per_char": -0.4777087867259979, "num_chars": 16}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 549, "native_id": "Mercury_SC_405296", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 15.133705139160156, "incorrect_loss_raw": 12.748522758483887, "correct_loss_per_char": 0.4323915754045759, "incorrect_loss_per_char": 0.6114636675205106, "correct_loss_per_token": 2.522284189860026, "incorrect_loss_per_token": 3.9564994441138377, "correct_loss_uncond": -9.912557601928711, "incorrect_loss_uncond": -10.787673632303873}, "model_output": [{"sum_logits": -14.03669548034668, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -23.91524887084961, "logits_per_token": -4.6788984934488935, "logits_per_char": -0.8256879694321576, "num_chars": 17}, {"sum_logits": -10.548293113708496, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.088794708251953, "logits_per_token": -2.637073278427124, "logits_per_char": -0.4395122130711873, "num_chars": 24}, {"sum_logits": -15.133705139160156, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -25.046262741088867, "logits_per_token": -2.522284189860026, "logits_per_char": -0.4323915754045759, "num_chars": 35}, {"sum_logits": -13.660579681396484, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -25.60454559326172, "logits_per_token": -4.553526560465495, "logits_per_char": -0.5691908200581869, "num_chars": 24}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 550, "native_id": "MCAS_2006_8_31", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 2.340904951095581, "incorrect_loss_raw": 8.605022271474203, "correct_loss_per_char": 0.39015082518259686, "incorrect_loss_per_char": 0.6855423755115932, "correct_loss_per_token": 2.340904951095581, "incorrect_loss_per_token": 5.7757963339487715, "correct_loss_uncond": -9.726050615310669, "incorrect_loss_uncond": -4.752559502919515}, "model_output": [{"sum_logits": -3.234544277191162, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -12.253585815429688, "logits_per_token": -1.617272138595581, "logits_per_char": -0.21563628514607747, "num_chars": 15}, {"sum_logits": -8.83971118927002, "num_tokens": 1, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -13.184402465820312, "logits_per_token": -8.83971118927002, "logits_per_char": -0.9821901321411133, "num_chars": 9}, {"sum_logits": -13.740811347961426, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -14.634757041931152, "logits_per_token": -6.870405673980713, "logits_per_char": -0.8588007092475891, "num_chars": 16}, {"sum_logits": -2.340904951095581, "num_tokens": 1, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -12.06695556640625, "logits_per_token": -2.340904951095581, "logits_per_char": -0.39015082518259686, "num_chars": 6}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 551, "native_id": "MCAS_2015_5_14", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 12.703018188476562, "incorrect_loss_raw": 14.302003224690756, "correct_loss_per_char": 0.5774099176580255, "incorrect_loss_per_char": 0.5641477122298414, "correct_loss_per_token": 2.5406036376953125, "incorrect_loss_per_token": 2.5417775048149953, "correct_loss_uncond": -15.095218658447266, "incorrect_loss_uncond": -12.149358113606771}, "model_output": [{"sum_logits": -12.703018188476562, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -27.798236846923828, "logits_per_token": -2.5406036376953125, "logits_per_char": -0.5774099176580255, "num_chars": 22}, {"sum_logits": -14.229927062988281, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -25.79083251953125, "logits_per_token": -2.8459854125976562, "logits_per_char": -0.6776155744280133, "num_chars": 21}, {"sum_logits": -15.91869831085205, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.216894149780273, "logits_per_token": -2.6531163851420083, "logits_per_char": -0.5895814189204464, "num_chars": 27}, {"sum_logits": -12.757384300231934, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -25.346357345581055, "logits_per_token": -2.1262307167053223, "logits_per_char": -0.4252461433410645, "num_chars": 30}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 552, "native_id": "Mercury_417465", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.973390579223633, "incorrect_loss_raw": 9.502401987711588, "correct_loss_per_char": 0.7982260386149088, "incorrect_loss_per_char": 0.7064645045298024, "correct_loss_per_token": 3.9911301930745444, "incorrect_loss_per_token": 4.067314889695909, "correct_loss_uncond": -7.039140701293945, "incorrect_loss_uncond": -8.676216761271158}, "model_output": [{"sum_logits": -6.006986618041992, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -14.54490852355957, "logits_per_token": -3.003493309020996, "logits_per_char": -0.546089692549272, "num_chars": 11}, {"sum_logits": -10.190269470214844, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -20.641033172607422, "logits_per_token": -5.095134735107422, "logits_per_char": -0.8491891225179037, "num_chars": 12}, {"sum_logits": -11.973390579223633, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -19.012531280517578, "logits_per_token": -3.9911301930745444, "logits_per_char": -0.7982260386149088, "num_chars": 15}, {"sum_logits": -12.30994987487793, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -19.34991455078125, "logits_per_token": -4.10331662495931, "logits_per_char": -0.7241146985222312, "num_chars": 17}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 553, "native_id": "MCAS_1998_4_19", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.132704734802246, "incorrect_loss_raw": 13.583661715189615, "correct_loss_per_char": 0.7925227483113607, "incorrect_loss_per_char": 1.4475986813742017, "correct_loss_per_token": 2.377568244934082, "incorrect_loss_per_token": 5.185934331681993, "correct_loss_uncond": -8.113543510437012, "incorrect_loss_uncond": -1.9141003290812175}, "model_output": [{"sum_logits": -11.844847679138184, "num_tokens": 2, "num_tokens_all": 172, "is_greedy": false, "sum_logits_uncond": -11.842113494873047, "logits_per_token": -5.922423839569092, "logits_per_char": -1.6921210970197404, "num_chars": 7}, {"sum_logits": -14.509809494018555, "num_tokens": 3, "num_tokens_all": 173, "is_greedy": false, "sum_logits_uncond": -16.377567291259766, "logits_per_token": -4.836603164672852, "logits_per_char": -1.4509809494018555, "num_chars": 10}, {"sum_logits": -14.39632797241211, "num_tokens": 3, "num_tokens_all": 173, "is_greedy": false, "sum_logits_uncond": -18.273605346679688, "logits_per_token": -4.798775990804036, "logits_per_char": -1.199693997701009, "num_chars": 12}, {"sum_logits": -7.132704734802246, "num_tokens": 3, "num_tokens_all": 173, "is_greedy": false, "sum_logits_uncond": -15.246248245239258, "logits_per_token": -2.377568244934082, "logits_per_char": -0.7925227483113607, "num_chars": 9}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 554, "native_id": "Mercury_7214778", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.220112800598145, "incorrect_loss_raw": 11.312775611877441, "correct_loss_per_char": 0.472146885735648, "incorrect_loss_per_char": 0.5195465411421595, "correct_loss_per_token": 2.203352133433024, "incorrect_loss_per_token": 2.43162210782369, "correct_loss_uncond": -27.733210563659668, "incorrect_loss_uncond": -15.137040138244629}, "model_output": [{"sum_logits": -14.27658462524414, "num_tokens": 6, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -35.41744613647461, "logits_per_token": -2.3794307708740234, "logits_per_char": -0.5287623935275607, "num_chars": 27}, {"sum_logits": -8.354138374328613, "num_tokens": 4, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -21.768095016479492, "logits_per_token": -2.0885345935821533, "logits_per_char": -0.49141990437227134, "num_chars": 17}, {"sum_logits": -11.30760383605957, "num_tokens": 4, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -22.16390609741211, "logits_per_token": -2.8269009590148926, "logits_per_char": -0.5384573255266462, "num_chars": 21}, {"sum_logits": -13.220112800598145, "num_tokens": 6, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -40.95332336425781, "logits_per_token": -2.203352133433024, "logits_per_char": -0.472146885735648, "num_chars": 28}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 555, "native_id": "Mercury_7123393", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 14.7727632522583, "incorrect_loss_raw": 13.830140749613443, "correct_loss_per_char": 0.7034649167742048, "incorrect_loss_per_char": 0.7292787449032652, "correct_loss_per_token": 2.462127208709717, "incorrect_loss_per_token": 3.8570744726392956, "correct_loss_uncond": -15.795302391052246, "incorrect_loss_uncond": -9.558600425720215}, "model_output": [{"sum_logits": -12.84238052368164, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -21.29667854309082, "logits_per_token": -4.28079350789388, "logits_per_char": -0.7554341484518612, "num_chars": 17}, {"sum_logits": -15.094537734985352, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -21.70810317993164, "logits_per_token": -5.03151257832845, "logits_per_char": -0.7547268867492676, "num_chars": 20}, {"sum_logits": -13.55350399017334, "num_tokens": 6, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -27.161441802978516, "logits_per_token": -2.2589173316955566, "logits_per_char": -0.677675199508667, "num_chars": 20}, {"sum_logits": -14.7727632522583, "num_tokens": 6, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -30.568065643310547, "logits_per_token": -2.462127208709717, "logits_per_char": -0.7034649167742048, "num_chars": 21}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 556, "native_id": "Mercury_7207550", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.04043197631836, "incorrect_loss_raw": 26.69436518351237, "correct_loss_per_char": 0.5466797568581321, "incorrect_loss_per_char": 0.586081565369372, "correct_loss_per_token": 3.0067386627197266, "incorrect_loss_per_token": 3.1622060316580316, "correct_loss_uncond": -21.201412200927734, "incorrect_loss_uncond": -12.911761601765951}, "model_output": [{"sum_logits": -21.241596221923828, "num_tokens": 8, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -28.936548233032227, "logits_per_token": -2.6551995277404785, "logits_per_char": -0.5180877127298494, "num_chars": 41}, {"sum_logits": -37.71135711669922, "num_tokens": 9, "num_tokens_all": 240, "is_greedy": false, "sum_logits_uncond": -55.51003646850586, "logits_per_token": -4.190150790744358, "logits_per_char": -0.6983584651240596, "num_chars": 54}, {"sum_logits": -21.130142211914062, "num_tokens": 8, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -34.371795654296875, "logits_per_token": -2.641267776489258, "logits_per_char": -0.5417985182542068, "num_chars": 39}, {"sum_logits": -18.04043197631836, "num_tokens": 6, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -39.241844177246094, "logits_per_token": -3.0067386627197266, "logits_per_char": -0.5466797568581321, "num_chars": 33}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 557, "native_id": "Mercury_SC_405827", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.43327522277832, "incorrect_loss_raw": 13.438870429992676, "correct_loss_per_char": 0.385831880569458, "incorrect_loss_per_char": 0.3817415548795453, "correct_loss_per_token": 1.7148083580864801, "incorrect_loss_per_token": 1.614646848042806, "correct_loss_uncond": -17.48903465270996, "incorrect_loss_uncond": -16.90472952524821}, "model_output": [{"sum_logits": -16.531784057617188, "num_tokens": 9, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -36.50025939941406, "logits_per_token": -1.8368648952907987, "logits_per_char": -0.4032142453077363, "num_chars": 41}, {"sum_logits": -15.43327522277832, "num_tokens": 9, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -32.92230987548828, "logits_per_token": -1.7148083580864801, "logits_per_char": -0.385831880569458, "num_chars": 40}, {"sum_logits": -19.686260223388672, "num_tokens": 9, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -32.450504302978516, "logits_per_token": -2.1873622470431857, "logits_per_char": -0.5468405617607964, "num_chars": 36}, {"sum_logits": -4.098567008972168, "num_tokens": 5, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -22.080036163330078, "logits_per_token": -0.8197134017944336, "logits_per_char": -0.19516985757010324, "num_chars": 21}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 558, "native_id": "NYSEDREGENTS_2015_4_11", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 12.341348648071289, "incorrect_loss_raw": 14.232860883076986, "correct_loss_per_char": 0.6170674324035644, "incorrect_loss_per_char": 0.7251874605814616, "correct_loss_per_token": 4.11378288269043, "incorrect_loss_per_token": 4.7442869610256615, "correct_loss_uncond": -17.117366790771484, "incorrect_loss_uncond": -13.50611400604248}, "model_output": [{"sum_logits": -11.5894775390625, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -26.03769302368164, "logits_per_token": -3.8631591796875, "logits_per_char": -0.579473876953125, "num_chars": 20}, {"sum_logits": -15.66847038269043, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -27.355506896972656, "logits_per_token": -5.22282346089681, "logits_per_char": -0.7834235191345215, "num_chars": 20}, {"sum_logits": -15.440634727478027, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -29.8237247467041, "logits_per_token": -5.146878242492676, "logits_per_char": -0.8126649856567383, "num_chars": 19}, {"sum_logits": -12.341348648071289, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -29.458715438842773, "logits_per_token": -4.11378288269043, "logits_per_char": -0.6170674324035644, "num_chars": 20}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 559, "native_id": "Mercury_404097", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.579469680786133, "incorrect_loss_raw": 9.812568664550781, "correct_loss_per_char": 0.6789734840393067, "incorrect_loss_per_char": 0.5285455726441883, "correct_loss_per_token": 3.394867420196533, "incorrect_loss_per_token": 2.8934659163157144, "correct_loss_uncond": -10.030889511108398, "incorrect_loss_uncond": -10.856555302937826}, "model_output": [{"sum_logits": -13.586050987243652, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -24.4489688873291, "logits_per_token": -3.396512746810913, "logits_per_char": -0.7547806104024252, "num_chars": 18}, {"sum_logits": -10.743420600891113, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -21.032546997070312, "logits_per_token": -3.5811402002970376, "logits_per_char": -0.5115914571852911, "num_chars": 21}, {"sum_logits": -13.579469680786133, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -23.61035919189453, "logits_per_token": -3.394867420196533, "logits_per_char": -0.6789734840393067, "num_chars": 20}, {"sum_logits": -5.108234405517578, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -16.525856018066406, "logits_per_token": -1.7027448018391926, "logits_per_char": -0.31926465034484863, "num_chars": 16}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 560, "native_id": "AIMS_2009_4_4", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.93837547302246, "incorrect_loss_raw": 38.41359774271647, "correct_loss_per_char": 0.25664205262155243, "incorrect_loss_per_char": 0.6234349315218518, "correct_loss_per_token": 2.1172969341278076, "incorrect_loss_per_token": 3.5921527257041324, "correct_loss_uncond": -17.092653274536133, "incorrect_loss_uncond": -7.999703089396159}, "model_output": [{"sum_logits": -30.729162216186523, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -37.64774703979492, "logits_per_token": -4.389880316598075, "logits_per_char": -0.5909454272343562, "num_chars": 52}, {"sum_logits": -45.14814758300781, "num_tokens": 12, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -52.67915344238281, "logits_per_token": -3.762345631917318, "logits_per_char": -0.7401335669345543, "num_chars": 61}, {"sum_logits": -16.93837547302246, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -34.031028747558594, "logits_per_token": -2.1172969341278076, "logits_per_char": -0.25664205262155243, "num_chars": 66}, {"sum_logits": -39.36348342895508, "num_tokens": 15, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -48.913002014160156, "logits_per_token": -2.624232228597005, "logits_per_char": -0.5392258003966449, "num_chars": 73}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 561, "native_id": "NCEOGA_2013_8_18", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.389141082763672, "incorrect_loss_raw": 15.970588366190592, "correct_loss_per_char": 0.49683260236467636, "incorrect_loss_per_char": 0.5231472165243966, "correct_loss_per_token": 3.4778282165527346, "incorrect_loss_per_token": 2.8154544406467017, "correct_loss_uncond": -7.127677917480469, "incorrect_loss_uncond": -8.957430203755697}, "model_output": [{"sum_logits": -17.389141082763672, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -24.51681900024414, "logits_per_token": -3.4778282165527346, "logits_per_char": -0.49683260236467636, "num_chars": 35}, {"sum_logits": -13.832074165344238, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -20.956756591796875, "logits_per_token": -2.766414833068848, "logits_per_char": -0.5532829666137695, "num_chars": 25}, {"sum_logits": -18.230485916137695, "num_tokens": 6, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -30.198774337768555, "logits_per_token": -3.0384143193562827, "logits_per_char": -0.5208710261753627, "num_chars": 35}, {"sum_logits": -15.849205017089844, "num_tokens": 6, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -23.628524780273438, "logits_per_token": -2.641534169514974, "logits_per_char": -0.4952876567840576, "num_chars": 32}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 562, "native_id": "Mercury_400884", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.1467533111572266, "incorrect_loss_raw": 3.5646918614705405, "correct_loss_per_char": 1.0733766555786133, "incorrect_loss_per_char": 1.5286147859361436, "correct_loss_per_token": 2.1467533111572266, "incorrect_loss_per_token": 3.5646918614705405, "correct_loss_uncond": -4.091140270233154, "incorrect_loss_uncond": -3.0351556142171225}, "model_output": [{"sum_logits": -3.272390365600586, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -5.551112174987793, "logits_per_token": -3.272390365600586, "logits_per_char": -1.636195182800293, "num_chars": 2}, {"sum_logits": -2.1467533111572266, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": true, "sum_logits_uncond": -6.237893581390381, "logits_per_token": -2.1467533111572266, "logits_per_char": -1.0733766555786133, "num_chars": 2}, {"sum_logits": -2.854524612426758, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -6.466894149780273, "logits_per_token": -2.854524612426758, "logits_per_char": -1.427262306213379, "num_chars": 2}, {"sum_logits": -4.567160606384277, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -7.781536102294922, "logits_per_token": -4.567160606384277, "logits_per_char": -1.522386868794759, "num_chars": 3}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 563, "native_id": "Mercury_7219678", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.113107681274414, "incorrect_loss_raw": 18.539191246032715, "correct_loss_per_char": 0.6826109886169434, "incorrect_loss_per_char": 0.5450167384842601, "correct_loss_per_token": 4.7782769203186035, "incorrect_loss_per_token": 3.22550228966607, "correct_loss_uncond": -12.173955917358398, "incorrect_loss_uncond": -15.009727160135904}, "model_output": [{"sum_logits": -19.113107681274414, "num_tokens": 4, "num_tokens_all": 246, "is_greedy": false, "sum_logits_uncond": -31.287063598632812, "logits_per_token": -4.7782769203186035, "logits_per_char": -0.6826109886169434, "num_chars": 28}, {"sum_logits": -16.85870361328125, "num_tokens": 6, "num_tokens_all": 248, "is_greedy": false, "sum_logits_uncond": -32.01538848876953, "logits_per_token": -2.809783935546875, "logits_per_char": -0.48167724609375, "num_chars": 35}, {"sum_logits": -12.207337379455566, "num_tokens": 5, "num_tokens_all": 247, "is_greedy": false, "sum_logits_uncond": -32.31503677368164, "logits_per_token": -2.4414674758911135, "logits_per_char": -0.34878106798444475, "num_chars": 35}, {"sum_logits": -26.551532745361328, "num_tokens": 6, "num_tokens_all": 248, "is_greedy": false, "sum_logits_uncond": -36.31632995605469, "logits_per_token": -4.425255457560222, "logits_per_char": -0.8045919013745857, "num_chars": 33}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 564, "native_id": "ACTAAP_2010_5_7", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.8184890747070312, "incorrect_loss_raw": 5.559732596079509, "correct_loss_per_char": 0.46974817911783856, "incorrect_loss_per_char": 0.9517068749382381, "correct_loss_per_token": 2.8184890747070312, "incorrect_loss_per_token": 5.559732596079509, "correct_loss_uncond": -7.534109115600586, "incorrect_loss_uncond": -5.273896376291911}, "model_output": [{"sum_logits": -4.408862113952637, "num_tokens": 1, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -10.638568878173828, "logits_per_token": -4.408862113952637, "logits_per_char": -1.1022155284881592, "num_chars": 4}, {"sum_logits": -2.8184890747070312, "num_tokens": 1, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -10.352598190307617, "logits_per_token": -2.8184890747070312, "logits_per_char": -0.46974817911783856, "num_chars": 6}, {"sum_logits": -4.206662654876709, "num_tokens": 1, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -10.913131713867188, "logits_per_token": -4.206662654876709, "logits_per_char": -0.6009518078395298, "num_chars": 7}, {"sum_logits": -8.06367301940918, "num_tokens": 1, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -10.949186325073242, "logits_per_token": -8.06367301940918, "logits_per_char": -1.1519532884870256, "num_chars": 7}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 565, "native_id": "ACTAAP_2012_7_9", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.325084686279297, "incorrect_loss_raw": 19.615869522094727, "correct_loss_per_char": 0.39837140622346295, "incorrect_loss_per_char": 0.45378052423172904, "correct_loss_per_token": 2.290635585784912, "incorrect_loss_per_token": 1.8885863108512684, "correct_loss_uncond": -19.462459564208984, "incorrect_loss_uncond": -5.9386037190755205}, "model_output": [{"sum_logits": -19.87920570373535, "num_tokens": 10, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -23.82402229309082, "logits_per_token": -1.9879205703735352, "logits_per_char": -0.5097232231727014, "num_chars": 39}, {"sum_logits": -14.149593353271484, "num_tokens": 8, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -19.676170349121094, "logits_per_token": -1.7686991691589355, "logits_per_char": -0.3451120330066216, "num_chars": 41}, {"sum_logits": -18.325084686279297, "num_tokens": 8, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -37.78754425048828, "logits_per_token": -2.290635585784912, "logits_per_char": -0.39837140622346295, "num_chars": 46}, {"sum_logits": -24.818809509277344, "num_tokens": 13, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -33.16322708129883, "logits_per_token": -1.9091391930213342, "logits_per_char": -0.5065063165158642, "num_chars": 49}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 566, "native_id": "MCAS_2005_8_6", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 27.815370559692383, "incorrect_loss_raw": 34.236884435017906, "correct_loss_per_char": 0.4559896813064325, "incorrect_loss_per_char": 0.5057556985616961, "correct_loss_per_token": 2.1396438892071066, "incorrect_loss_per_token": 2.8116763143828423, "correct_loss_uncond": -7.307615280151367, "incorrect_loss_uncond": -13.550071080525717}, "model_output": [{"sum_logits": -27.815370559692383, "num_tokens": 13, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -35.12298583984375, "logits_per_token": -2.1396438892071066, "logits_per_char": -0.4559896813064325, "num_chars": 61}, {"sum_logits": -31.21955680847168, "num_tokens": 13, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -43.075077056884766, "logits_per_token": -2.4015043698824368, "logits_per_char": -0.47302358800714667, "num_chars": 66}, {"sum_logits": -38.19597625732422, "num_tokens": 11, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -46.36888122558594, "logits_per_token": -3.4723614779385654, "logits_per_char": -0.5617055331959444, "num_chars": 68}, {"sum_logits": -33.29512023925781, "num_tokens": 13, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -53.916908264160156, "logits_per_token": -2.561163095327524, "logits_per_char": -0.48253797448199726, "num_chars": 69}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 567, "native_id": "Mercury_SC_401162", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.3221659660339355, "incorrect_loss_raw": 7.865740140279134, "correct_loss_per_char": 0.29027074575424194, "incorrect_loss_per_char": 0.9846585326724582, "correct_loss_per_token": 2.3221659660339355, "incorrect_loss_per_token": 6.433071613311768, "correct_loss_uncond": -11.695881366729736, "incorrect_loss_uncond": -8.477758089701334}, "model_output": [{"sum_logits": -8.5960111618042, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -18.642349243164062, "logits_per_token": -4.2980055809021, "logits_per_char": -1.228001594543457, "num_chars": 7}, {"sum_logits": -5.710478782653809, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -13.511032104492188, "logits_per_token": -5.710478782653809, "logits_per_char": -0.9517464637756348, "num_chars": 6}, {"sum_logits": -2.3221659660339355, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -14.018047332763672, "logits_per_token": -2.3221659660339355, "logits_per_char": -0.29027074575424194, "num_chars": 8}, {"sum_logits": -9.290730476379395, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -16.877113342285156, "logits_per_token": -9.290730476379395, "logits_per_char": -0.7742275396982828, "num_chars": 12}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 568, "native_id": "Mercury_SC_407710", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 6.006477355957031, "incorrect_loss_raw": 8.298114140828451, "correct_loss_per_char": 0.4620367196890024, "incorrect_loss_per_char": 0.902461725051957, "correct_loss_per_token": 3.0032386779785156, "incorrect_loss_per_token": 4.149057070414226, "correct_loss_uncond": -8.838862419128418, "incorrect_loss_uncond": -5.35814889272054}, "model_output": [{"sum_logits": -7.922682762145996, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -14.246198654174805, "logits_per_token": -3.961341381072998, "logits_per_char": -0.9903353452682495, "num_chars": 8}, {"sum_logits": -8.621498107910156, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -13.92937183380127, "logits_per_token": -4.310749053955078, "logits_per_char": -0.9579442342122396, "num_chars": 9}, {"sum_logits": -8.3501615524292, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -12.793218612670898, "logits_per_token": -4.1750807762146, "logits_per_char": -0.7591055956753817, "num_chars": 11}, {"sum_logits": -6.006477355957031, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -14.84533977508545, "logits_per_token": -3.0032386779785156, "logits_per_char": -0.4620367196890024, "num_chars": 13}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 569, "native_id": "VASoL_2009_3_23", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 12.969154357910156, "incorrect_loss_raw": 15.243754704793295, "correct_loss_per_char": 0.43230514526367186, "incorrect_loss_per_char": 0.45646304013388894, "correct_loss_per_token": 2.1615257263183594, "incorrect_loss_per_token": 2.287892871432834, "correct_loss_uncond": -19.18362045288086, "incorrect_loss_uncond": -19.202849706013996}, "model_output": [{"sum_logits": -14.389394760131836, "num_tokens": 7, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -35.036781311035156, "logits_per_token": -2.0556278228759766, "logits_per_char": -0.4496685862541199, "num_chars": 32}, {"sum_logits": -13.886917114257812, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -33.15864944458008, "logits_per_token": -2.3144861857096353, "logits_per_char": -0.44796506820186494, "num_chars": 31}, {"sum_logits": -17.454952239990234, "num_tokens": 7, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -35.14438247680664, "logits_per_token": -2.4935646057128906, "logits_per_char": -0.471755465945682, "num_chars": 37}, {"sum_logits": -12.969154357910156, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -32.152774810791016, "logits_per_token": -2.1615257263183594, "logits_per_char": -0.43230514526367186, "num_chars": 30}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 570, "native_id": "Mercury_SC_402276", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 34.02748489379883, "incorrect_loss_raw": 32.47808011372884, "correct_loss_per_char": 0.6543747094961313, "incorrect_loss_per_char": 0.842944507731583, "correct_loss_per_token": 2.8356237411499023, "incorrect_loss_per_token": 4.480029476875867, "correct_loss_uncond": -9.11611557006836, "incorrect_loss_uncond": -4.658824920654297}, "model_output": [{"sum_logits": -31.413488388061523, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -33.75077438354492, "logits_per_token": -4.4876411982945035, "logits_per_char": -1.0133383350987588, "num_chars": 31}, {"sum_logits": -27.04680633544922, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -30.09649085998535, "logits_per_token": -5.409361267089844, "logits_per_char": -0.7513001759847006, "num_chars": 36}, {"sum_logits": -38.97394561767578, "num_tokens": 11, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -47.56344985961914, "logits_per_token": -3.543085965243253, "logits_per_char": -0.7641950121112898, "num_chars": 51}, {"sum_logits": -34.02748489379883, "num_tokens": 12, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -43.14360046386719, "logits_per_token": -2.8356237411499023, "logits_per_char": -0.6543747094961313, "num_chars": 52}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 571, "native_id": "Mercury_400744", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 14.45434284210205, "incorrect_loss_raw": 16.657585779825848, "correct_loss_per_char": 2.064906120300293, "incorrect_loss_per_char": 2.625918751671201, "correct_loss_per_token": 3.6135857105255127, "incorrect_loss_per_token": 4.164396444956462, "correct_loss_uncond": -4.364039421081543, "incorrect_loss_uncond": -5.56411616007487}, "model_output": [{"sum_logits": -14.45434284210205, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -18.818382263183594, "logits_per_token": -3.6135857105255127, "logits_per_char": -2.064906120300293, "num_chars": 7}, {"sum_logits": -18.943538665771484, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -24.415803909301758, "logits_per_token": -4.735884666442871, "logits_per_char": -2.706219809395926, "num_chars": 7}, {"sum_logits": -15.182313919067383, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -21.4559383392334, "logits_per_token": -3.7955784797668457, "logits_per_char": -2.530385653177897, "num_chars": 6}, {"sum_logits": -15.846904754638672, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -20.793363571166992, "logits_per_token": -3.961726188659668, "logits_per_char": -2.641150792439779, "num_chars": 6}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 572, "native_id": "Mercury_SC_LBS10902", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.045186042785645, "incorrect_loss_raw": 10.322847366333008, "correct_loss_per_char": 0.6460847173418317, "incorrect_loss_per_char": 0.49577180453967956, "correct_loss_per_token": 3.0150620142618814, "incorrect_loss_per_token": 2.003909330519419, "correct_loss_uncond": -11.955370903015137, "incorrect_loss_uncond": -12.444249471028646}, "model_output": [{"sum_logits": -7.029877662658691, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -18.947715759277344, "logits_per_token": -2.343292554219564, "logits_per_char": -0.585823138554891, "num_chars": 12}, {"sum_logits": -9.045186042785645, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -21.00055694580078, "logits_per_token": -3.0150620142618814, "logits_per_char": -0.6460847173418317, "num_chars": 14}, {"sum_logits": -10.442301750183105, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -22.905864715576172, "logits_per_token": -1.7403836250305176, "logits_per_char": -0.4016269903916579, "num_chars": 26}, {"sum_logits": -13.496362686157227, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -26.447710037231445, "logits_per_token": -1.9280518123081751, "logits_per_char": -0.49986528467248986, "num_chars": 27}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 573, "native_id": "Mercury_7133245", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.60959243774414, "incorrect_loss_raw": 6.583945035934448, "correct_loss_per_char": 0.4099805922735305, "incorrect_loss_per_char": 0.3485690836237852, "correct_loss_per_token": 2.8698641459147134, "incorrect_loss_per_token": 2.65925231244829, "correct_loss_uncond": -10.425701141357422, "incorrect_loss_uncond": -11.985922733942667}, "model_output": [{"sum_logits": -11.38896369934082, "num_tokens": 3, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -23.316295623779297, "logits_per_token": -3.796321233113607, "logits_per_char": -0.5423316047305152, "num_chars": 21}, {"sum_logits": -8.60959243774414, "num_tokens": 3, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -19.035293579101562, "logits_per_token": -2.8698641459147134, "logits_per_char": -0.4099805922735305, "num_chars": 21}, {"sum_logits": -5.5831217765808105, "num_tokens": 2, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -17.30459213256836, "logits_per_token": -2.7915608882904053, "logits_per_char": -0.34894511103630066, "num_chars": 16}, {"sum_logits": -2.779749631881714, "num_tokens": 2, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -15.088715553283691, "logits_per_token": -1.389874815940857, "logits_per_char": -0.15443053510453966, "num_chars": 18}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 574, "native_id": "Mercury_7131530", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.17039680480957, "incorrect_loss_raw": 23.198630650838215, "correct_loss_per_char": 0.3428376755624447, "incorrect_loss_per_char": 0.4513788997520835, "correct_loss_per_token": 2.0189329783121743, "incorrect_loss_per_token": 2.5270029385884603, "correct_loss_uncond": -19.65302085876465, "incorrect_loss_uncond": -23.428091049194336}, "model_output": [{"sum_logits": -19.173402786254883, "num_tokens": 10, "num_tokens_all": 259, "is_greedy": false, "sum_logits_uncond": -49.04608917236328, "logits_per_token": -1.9173402786254883, "logits_per_char": -0.32497292858059124, "num_chars": 59}, {"sum_logits": -18.17039680480957, "num_tokens": 9, "num_tokens_all": 258, "is_greedy": false, "sum_logits_uncond": -37.82341766357422, "logits_per_token": -2.0189329783121743, "logits_per_char": -0.3428376755624447, "num_chars": 53}, {"sum_logits": -24.85678482055664, "num_tokens": 8, "num_tokens_all": 257, "is_greedy": false, "sum_logits_uncond": -43.70991516113281, "logits_per_token": -3.10709810256958, "logits_per_char": -0.5178496837615967, "num_chars": 48}, {"sum_logits": -25.565704345703125, "num_tokens": 10, "num_tokens_all": 259, "is_greedy": false, "sum_logits_uncond": -47.12416076660156, "logits_per_token": -2.5565704345703124, "logits_per_char": -0.5113140869140625, "num_chars": 50}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 575, "native_id": "Mercury_7041143", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.5484999418258667, "incorrect_loss_raw": 10.845577239990234, "correct_loss_per_char": 0.3871249854564667, "incorrect_loss_per_char": 2.511375331878662, "correct_loss_per_token": 0.5161666472752889, "incorrect_loss_per_token": 3.615192413330078, "correct_loss_uncond": -10.48538887500763, "incorrect_loss_uncond": -0.7947279612223307}, "model_output": [{"sum_logits": -9.472107887268066, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -10.288442611694336, "logits_per_token": -3.157369295756022, "logits_per_char": -2.3680269718170166, "num_chars": 4}, {"sum_logits": -11.063485145568848, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -10.929705619812012, "logits_per_token": -3.6878283818562827, "logits_per_char": -2.765871286392212, "num_chars": 4}, {"sum_logits": -1.5484999418258667, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -12.033888816833496, "logits_per_token": -0.5161666472752889, "logits_per_char": -0.3871249854564667, "num_chars": 4}, {"sum_logits": -12.001138687133789, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -13.702767372131348, "logits_per_token": -4.00037956237793, "logits_per_char": -2.400227737426758, "num_chars": 5}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 576, "native_id": "MCAS_2010_5_11984", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.91403579711914, "incorrect_loss_raw": 19.361223856608074, "correct_loss_per_char": 0.49570178985595703, "incorrect_loss_per_char": 0.6903739647259788, "correct_loss_per_token": 1.9828071594238281, "incorrect_loss_per_token": 2.946872895104544, "correct_loss_uncond": -16.08222007751465, "incorrect_loss_uncond": -12.23214022318522}, "model_output": [{"sum_logits": -9.91403579711914, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -25.99625587463379, "logits_per_token": -1.9828071594238281, "logits_per_char": -0.49570178985595703, "num_chars": 20}, {"sum_logits": -16.68393325805664, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -28.70812225341797, "logits_per_token": -3.336786651611328, "logits_per_char": -0.69516388575236, "num_chars": 24}, {"sum_logits": -18.416425704956055, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.913118362426758, "logits_per_token": -2.630917957850865, "logits_per_char": -0.6577294894627163, "num_chars": 28}, {"sum_logits": -22.983312606811523, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -36.158851623535156, "logits_per_token": -2.8729140758514404, "logits_per_char": -0.7182285189628601, "num_chars": 32}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 577, "native_id": "Mercury_7159285", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 34.174072265625, "incorrect_loss_raw": 34.39105987548828, "correct_loss_per_char": 0.5995451274671053, "incorrect_loss_per_char": 0.6202674554706676, "correct_loss_per_token": 3.1067338423295454, "incorrect_loss_per_token": 3.694754761534852, "correct_loss_uncond": -8.306587219238281, "incorrect_loss_uncond": -9.001407623291016}, "model_output": [{"sum_logits": -32.81902313232422, "num_tokens": 7, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -42.836708068847656, "logits_per_token": -4.688431876046317, "logits_per_char": -0.6435102574965533, "num_chars": 51}, {"sum_logits": -43.687416076660156, "num_tokens": 11, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -49.91557693481445, "logits_per_token": -3.971583279696378, "logits_per_char": -0.7801324299403599, "num_chars": 56}, {"sum_logits": -34.174072265625, "num_tokens": 11, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -42.48065948486328, "logits_per_token": -3.1067338423295454, "logits_per_char": -0.5995451274671053, "num_chars": 57}, {"sum_logits": -26.66674041748047, "num_tokens": 11, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -37.42511749267578, "logits_per_token": -2.424249128861861, "logits_per_char": -0.43715967897508967, "num_chars": 61}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 578, "native_id": "AIMS_2008_8_13", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 21.137557983398438, "incorrect_loss_raw": 26.37742551167806, "correct_loss_per_char": 1.0065503801618303, "incorrect_loss_per_char": 1.6179815267587638, "correct_loss_per_token": 2.6421947479248047, "incorrect_loss_per_token": 3.606916927155994, "correct_loss_uncond": -12.244224548339844, "incorrect_loss_uncond": -7.184033075968425}, "model_output": [{"sum_logits": -32.17760467529297, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -38.989173889160156, "logits_per_token": -4.596800667898996, "logits_per_char": -2.298400333949498, "num_chars": 14}, {"sum_logits": -19.858503341674805, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -27.6278076171875, "logits_per_token": -2.836929048810686, "logits_per_char": -1.3239002227783203, "num_chars": 15}, {"sum_logits": -27.096168518066406, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -34.0673942565918, "logits_per_token": -3.387021064758301, "logits_per_char": -1.231644023548473, "num_chars": 22}, {"sum_logits": -21.137557983398438, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.38178253173828, "logits_per_token": -2.6421947479248047, "logits_per_char": -1.0065503801618303, "num_chars": 21}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 579, "native_id": "MDSA_2013_8_20", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.837926864624023, "incorrect_loss_raw": 17.863101323445637, "correct_loss_per_char": 0.6269114535787831, "incorrect_loss_per_char": 0.5034495776951473, "correct_loss_per_token": 2.883792686462402, "incorrect_loss_per_token": 2.521153934418209, "correct_loss_uncond": -13.077978134155273, "incorrect_loss_uncond": -13.504298528035482}, "model_output": [{"sum_logits": -18.266218185424805, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -29.2255802154541, "logits_per_token": -2.6094597407749722, "logits_per_char": -0.6298695926008553, "num_chars": 29}, {"sum_logits": -12.92679214477539, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -28.14006233215332, "logits_per_token": -2.154465357462565, "logits_per_char": -0.40396225452423096, "num_chars": 32}, {"sum_logits": -28.837926864624023, "num_tokens": 10, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -41.9159049987793, "logits_per_token": -2.883792686462402, "logits_per_char": -0.6269114535787831, "num_chars": 46}, {"sum_logits": -22.39629364013672, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -36.73655700683594, "logits_per_token": -2.79953670501709, "logits_per_char": -0.47651688596035574, "num_chars": 47}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 580, "native_id": "Mercury_7114100", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.742353439331055, "incorrect_loss_raw": 10.957366307576498, "correct_loss_per_char": 0.4550840514046805, "incorrect_loss_per_char": 0.4385802552431417, "correct_loss_per_token": 2.1237255732218423, "incorrect_loss_per_token": 1.881613614824083, "correct_loss_uncond": -15.845636367797852, "incorrect_loss_uncond": -16.664404551188152}, "model_output": [{"sum_logits": -12.742353439331055, "num_tokens": 6, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -28.587989807128906, "logits_per_token": -2.1237255732218423, "logits_per_char": -0.4550840514046805, "num_chars": 28}, {"sum_logits": -13.578957557678223, "num_tokens": 6, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -29.616228103637695, "logits_per_token": -2.2631595929463706, "logits_per_char": -0.5903894590294879, "num_chars": 23}, {"sum_logits": -10.946568489074707, "num_tokens": 5, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -26.22506332397461, "logits_per_token": -2.1893136978149412, "logits_per_char": -0.4561070203781128, "num_chars": 24}, {"sum_logits": -8.346572875976562, "num_tokens": 7, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -27.02402114868164, "logits_per_token": -1.1923675537109375, "logits_per_char": -0.2692442863218246, "num_chars": 31}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 581, "native_id": "Mercury_7213343", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 11.09874439239502, "incorrect_loss_raw": 21.852793375651043, "correct_loss_per_char": 0.2581103347068609, "incorrect_loss_per_char": 0.4903465812226646, "correct_loss_per_token": 1.8497907320658367, "incorrect_loss_per_token": 2.990326337461118, "correct_loss_uncond": -23.34315776824951, "incorrect_loss_uncond": -19.67487335205078}, "model_output": [{"sum_logits": -11.09874439239502, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -34.44190216064453, "logits_per_token": -1.8497907320658367, "logits_per_char": -0.2581103347068609, "num_chars": 43}, {"sum_logits": -22.266891479492188, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -38.395172119140625, "logits_per_token": -2.47409905327691, "logits_per_char": -0.44533782958984375, "num_chars": 50}, {"sum_logits": -21.677310943603516, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -45.060447692871094, "logits_per_token": -4.335462188720703, "logits_per_char": -0.5558284857334235, "num_chars": 39}, {"sum_logits": -21.614177703857422, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -41.12738037109375, "logits_per_token": -2.161417770385742, "logits_per_char": -0.46987342834472656, "num_chars": 46}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 582, "native_id": "Mercury_SC_LBS10597", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.257962226867676, "incorrect_loss_raw": 15.919336001078287, "correct_loss_per_char": 0.8041401590619769, "incorrect_loss_per_char": 0.7656167028005597, "correct_loss_per_token": 3.7526540756225586, "incorrect_loss_per_token": 4.337008094787598, "correct_loss_uncond": -8.192662239074707, "incorrect_loss_uncond": -7.498801549275716}, "model_output": [{"sum_logits": -11.257962226867676, "num_tokens": 3, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -19.450624465942383, "logits_per_token": -3.7526540756225586, "logits_per_char": -0.8041401590619769, "num_chars": 14}, {"sum_logits": -7.8983564376831055, "num_tokens": 2, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -14.937764167785645, "logits_per_token": -3.9491782188415527, "logits_per_char": -0.5641683169773647, "num_chars": 14}, {"sum_logits": -21.798315048217773, "num_tokens": 4, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -27.76517105102539, "logits_per_token": -5.449578762054443, "logits_per_char": -1.0380150022960843, "num_chars": 21}, {"sum_logits": -18.061336517333984, "num_tokens": 5, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -27.551477432250977, "logits_per_token": -3.6122673034667967, "logits_per_char": -0.6946667891282302, "num_chars": 26}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 583, "native_id": "Mercury_7126263", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.449745178222656, "incorrect_loss_raw": 17.316588401794434, "correct_loss_per_char": 0.42915958828396267, "incorrect_loss_per_char": 0.46700986648878734, "correct_loss_per_token": 3.089949035644531, "incorrect_loss_per_token": 3.463317680358887, "correct_loss_uncond": -16.205013275146484, "incorrect_loss_uncond": -11.0078706741333}, "model_output": [{"sum_logits": -19.25489044189453, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -29.515533447265625, "logits_per_token": -3.8509780883789064, "logits_per_char": -0.5663203071145451, "num_chars": 34}, {"sum_logits": -15.449745178222656, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -31.65475845336914, "logits_per_token": -3.089949035644531, "logits_per_char": -0.42915958828396267, "num_chars": 36}, {"sum_logits": -19.35727882385254, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -30.98841667175293, "logits_per_token": -3.871455764770508, "logits_per_char": -0.5094020743119089, "num_chars": 38}, {"sum_logits": -13.33759593963623, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -24.46942710876465, "logits_per_token": -2.667519187927246, "logits_per_char": -0.3253072180399081, "num_chars": 41}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 584, "native_id": "Mercury_7133613", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 23.780763626098633, "incorrect_loss_raw": 24.50153350830078, "correct_loss_per_char": 0.5284614139133029, "incorrect_loss_per_char": 0.6236912026222575, "correct_loss_per_token": 2.972595453262329, "incorrect_loss_per_token": 2.920654279718954, "correct_loss_uncond": -27.778467178344727, "incorrect_loss_uncond": -13.828652699788412}, "model_output": [{"sum_logits": -23.780763626098633, "num_tokens": 8, "num_tokens_all": 273, "is_greedy": false, "sum_logits_uncond": -51.55923080444336, "logits_per_token": -2.972595453262329, "logits_per_char": -0.5284614139133029, "num_chars": 45}, {"sum_logits": -25.288288116455078, "num_tokens": 7, "num_tokens_all": 272, "is_greedy": false, "sum_logits_uncond": -37.57426452636719, "logits_per_token": -3.6126125880650113, "logits_per_char": -0.6167875150354897, "num_chars": 41}, {"sum_logits": -18.721601486206055, "num_tokens": 10, "num_tokens_all": 275, "is_greedy": false, "sum_logits_uncond": -40.31623840332031, "logits_per_token": -1.8721601486206054, "logits_per_char": -0.5349028996058873, "num_chars": 35}, {"sum_logits": -29.49471092224121, "num_tokens": 9, "num_tokens_all": 274, "is_greedy": false, "sum_logits_uncond": -37.10005569458008, "logits_per_token": -3.2771901024712458, "logits_per_char": -0.7193831932253953, "num_chars": 41}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 585, "native_id": "Mercury_7234605", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.90362548828125, "incorrect_loss_raw": 12.625569025675455, "correct_loss_per_char": 0.4971654555376838, "incorrect_loss_per_char": 0.45541850013687335, "correct_loss_per_token": 2.8172709147135415, "incorrect_loss_per_token": 2.331629604763455, "correct_loss_uncond": -12.246129989624023, "incorrect_loss_uncond": -13.276021321614584}, "model_output": [{"sum_logits": -10.072837829589844, "num_tokens": 5, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -23.736989974975586, "logits_per_token": -2.014567565917969, "logits_per_char": -0.41970157623291016, "num_chars": 24}, {"sum_logits": -10.390291213989258, "num_tokens": 5, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -24.193843841552734, "logits_per_token": -2.0780582427978516, "logits_per_char": -0.38482560051812065, "num_chars": 27}, {"sum_logits": -17.413578033447266, "num_tokens": 6, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -29.773937225341797, "logits_per_token": -2.9022630055745444, "logits_per_char": -0.5617283236595892, "num_chars": 31}, {"sum_logits": -16.90362548828125, "num_tokens": 6, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -29.149755477905273, "logits_per_token": -2.8172709147135415, "logits_per_char": -0.4971654555376838, "num_chars": 34}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 586, "native_id": "Mercury_SC_400839", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.14099884033203, "incorrect_loss_raw": 21.11971950531006, "correct_loss_per_char": 0.6718888459382234, "incorrect_loss_per_char": 0.7505199779246041, "correct_loss_per_token": 3.6281997680664064, "incorrect_loss_per_token": 4.44125952720642, "correct_loss_uncond": -16.644222259521484, "incorrect_loss_uncond": -8.458523750305176}, "model_output": [{"sum_logits": -13.03893756866455, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -26.347068786621094, "logits_per_token": -3.2597343921661377, "logits_per_char": -0.5014975987947904, "num_chars": 26}, {"sum_logits": -18.14099884033203, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -34.785221099853516, "logits_per_token": -3.6281997680664064, "logits_per_char": -0.6718888459382234, "num_chars": 27}, {"sum_logits": -30.685379028320312, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -33.561622619628906, "logits_per_token": -6.137075805664063, "logits_per_char": -1.0228459676106771, "num_chars": 30}, {"sum_logits": -19.634841918945312, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -28.826038360595703, "logits_per_token": -3.9269683837890623, "logits_per_char": -0.7272163673683449, "num_chars": 27}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 587, "native_id": "Mercury_SC_402984", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.35214614868164, "incorrect_loss_raw": 13.222585360209147, "correct_loss_per_char": 0.630674786037869, "incorrect_loss_per_char": 0.5815178068007655, "correct_loss_per_token": 2.83803653717041, "incorrect_loss_per_token": 3.1552019497704884, "correct_loss_uncond": -9.776041030883789, "incorrect_loss_uncond": -9.790791511535645}, "model_output": [{"sum_logits": -9.739243507385254, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -20.767431259155273, "logits_per_token": -4.869621753692627, "logits_per_char": -0.7491725774911734, "num_chars": 13}, {"sum_logits": -11.35214614868164, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -21.12818717956543, "logits_per_token": -2.83803653717041, "logits_per_char": -0.630674786037869, "num_chars": 18}, {"sum_logits": -13.460256576538086, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -20.1380672454834, "logits_per_token": -2.243376096089681, "logits_per_char": -0.4641467785013133, "num_chars": 29}, {"sum_logits": -16.4682559967041, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.134632110595703, "logits_per_token": -2.3526079995291576, "logits_per_char": -0.5312340644098097, "num_chars": 31}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 588, "native_id": "NYSEDREGENTS_2012_4_29", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 23.635135650634766, "incorrect_loss_raw": 20.777167002360027, "correct_loss_per_char": 0.5028752266092503, "incorrect_loss_per_char": 0.4641655270129362, "correct_loss_per_token": 2.1486486955122515, "incorrect_loss_per_token": 1.996935780843099, "correct_loss_uncond": -17.336265563964844, "incorrect_loss_uncond": -11.46708615620931}, "model_output": [{"sum_logits": -18.51169204711914, "num_tokens": 10, "num_tokens_all": 262, "is_greedy": false, "sum_logits_uncond": -28.402694702148438, "logits_per_token": -1.8511692047119142, "logits_per_char": -0.44075457255045575, "num_chars": 42}, {"sum_logits": -17.162105560302734, "num_tokens": 10, "num_tokens_all": 262, "is_greedy": false, "sum_logits_uncond": -28.381181716918945, "logits_per_token": -1.7162105560302734, "logits_per_char": -0.41858794049518866, "num_chars": 41}, {"sum_logits": -23.635135650634766, "num_tokens": 11, "num_tokens_all": 263, "is_greedy": false, "sum_logits_uncond": -40.97140121459961, "logits_per_token": -2.1486486955122515, "logits_per_char": -0.5028752266092503, "num_chars": 47}, {"sum_logits": -26.657703399658203, "num_tokens": 11, "num_tokens_all": 263, "is_greedy": false, "sum_logits_uncond": -39.948883056640625, "logits_per_token": -2.4234275817871094, "logits_per_char": -0.5331540679931641, "num_chars": 50}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 589, "native_id": "VASoL_2009_3_22", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.547906875610352, "incorrect_loss_raw": 12.44143549601237, "correct_loss_per_char": 0.7534219196864537, "incorrect_loss_per_char": 0.9844089841085767, "correct_loss_per_token": 2.636976718902588, "incorrect_loss_per_token": 3.1103588740030923, "correct_loss_uncond": -7.884479522705078, "incorrect_loss_uncond": -4.796156565348308}, "model_output": [{"sum_logits": -11.53935432434082, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -17.584726333618164, "logits_per_token": -2.884838581085205, "logits_per_char": -0.961612860361735, "num_chars": 12}, {"sum_logits": -12.585870742797852, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -16.26430892944336, "logits_per_token": -3.146467685699463, "logits_per_char": -1.048822561899821, "num_chars": 12}, {"sum_logits": -13.199081420898438, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -17.863740921020508, "logits_per_token": -3.2997703552246094, "logits_per_char": -0.9427915300641742, "num_chars": 14}, {"sum_logits": -10.547906875610352, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -18.43238639831543, "logits_per_token": -2.636976718902588, "logits_per_char": -0.7534219196864537, "num_chars": 14}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 590, "native_id": "Mercury_409349", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 10.850669860839844, "incorrect_loss_raw": 17.83518123626709, "correct_loss_per_char": 0.5710878874126234, "incorrect_loss_per_char": 0.5983362883875629, "correct_loss_per_token": 2.712667465209961, "incorrect_loss_per_token": 3.4894231160481772, "correct_loss_uncond": -12.904199600219727, "incorrect_loss_uncond": -9.592442512512207}, "model_output": [{"sum_logits": -10.850669860839844, "num_tokens": 4, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -23.75486946105957, "logits_per_token": -2.712667465209961, "logits_per_char": -0.5710878874126234, "num_chars": 19}, {"sum_logits": -9.304072380065918, "num_tokens": 3, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -18.219568252563477, "logits_per_token": -3.1013574600219727, "logits_per_char": -0.4652036190032959, "num_chars": 20}, {"sum_logits": -18.12979507446289, "num_tokens": 6, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -27.536787033081055, "logits_per_token": -3.021632512410482, "logits_per_char": -0.6251653473952721, "num_chars": 29}, {"sum_logits": -26.07167625427246, "num_tokens": 6, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -36.52651596069336, "logits_per_token": -4.345279375712077, "logits_per_char": -0.7046398987641206, "num_chars": 37}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 591, "native_id": "Mercury_SC_407417", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.774307250976562, "incorrect_loss_raw": 8.1597847143809, "correct_loss_per_char": 0.5849538167317708, "incorrect_loss_per_char": 0.5051568123398634, "correct_loss_per_token": 4.387153625488281, "incorrect_loss_per_token": 2.3839662869771323, "correct_loss_uncond": -12.597867965698242, "incorrect_loss_uncond": -11.496302525202433}, "model_output": [{"sum_logits": -3.7586376667022705, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -17.223793029785156, "logits_per_token": -1.25287922223409, "logits_per_char": -0.26847411905016216, "num_chars": 14}, {"sum_logits": -8.626086235046387, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -19.207141876220703, "logits_per_token": -2.8753620783487954, "logits_per_char": -0.5750724156697591, "num_chars": 15}, {"sum_logits": -12.094630241394043, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -22.53732681274414, "logits_per_token": -3.0236575603485107, "logits_per_char": -0.671923902299669, "num_chars": 18}, {"sum_logits": -8.774307250976562, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -21.372175216674805, "logits_per_token": -4.387153625488281, "logits_per_char": -0.5849538167317708, "num_chars": 15}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 592, "native_id": "VASoL_2007_5_21", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.427396774291992, "incorrect_loss_raw": 9.482122103373209, "correct_loss_per_char": 1.053424596786499, "incorrect_loss_per_char": 1.1487559515332417, "correct_loss_per_token": 8.427396774291992, "incorrect_loss_per_token": 3.6604382197062173, "correct_loss_uncond": -2.840007781982422, "incorrect_loss_uncond": -4.422849655151367}, "model_output": [{"sum_logits": -7.241552352905273, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -11.898371696472168, "logits_per_token": -2.413850784301758, "logits_per_char": -0.8046169281005859, "num_chars": 9}, {"sum_logits": -12.2096586227417, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -16.975549697875977, "logits_per_token": -4.069886207580566, "logits_per_char": -1.3566287358601887, "num_chars": 9}, {"sum_logits": -8.427396774291992, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -11.267404556274414, "logits_per_token": -8.427396774291992, "logits_per_char": -1.053424596786499, "num_chars": 8}, {"sum_logits": -8.995155334472656, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -12.840993881225586, "logits_per_token": -4.497577667236328, "logits_per_char": -1.285022190638951, "num_chars": 7}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 593, "native_id": "MCAS_2012_8_23651", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.554859161376953, "incorrect_loss_raw": 6.432221094767253, "correct_loss_per_char": 0.4272968585674579, "incorrect_loss_per_char": 0.5639833719302446, "correct_loss_per_token": 5.554859161376953, "incorrect_loss_per_token": 5.355439186096191, "correct_loss_uncond": -10.266521453857422, "incorrect_loss_uncond": -7.845673243204753}, "model_output": [{"sum_logits": -5.554859161376953, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -15.821380615234375, "logits_per_token": -5.554859161376953, "logits_per_char": -0.4272968585674579, "num_chars": 13}, {"sum_logits": -5.317399978637695, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.257400512695312, "logits_per_token": -5.317399978637695, "logits_per_char": -0.4431166648864746, "num_chars": 12}, {"sum_logits": -7.518571853637695, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -13.012836456298828, "logits_per_token": -7.518571853637695, "logits_per_char": -0.7518571853637696, "num_chars": 10}, {"sum_logits": -6.460691452026367, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -15.563446044921875, "logits_per_token": -3.2303457260131836, "logits_per_char": -0.4969762655404898, "num_chars": 13}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 594, "native_id": "MCAS_2000_4_26", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.928922653198242, "incorrect_loss_raw": 18.634594917297363, "correct_loss_per_char": 0.4548933798806709, "incorrect_loss_per_char": 0.4997592236825615, "correct_loss_per_token": 2.3571747866543857, "incorrect_loss_per_token": 2.2856832796546276, "correct_loss_uncond": -23.259798049926758, "incorrect_loss_uncond": -19.69593334197998}, "model_output": [{"sum_logits": -19.799976348876953, "num_tokens": 7, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -33.57280731201172, "logits_per_token": -2.8285680498395647, "logits_per_char": -0.682757805133688, "num_chars": 29}, {"sum_logits": -13.942675590515137, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -35.72378158569336, "logits_per_token": -2.323779265085856, "logits_per_char": -0.4647558530171712, "num_chars": 30}, {"sum_logits": -22.1611328125, "num_tokens": 13, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -45.69499588012695, "logits_per_token": -1.7047025240384615, "logits_per_char": -0.3517640128968254, "num_chars": 63}, {"sum_logits": -25.928922653198242, "num_tokens": 11, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -49.188720703125, "logits_per_token": -2.3571747866543857, "logits_per_char": -0.4548933798806709, "num_chars": 57}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 595, "native_id": "Mercury_SC_410971", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.297191619873047, "incorrect_loss_raw": 24.210981369018555, "correct_loss_per_char": 0.7085735486901324, "incorrect_loss_per_char": 0.6685416574237727, "correct_loss_per_token": 3.259438323974609, "incorrect_loss_per_token": 3.202355611891974, "correct_loss_uncond": -11.92241096496582, "incorrect_loss_uncond": -8.361040115356445}, "model_output": [{"sum_logits": -16.297191619873047, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -28.219602584838867, "logits_per_token": -3.259438323974609, "logits_per_char": -0.7085735486901324, "num_chars": 23}, {"sum_logits": -20.132600784301758, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -29.994102478027344, "logits_per_token": -2.8760858263288225, "logits_per_char": -0.5921353171853458, "num_chars": 34}, {"sum_logits": -28.274700164794922, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -34.216434478759766, "logits_per_token": -4.039242880684989, "logits_per_char": -0.8078485761369978, "num_chars": 35}, {"sum_logits": -24.225643157958984, "num_tokens": 9, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -33.50552749633789, "logits_per_token": -2.6917381286621094, "logits_per_char": -0.6056410789489746, "num_chars": 40}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 596, "native_id": "Mercury_404841", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 31.56537437438965, "incorrect_loss_raw": 30.47022819519043, "correct_loss_per_char": 1.0521791458129883, "incorrect_loss_per_char": 1.0156742731730144, "correct_loss_per_token": 4.5093391963413785, "incorrect_loss_per_token": 4.352889742170061, "correct_loss_uncond": -12.993692398071289, "incorrect_loss_uncond": -13.735644658406576}, "model_output": [{"sum_logits": -28.65151596069336, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -43.12389373779297, "logits_per_token": -4.09307370867048, "logits_per_char": -0.955050532023112, "num_chars": 30}, {"sum_logits": -30.955549240112305, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -44.89507293701172, "logits_per_token": -4.422221320016043, "logits_per_char": -1.0318516413370769, "num_chars": 30}, {"sum_logits": -31.803619384765625, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -44.59865188598633, "logits_per_token": -4.543374197823661, "logits_per_char": -1.0601206461588542, "num_chars": 30}, {"sum_logits": -31.56537437438965, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -44.55906677246094, "logits_per_token": -4.5093391963413785, "logits_per_char": -1.0521791458129883, "num_chars": 30}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 597, "native_id": "Mercury_416651", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.620904922485352, "incorrect_loss_raw": 6.11159086227417, "correct_loss_per_char": 1.8029864174979073, "incorrect_loss_per_char": 0.7190631695123978, "correct_loss_per_token": 4.206968307495117, "incorrect_loss_per_token": 1.810415373908149, "correct_loss_uncond": -5.244117736816406, "incorrect_loss_uncond": -11.280387083689371}, "model_output": [{"sum_logits": -12.620904922485352, "num_tokens": 3, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -17.865022659301758, "logits_per_token": -4.206968307495117, "logits_per_char": -1.8029864174979073, "num_chars": 7}, {"sum_logits": -4.08233118057251, "num_tokens": 3, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -16.281841278076172, "logits_per_token": -1.3607770601908367, "logits_per_char": -0.5831901686532157, "num_chars": 7}, {"sum_logits": -8.16413688659668, "num_tokens": 4, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -18.686491012573242, "logits_per_token": -2.04103422164917, "logits_per_char": -1.020517110824585, "num_chars": 8}, {"sum_logits": -6.08830451965332, "num_tokens": 3, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -17.20760154724121, "logits_per_token": -2.02943483988444, "logits_per_char": -0.5534822290593927, "num_chars": 11}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 598, "native_id": "Mercury_416576", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 21.656810760498047, "incorrect_loss_raw": 23.135251998901367, "correct_loss_per_char": 0.4164771300095778, "incorrect_loss_per_char": 0.4360324004700214, "correct_loss_per_token": 1.6659085200383112, "incorrect_loss_per_token": 1.5920145511627197, "correct_loss_uncond": -15.944026947021484, "incorrect_loss_uncond": -17.08433214823405}, "model_output": [{"sum_logits": -21.032827377319336, "num_tokens": 12, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -40.621429443359375, "logits_per_token": -1.7527356147766113, "logits_per_char": -0.4292413750473334, "num_chars": 49}, {"sum_logits": -21.656810760498047, "num_tokens": 13, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -37.60083770751953, "logits_per_token": -1.6659085200383112, "logits_per_char": -0.4164771300095778, "num_chars": 52}, {"sum_logits": -22.760936737060547, "num_tokens": 16, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -42.77983093261719, "logits_per_token": -1.4225585460662842, "logits_per_char": -0.4214988284640842, "num_chars": 54}, {"sum_logits": -25.61199188232422, "num_tokens": 16, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -37.25749206542969, "logits_per_token": -1.6007494926452637, "logits_per_char": -0.45735699789864676, "num_chars": 56}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 599, "native_id": "MCAS_1998_8_24", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 28.732154846191406, "incorrect_loss_raw": 26.678260167439777, "correct_loss_per_char": 0.29929327964782715, "incorrect_loss_per_char": 0.4139675788249169, "correct_loss_per_token": 1.690126755658318, "incorrect_loss_per_token": 2.316650286738909, "correct_loss_uncond": -19.123306274414062, "incorrect_loss_uncond": -15.710551579793295}, "model_output": [{"sum_logits": -16.259464263916016, "num_tokens": 7, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -35.078861236572266, "logits_per_token": -2.3227806091308594, "logits_per_char": -0.3387388388315837, "num_chars": 48}, {"sum_logits": -47.027915954589844, "num_tokens": 17, "num_tokens_all": 234, "is_greedy": false, "sum_logits_uncond": -59.548683166503906, "logits_per_token": -2.7663479973288143, "logits_per_char": -0.5468362320301144, "num_chars": 86}, {"sum_logits": -16.747400283813477, "num_tokens": 9, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -32.53889083862305, "logits_per_token": -1.860822253757053, "logits_per_char": -0.3563276656130527, "num_chars": 47}, {"sum_logits": -28.732154846191406, "num_tokens": 17, "num_tokens_all": 234, "is_greedy": false, "sum_logits_uncond": -47.85546112060547, "logits_per_token": -1.690126755658318, "logits_per_char": -0.29929327964782715, "num_chars": 96}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 600, "native_id": "Mercury_SC_408367", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.307867050170898, "incorrect_loss_raw": 13.3947966893514, "correct_loss_per_char": 1.0205244700113931, "incorrect_loss_per_char": 0.8994647796980925, "correct_loss_per_token": 5.102622350056966, "incorrect_loss_per_token": 4.10338905122545, "correct_loss_uncond": -8.217012405395508, "incorrect_loss_uncond": -8.848392804463705}, "model_output": [{"sum_logits": -13.237595558166504, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -20.68524169921875, "logits_per_token": -4.412531852722168, "logits_per_char": -1.0182765813974233, "num_chars": 13}, {"sum_logits": -15.307867050170898, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -23.524879455566406, "logits_per_token": -5.102622350056966, "logits_per_char": -1.0205244700113931, "num_chars": 15}, {"sum_logits": -13.93124008178711, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -19.019615173339844, "logits_per_token": -4.643746693929036, "logits_per_char": -0.9950885772705078, "num_chars": 14}, {"sum_logits": -13.015554428100586, "num_tokens": 4, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -27.02471160888672, "logits_per_token": -3.2538886070251465, "logits_per_char": -0.6850291804263466, "num_chars": 19}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 601, "native_id": "Mercury_405804", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 2.795156478881836, "incorrect_loss_raw": 4.18875773747762, "correct_loss_per_char": 0.46585941314697266, "incorrect_loss_per_char": 0.664942643377516, "correct_loss_per_token": 2.795156478881836, "incorrect_loss_per_token": 3.2480316162109375, "correct_loss_uncond": -7.093837738037109, "incorrect_loss_uncond": -8.733040968577066}, "model_output": [{"sum_logits": -2.801631450653076, "num_tokens": 1, "num_tokens_all": 173, "is_greedy": false, "sum_logits_uncond": -11.566265106201172, "logits_per_token": -2.801631450653076, "logits_per_char": -0.700407862663269, "num_chars": 4}, {"sum_logits": -4.1202850341796875, "num_tokens": 1, "num_tokens_all": 173, "is_greedy": false, "sum_logits_uncond": -12.463665008544922, "logits_per_token": -4.1202850341796875, "logits_per_char": -0.8240570068359375, "num_chars": 5}, {"sum_logits": -5.644356727600098, "num_tokens": 2, "num_tokens_all": 174, "is_greedy": false, "sum_logits_uncond": -14.735466003417969, "logits_per_token": -2.822178363800049, "logits_per_char": -0.4703630606333415, "num_chars": 12}, {"sum_logits": -2.795156478881836, "num_tokens": 1, "num_tokens_all": 173, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -2.795156478881836, "logits_per_char": -0.46585941314697266, "num_chars": 6}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 602, "native_id": "Mercury_7216318", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.218097686767578, "incorrect_loss_raw": 13.380838394165039, "correct_loss_per_char": 0.4367749474265359, "incorrect_loss_per_char": 0.29715750890232745, "correct_loss_per_token": 2.4022622108459473, "incorrect_loss_per_token": 1.4867598215738933, "correct_loss_uncond": -19.111587524414062, "incorrect_loss_uncond": -21.6317195892334}, "model_output": [{"sum_logits": -19.218097686767578, "num_tokens": 8, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -38.32968521118164, "logits_per_token": -2.4022622108459473, "logits_per_char": -0.4367749474265359, "num_chars": 44}, {"sum_logits": -15.754430770874023, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -33.34638214111328, "logits_per_token": -1.7504923078748915, "logits_per_char": -0.36638211095055867, "num_chars": 43}, {"sum_logits": -11.803844451904297, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -37.67982482910156, "logits_per_token": -1.3115382724338107, "logits_per_char": -0.268269192088734, "num_chars": 44}, {"sum_logits": -12.584239959716797, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -34.01146697998047, "logits_per_token": -1.3982488844129775, "logits_per_char": -0.2568212236676897, "num_chars": 49}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 603, "native_id": "Mercury_401312", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 19.45127296447754, "incorrect_loss_raw": 22.60460090637207, "correct_loss_per_char": 0.47442129181652537, "incorrect_loss_per_char": 0.7257467326487453, "correct_loss_per_token": 3.2418788274129233, "incorrect_loss_per_token": 3.8047705998496397, "correct_loss_uncond": -21.30965232849121, "incorrect_loss_uncond": -12.469817479451498}, "model_output": [{"sum_logits": -21.011531829833984, "num_tokens": 5, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -31.282337188720703, "logits_per_token": -4.2023063659667965, "logits_per_char": -0.7782048825864438, "num_chars": 27}, {"sum_logits": -22.09060287475586, "num_tokens": 6, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -36.853477478027344, "logits_per_token": -3.681767145792643, "logits_per_char": -0.71260009273406, "num_chars": 31}, {"sum_logits": -24.711668014526367, "num_tokens": 7, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -37.087440490722656, "logits_per_token": -3.530238287789481, "logits_per_char": -0.6864352226257324, "num_chars": 36}, {"sum_logits": -19.45127296447754, "num_tokens": 6, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -40.76092529296875, "logits_per_token": -3.2418788274129233, "logits_per_char": -0.47442129181652537, "num_chars": 41}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 604, "native_id": "MDSA_2013_8_23", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 19.62872314453125, "incorrect_loss_raw": 25.257666269938152, "correct_loss_per_char": 0.363494873046875, "incorrect_loss_per_char": 0.5407736663843526, "correct_loss_per_token": 2.18096923828125, "incorrect_loss_per_token": 3.3016529587841537, "correct_loss_uncond": -23.71866226196289, "incorrect_loss_uncond": -21.693180084228516}, "model_output": [{"sum_logits": -23.80507469177246, "num_tokens": 7, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -43.954612731933594, "logits_per_token": -3.4007249559674944, "logits_per_char": -0.5536063881807549, "num_chars": 43}, {"sum_logits": -22.995634078979492, "num_tokens": 7, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -42.622535705566406, "logits_per_token": -3.285090582711356, "logits_per_char": -0.489268810191053, "num_chars": 47}, {"sum_logits": -28.9722900390625, "num_tokens": 9, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -54.275390625, "logits_per_token": -3.219143337673611, "logits_per_char": -0.57944580078125, "num_chars": 50}, {"sum_logits": -19.62872314453125, "num_tokens": 9, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -43.34738540649414, "logits_per_token": -2.18096923828125, "logits_per_char": -0.363494873046875, "num_chars": 54}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 605, "native_id": "Mercury_SC_405880", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.463521957397461, "incorrect_loss_raw": 13.601833979288736, "correct_loss_per_char": 0.5507116819682875, "incorrect_loss_per_char": 0.43760815159105154, "correct_loss_per_token": 2.0927043914794923, "incorrect_loss_per_token": 2.1472726549421037, "correct_loss_uncond": -12.564664840698242, "incorrect_loss_uncond": -11.582462310791016}, "model_output": [{"sum_logits": -10.463521957397461, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -23.028186798095703, "logits_per_token": -2.0927043914794923, "logits_per_char": -0.5507116819682875, "num_chars": 19}, {"sum_logits": -11.329672813415527, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -22.962919235229492, "logits_per_token": -1.8882788022359211, "logits_per_char": -0.4196175116079825, "num_chars": 27}, {"sum_logits": -14.393670082092285, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -27.617109298706055, "logits_per_token": -2.3989450136820474, "logits_per_char": -0.4361718206694632, "num_chars": 33}, {"sum_logits": -15.082159042358398, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -24.97286033630371, "logits_per_token": -2.1545941489083424, "logits_per_char": -0.45703512249570905, "num_chars": 33}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 606, "native_id": "ACTAAP_2009_5_12", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.346402645111084, "incorrect_loss_raw": 4.18382469813029, "correct_loss_per_char": 0.7244004408518473, "incorrect_loss_per_char": 0.7532487407563225, "correct_loss_per_token": 4.346402645111084, "incorrect_loss_per_token": 4.18382469813029, "correct_loss_uncond": -3.510714530944824, "incorrect_loss_uncond": -4.361130237579346}, "model_output": [{"sum_logits": -3.556973457336426, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -7.943788528442383, "logits_per_token": -3.556973457336426, "logits_per_char": -0.8892433643341064, "num_chars": 4}, {"sum_logits": -4.346402645111084, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -7.857117176055908, "logits_per_token": -4.346402645111084, "logits_per_char": -0.7244004408518473, "num_chars": 6}, {"sum_logits": -3.5941162109375, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -8.293561935424805, "logits_per_token": -3.5941162109375, "logits_per_char": -0.5990193684895834, "num_chars": 6}, {"sum_logits": -5.400384426116943, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -9.397514343261719, "logits_per_token": -5.400384426116943, "logits_per_char": -0.7714834894452777, "num_chars": 7}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 607, "native_id": "CSZ20754", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.815611362457275, "incorrect_loss_raw": 5.372401714324951, "correct_loss_per_char": 0.6012008740351751, "incorrect_loss_per_char": 0.402884031210293, "correct_loss_per_token": 3.9078056812286377, "incorrect_loss_per_token": 2.6862008571624756, "correct_loss_uncond": -5.408448696136475, "incorrect_loss_uncond": -8.579581419626871}, "model_output": [{"sum_logits": -4.927105903625488, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -12.765954971313477, "logits_per_token": -2.463552951812744, "logits_per_char": -0.49271059036254883, "num_chars": 10}, {"sum_logits": -7.815611362457275, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -13.22406005859375, "logits_per_token": -3.9078056812286377, "logits_per_char": -0.6012008740351751, "num_chars": 13}, {"sum_logits": -4.5775628089904785, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -14.310808181762695, "logits_per_token": -2.2887814044952393, "logits_per_char": -0.3269687720707485, "num_chars": 14}, {"sum_logits": -6.612536430358887, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -14.779186248779297, "logits_per_token": -3.3062682151794434, "logits_per_char": -0.3889727311975816, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 608, "native_id": "Mercury_184363", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 29.25912094116211, "incorrect_loss_raw": 31.12714385986328, "correct_loss_per_char": 0.6225344881098321, "incorrect_loss_per_char": 0.7005542962126029, "correct_loss_per_token": 3.251013437906901, "incorrect_loss_per_token": 3.304757888228805, "correct_loss_uncond": -3.596546173095703, "incorrect_loss_uncond": -5.109862009684245}, "model_output": [{"sum_logits": -25.658065795898438, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -30.610389709472656, "logits_per_token": -2.850896199544271, "logits_per_char": -0.5966992045557776, "num_chars": 43}, {"sum_logits": -26.193679809570312, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -35.03450393676758, "logits_per_token": -2.910408867730035, "logits_per_char": -0.582081773546007, "num_chars": 45}, {"sum_logits": -41.529685974121094, "num_tokens": 10, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -43.066123962402344, "logits_per_token": -4.1529685974121096, "logits_per_char": -0.9228819105360243, "num_chars": 45}, {"sum_logits": -29.25912094116211, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -32.85566711425781, "logits_per_token": -3.251013437906901, "logits_per_char": -0.6225344881098321, "num_chars": 47}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 609, "native_id": "Mercury_7188195", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.9663000106811523, "incorrect_loss_raw": 4.011878331502278, "correct_loss_per_char": 0.36057272824374115, "incorrect_loss_per_char": 0.38450802629150216, "correct_loss_per_token": 1.9831500053405762, "incorrect_loss_per_token": 2.005939165751139, "correct_loss_uncond": -10.572659492492676, "incorrect_loss_uncond": -10.190603892008463}, "model_output": [{"sum_logits": -3.514798641204834, "num_tokens": 2, "num_tokens_all": 228, "is_greedy": true, "sum_logits_uncond": -14.320663452148438, "logits_per_token": -1.757399320602417, "logits_per_char": -0.3905331823560927, "num_chars": 9}, {"sum_logits": -4.66015100479126, "num_tokens": 2, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -12.36876106262207, "logits_per_token": -2.33007550239563, "logits_per_char": -0.466015100479126, "num_chars": 10}, {"sum_logits": -3.9663000106811523, "num_tokens": 2, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -14.538959503173828, "logits_per_token": -1.9831500053405762, "logits_per_char": -0.36057272824374115, "num_chars": 11}, {"sum_logits": -3.860685348510742, "num_tokens": 2, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -15.918022155761719, "logits_per_token": -1.930342674255371, "logits_per_char": -0.29697579603928786, "num_chars": 13}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 610, "native_id": "Mercury_7221043", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 21.880176544189453, "incorrect_loss_raw": 29.971355438232422, "correct_loss_per_char": 0.37085044990151617, "incorrect_loss_per_char": 0.5674878080031301, "correct_loss_per_token": 2.4311307271321616, "incorrect_loss_per_token": 3.1261674103913486, "correct_loss_uncond": -19.016860961914062, "incorrect_loss_uncond": -12.435203552246094}, "model_output": [{"sum_logits": -29.603696823120117, "num_tokens": 10, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -41.60106658935547, "logits_per_token": -2.960369682312012, "logits_per_char": -0.6298658898536195, "num_chars": 47}, {"sum_logits": -25.471765518188477, "num_tokens": 10, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -38.561546325683594, "logits_per_token": -2.547176551818848, "logits_per_char": -0.43916837100324957, "num_chars": 58}, {"sum_logits": -34.83860397338867, "num_tokens": 9, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -47.057064056396484, "logits_per_token": -3.8709559970431857, "logits_per_char": -0.6334291631525213, "num_chars": 55}, {"sum_logits": -21.880176544189453, "num_tokens": 9, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -40.897037506103516, "logits_per_token": -2.4311307271321616, "logits_per_char": -0.37085044990151617, "num_chars": 59}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 611, "native_id": "Mercury_7107328", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 10.764298439025879, "incorrect_loss_raw": 18.353710810343426, "correct_loss_per_char": 0.38443922996520996, "incorrect_loss_per_char": 0.5779655889925436, "correct_loss_per_token": 1.5377569198608398, "incorrect_loss_per_token": 3.095416122012668, "correct_loss_uncond": -8.366301536560059, "incorrect_loss_uncond": -9.46892229715983}, "model_output": [{"sum_logits": -12.376535415649414, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -23.134132385253906, "logits_per_token": -3.0941338539123535, "logits_per_char": -0.47602059290959287, "num_chars": 26}, {"sum_logits": -10.764298439025879, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -19.130599975585938, "logits_per_token": -1.5377569198608398, "logits_per_char": -0.38443922996520996, "num_chars": 28}, {"sum_logits": -22.127639770507812, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -31.15070343017578, "logits_per_token": -2.7659549713134766, "logits_per_char": -0.6705345385002367, "num_chars": 33}, {"sum_logits": -20.556957244873047, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -29.183063507080078, "logits_per_token": -3.4261595408121743, "logits_per_char": -0.5873416355678014, "num_chars": 35}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 612, "native_id": "Mercury_415084", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 9.944722175598145, "incorrect_loss_raw": 11.608896414438883, "correct_loss_per_char": 1.4206745965140206, "incorrect_loss_per_char": 1.3137252600735456, "correct_loss_per_token": 2.486180543899536, "incorrect_loss_per_token": 2.1347346703211465, "correct_loss_uncond": -12.984721183776855, "incorrect_loss_uncond": -10.874950885772705}, "model_output": [{"sum_logits": -13.505491256713867, "num_tokens": 6, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -24.78647232055664, "logits_per_token": -2.250915209452311, "logits_per_char": -1.500610139634874, "num_chars": 9}, {"sum_logits": -7.1970696449279785, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -17.083110809326172, "logits_per_token": -1.7992674112319946, "logits_per_char": -1.0281528064182825, "num_chars": 7}, {"sum_logits": -9.944722175598145, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -22.929443359375, "logits_per_token": -2.486180543899536, "logits_per_char": -1.4206745965140206, "num_chars": 7}, {"sum_logits": -14.124128341674805, "num_tokens": 6, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -25.581958770751953, "logits_per_token": -2.3540213902791343, "logits_per_char": -1.4124128341674804, "num_chars": 10}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 613, "native_id": "Mercury_415082", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 6.747979164123535, "incorrect_loss_raw": 8.76526133219401, "correct_loss_per_char": 1.1246631940205891, "incorrect_loss_per_char": 1.1432806083134242, "correct_loss_per_token": 1.6869947910308838, "incorrect_loss_per_token": 2.1913153330485025, "correct_loss_uncond": -11.144463539123535, "incorrect_loss_uncond": -9.2648073832194}, "model_output": [{"sum_logits": -6.747979164123535, "num_tokens": 4, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -17.89244270324707, "logits_per_token": -1.6869947910308838, "logits_per_char": -1.1246631940205891, "num_chars": 6}, {"sum_logits": -8.000654220581055, "num_tokens": 4, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -17.78717803955078, "logits_per_token": -2.0001635551452637, "logits_per_char": -1.1429506029401506, "num_chars": 7}, {"sum_logits": -8.263976097106934, "num_tokens": 4, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -17.53017234802246, "logits_per_token": -2.0659940242767334, "logits_per_char": -1.0329970121383667, "num_chars": 8}, {"sum_logits": -10.031153678894043, "num_tokens": 4, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -18.772855758666992, "logits_per_token": -2.5077884197235107, "logits_per_char": -1.2538942098617554, "num_chars": 8}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 614, "native_id": "Mercury_SC_416169", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 12.463052749633789, "incorrect_loss_raw": 11.604997952779135, "correct_loss_per_char": 0.6923918194240994, "incorrect_loss_per_char": 0.6985421671586879, "correct_loss_per_token": 3.1157631874084473, "incorrect_loss_per_token": 2.9012494881947837, "correct_loss_uncond": -8.351163864135742, "incorrect_loss_uncond": -6.075705846150716}, "model_output": [{"sum_logits": -12.463052749633789, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -20.81421661376953, "logits_per_token": -3.1157631874084473, "logits_per_char": -0.6923918194240994, "num_chars": 18}, {"sum_logits": -9.65960693359375, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -15.442193031311035, "logits_per_token": -2.4149017333984375, "logits_per_char": -0.5682121725643382, "num_chars": 17}, {"sum_logits": -12.184880256652832, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -17.572620391845703, "logits_per_token": -3.046220064163208, "logits_per_char": -0.716757662156049, "num_chars": 17}, {"sum_logits": -12.97050666809082, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -20.027297973632812, "logits_per_token": -3.242626667022705, "logits_per_char": -0.8106566667556763, "num_chars": 16}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 615, "native_id": "MEA_2011_8_13", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.86837387084961, "incorrect_loss_raw": 21.258684158325195, "correct_loss_per_char": 0.5085668342058048, "incorrect_loss_per_char": 0.46181066666490533, "correct_loss_per_token": 2.186837387084961, "incorrect_loss_per_token": 2.2137648352870234, "correct_loss_uncond": -20.97665786743164, "incorrect_loss_uncond": -24.214022954305012}, "model_output": [{"sum_logits": -21.86837387084961, "num_tokens": 10, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -42.84503173828125, "logits_per_token": -2.186837387084961, "logits_per_char": -0.5085668342058048, "num_chars": 43}, {"sum_logits": -17.582202911376953, "num_tokens": 9, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -42.52470016479492, "logits_per_token": -1.9535781012641058, "logits_per_char": -0.3588204675791215, "num_chars": 49}, {"sum_logits": -26.07635498046875, "num_tokens": 12, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -49.99468994140625, "logits_per_token": -2.173029581705729, "logits_per_char": -0.48289546260127314, "num_chars": 54}, {"sum_logits": -20.117494583129883, "num_tokens": 8, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -43.89873123168945, "logits_per_token": -2.5146868228912354, "logits_per_char": -0.5437160698143212, "num_chars": 37}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 616, "native_id": "TIMSS_2003_4_pg82", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 37.20125198364258, "incorrect_loss_raw": 26.55582110087077, "correct_loss_per_char": 1.0054392428011507, "incorrect_loss_per_char": 0.6145800945742607, "correct_loss_per_token": 3.720125198364258, "incorrect_loss_per_token": 2.5708415311286252, "correct_loss_uncond": -6.098644256591797, "incorrect_loss_uncond": -2.384995142618815}, "model_output": [{"sum_logits": -37.20125198364258, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -43.299896240234375, "logits_per_token": -3.720125198364258, "logits_per_char": -1.0054392428011507, "num_chars": 37}, {"sum_logits": -33.327911376953125, "num_tokens": 13, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -35.20099639892578, "logits_per_token": -2.563685490534856, "logits_per_char": -0.6171835440176504, "num_chars": 54}, {"sum_logits": -20.401140213012695, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -25.295612335205078, "logits_per_token": -2.2667933570014105, "logits_per_char": -0.46366227756847034, "num_chars": 44}, {"sum_logits": -25.938411712646484, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -26.32583999633789, "logits_per_token": -2.8820457458496094, "logits_per_char": -0.7628944621366613, "num_chars": 34}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 617, "native_id": "CSZ30338", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.360856056213379, "incorrect_loss_raw": 5.571576118469238, "correct_loss_per_char": 0.49072373708089195, "incorrect_loss_per_char": 0.3922893231313889, "correct_loss_per_token": 1.8402140140533447, "incorrect_loss_per_token": 2.1256648699442544, "correct_loss_uncond": -11.703143119812012, "incorrect_loss_uncond": -12.914924303690592}, "model_output": [{"sum_logits": -5.0294671058654785, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -17.844335556030273, "logits_per_token": -2.5147335529327393, "logits_per_char": -0.3592476504189627, "num_chars": 14}, {"sum_logits": -3.7637829780578613, "num_tokens": 2, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -18.514751434326172, "logits_per_token": -1.8818914890289307, "logits_per_char": -0.2895217675429124, "num_chars": 13}, {"sum_logits": -7.360856056213379, "num_tokens": 4, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -19.06399917602539, "logits_per_token": -1.8402140140533447, "logits_per_char": -0.49072373708089195, "num_chars": 15}, {"sum_logits": -7.921478271484375, "num_tokens": 4, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -19.100414276123047, "logits_per_token": -1.9803695678710938, "logits_per_char": -0.5280985514322917, "num_chars": 15}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 618, "native_id": "TIMSS_2003_8_pg85", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 10.863619804382324, "incorrect_loss_raw": 10.807002385457357, "correct_loss_per_char": 0.43454479217529296, "incorrect_loss_per_char": 0.5289066810628552, "correct_loss_per_token": 2.172723960876465, "incorrect_loss_per_token": 3.1802115970187717, "correct_loss_uncond": -16.77975368499756, "incorrect_loss_uncond": -13.725299199422201}, "model_output": [{"sum_logits": -9.497756958007812, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -24.978656768798828, "logits_per_token": -1.8995513916015625, "logits_per_char": -0.3392056056431362, "num_chars": 28}, {"sum_logits": -10.863619804382324, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -27.643373489379883, "logits_per_token": -2.172723960876465, "logits_per_char": -0.43454479217529296, "num_chars": 25}, {"sum_logits": -12.058846473693848, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -25.07942771911621, "logits_per_token": -4.019615491231282, "logits_per_char": -0.7536779046058655, "num_chars": 16}, {"sum_logits": -10.86440372467041, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -23.538820266723633, "logits_per_token": -3.62146790822347, "logits_per_char": -0.4938365329395641, "num_chars": 22}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 619, "native_id": "Mercury_7221988", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.14697265625, "incorrect_loss_raw": 15.02824624379476, "correct_loss_per_char": 0.3184849330357143, "incorrect_loss_per_char": 0.5284231083633812, "correct_loss_per_token": 1.5924246651785714, "incorrect_loss_per_token": 2.5519396554856075, "correct_loss_uncond": -17.52495765686035, "incorrect_loss_uncond": -11.10491975148519}, "model_output": [{"sum_logits": -13.563201904296875, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -23.530853271484375, "logits_per_token": -2.712640380859375, "logits_per_char": -0.6165091774680398, "num_chars": 22}, {"sum_logits": -18.48427963256836, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -27.199804306030273, "logits_per_token": -3.0807132720947266, "logits_per_char": -0.59626708492156, "num_chars": 31}, {"sum_logits": -11.14697265625, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -28.67193031311035, "logits_per_token": -1.5924246651785714, "logits_per_char": -0.3184849330357143, "num_chars": 35}, {"sum_logits": -13.037257194519043, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -27.668840408325195, "logits_per_token": -1.8624653135027205, "logits_per_char": -0.3724930627005441, "num_chars": 35}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 620, "native_id": "NCEOGA_2013_5_11", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.0444464683532715, "incorrect_loss_raw": 6.866779327392578, "correct_loss_per_char": 0.46495742064255935, "incorrect_loss_per_char": 0.5095252642555842, "correct_loss_per_token": 3.0222232341766357, "incorrect_loss_per_token": 4.651311079661052, "correct_loss_uncond": -9.53612756729126, "incorrect_loss_uncond": -10.56440289815267}, "model_output": [{"sum_logits": -7.307528495788574, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -14.193122863769531, "logits_per_token": -7.307528495788574, "logits_per_char": -0.5219663211277553, "num_chars": 14}, {"sum_logits": -7.225330352783203, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -19.56061363220215, "logits_per_token": -3.6126651763916016, "logits_per_char": -0.6021108627319336, "num_chars": 12}, {"sum_logits": -6.067479133605957, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -18.539810180664062, "logits_per_token": -3.0337395668029785, "logits_per_char": -0.4044986089070638, "num_chars": 15}, {"sum_logits": -6.0444464683532715, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.580574035644531, "logits_per_token": -3.0222232341766357, "logits_per_char": -0.46495742064255935, "num_chars": 13}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 621, "native_id": "MCAS_2013_8_29416", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.543253898620605, "incorrect_loss_raw": 11.80377991994222, "correct_loss_per_char": 0.3904908851340965, "incorrect_loss_per_char": 0.6558197736740112, "correct_loss_per_token": 2.108650779724121, "incorrect_loss_per_token": 2.3607559839884438, "correct_loss_uncond": -15.486123085021973, "incorrect_loss_uncond": -12.900822003682455}, "model_output": [{"sum_logits": -12.064518928527832, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -24.39167022705078, "logits_per_token": -2.4129037857055664, "logits_per_char": -0.8043012619018555, "num_chars": 15}, {"sum_logits": -10.543253898620605, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -26.029376983642578, "logits_per_token": -2.108650779724121, "logits_per_char": -0.3904908851340965, "num_chars": 27}, {"sum_logits": -12.295473098754883, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -24.012065887451172, "logits_per_token": -2.4590946197509767, "logits_per_char": -0.7684670686721802, "num_chars": 16}, {"sum_logits": -11.051347732543945, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -25.71006965637207, "logits_per_token": -2.210269546508789, "logits_per_char": -0.39469099044799805, "num_chars": 28}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 622, "native_id": "Mercury_SC_401142", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.889692306518555, "incorrect_loss_raw": 13.872084299723307, "correct_loss_per_char": 0.6838224851168119, "incorrect_loss_per_char": 1.1149922870454334, "correct_loss_per_token": 2.963230768839518, "incorrect_loss_per_token": 4.624028099907769, "correct_loss_uncond": -11.784685134887695, "incorrect_loss_uncond": -9.722782135009766}, "model_output": [{"sum_logits": -14.380489349365234, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -19.686817169189453, "logits_per_token": -4.793496449788411, "logits_per_char": -1.4380489349365235, "num_chars": 10}, {"sum_logits": -8.889692306518555, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -20.67437744140625, "logits_per_token": -2.963230768839518, "logits_per_char": -0.6838224851168119, "num_chars": 13}, {"sum_logits": -8.081588745117188, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -25.83626937866211, "logits_per_token": -2.6938629150390625, "logits_per_char": -0.5387725830078125, "num_chars": 15}, {"sum_logits": -19.1541748046875, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -25.261512756347656, "logits_per_token": -6.384724934895833, "logits_per_char": -1.3681553431919642, "num_chars": 14}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 623, "native_id": "Mercury_7206395", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 19.081863403320312, "incorrect_loss_raw": 19.919716517130535, "correct_loss_per_char": 0.5021543000873766, "incorrect_loss_per_char": 0.4890288845310366, "correct_loss_per_token": 2.120207044813368, "incorrect_loss_per_token": 2.752482539131528, "correct_loss_uncond": -16.90807342529297, "incorrect_loss_uncond": -8.80621083577474}, "model_output": [{"sum_logits": -21.14027214050293, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -31.16503143310547, "logits_per_token": -3.0200388772147044, "logits_per_char": -0.5156163936708031, "num_chars": 41}, {"sum_logits": -22.96274757385254, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -31.379737854003906, "logits_per_token": -3.280392510550363, "logits_per_char": -0.5600670139964034, "num_chars": 41}, {"sum_logits": -19.081863403320312, "num_tokens": 9, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -35.98993682861328, "logits_per_token": -2.120207044813368, "logits_per_char": -0.5021543000873766, "num_chars": 38}, {"sum_logits": -15.656129837036133, "num_tokens": 8, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -23.633012771606445, "logits_per_token": -1.9570162296295166, "logits_per_char": -0.3914032459259033, "num_chars": 40}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 624, "native_id": "Mercury_179025", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.471250057220459, "incorrect_loss_raw": 3.2849956353505454, "correct_loss_per_char": 0.3856944508022732, "incorrect_loss_per_char": 0.33816853099399147, "correct_loss_per_token": 1.1570833524068196, "incorrect_loss_per_token": 1.240033057000902, "correct_loss_uncond": -13.38024377822876, "incorrect_loss_uncond": -14.641964991887411}, "model_output": [{"sum_logits": -2.610621213912964, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -17.545774459838867, "logits_per_token": -1.305310606956482, "logits_per_char": -0.2900690237681071, "num_chars": 9}, {"sum_logits": -3.471250057220459, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -16.85149383544922, "logits_per_token": -1.1570833524068196, "logits_per_char": -0.3856944508022732, "num_chars": 9}, {"sum_logits": -2.289152145385742, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": true, "sum_logits_uncond": -17.512928009033203, "logits_per_token": -0.7630507151285807, "logits_per_char": -0.22891521453857422, "num_chars": 10}, {"sum_logits": -4.95521354675293, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -18.722179412841797, "logits_per_token": -1.6517378489176433, "logits_per_char": -0.49552135467529296, "num_chars": 10}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 625, "native_id": "Mercury_7130620", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.0695929527282715, "incorrect_loss_raw": 5.477577209472656, "correct_loss_per_char": 1.0139185905456543, "incorrect_loss_per_char": 0.8297548984724378, "correct_loss_per_token": 5.0695929527282715, "incorrect_loss_per_token": 5.477577209472656, "correct_loss_uncond": -7.541531085968018, "incorrect_loss_uncond": -8.246772766113281}, "model_output": [{"sum_logits": -3.6291794776916504, "num_tokens": 1, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -13.627038955688477, "logits_per_token": -3.6291794776916504, "logits_per_char": -0.4536474347114563, "num_chars": 8}, {"sum_logits": -5.0695929527282715, "num_tokens": 1, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -12.611124038696289, "logits_per_token": -5.0695929527282715, "logits_per_char": -1.0139185905456543, "num_chars": 5}, {"sum_logits": -8.674612045288086, "num_tokens": 1, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -13.58199691772461, "logits_per_token": -8.674612045288086, "logits_per_char": -1.445768674214681, "num_chars": 6}, {"sum_logits": -4.128940105438232, "num_tokens": 1, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -13.964014053344727, "logits_per_token": -4.128940105438232, "logits_per_char": -0.5898485864911761, "num_chars": 7}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 626, "native_id": "Mercury_177870", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 24.441387176513672, "incorrect_loss_raw": 15.074588139851889, "correct_loss_per_char": 0.5200295143939079, "incorrect_loss_per_char": 0.36851637936919307, "correct_loss_per_token": 2.715709686279297, "incorrect_loss_per_token": 1.884323517481486, "correct_loss_uncond": -17.336589813232422, "incorrect_loss_uncond": -14.511537551879883}, "model_output": [{"sum_logits": -21.321109771728516, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -31.50244140625, "logits_per_token": -2.6651387214660645, "logits_per_char": -0.5076454707554409, "num_chars": 42}, {"sum_logits": -11.922216415405273, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -28.669960021972656, "logits_per_token": -1.4902770519256592, "logits_per_char": -0.3056978568052634, "num_chars": 39}, {"sum_logits": -24.441387176513672, "num_tokens": 9, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -41.777976989746094, "logits_per_token": -2.715709686279297, "logits_per_char": -0.5200295143939079, "num_chars": 47}, {"sum_logits": -11.980438232421875, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -28.585975646972656, "logits_per_token": -1.4975547790527344, "logits_per_char": -0.292205810546875, "num_chars": 41}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 627, "native_id": "Mercury_7282083", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 22.56317138671875, "incorrect_loss_raw": 20.063461939493816, "correct_loss_per_char": 0.6636226878446692, "incorrect_loss_per_char": 0.5787739475740156, "correct_loss_per_token": 2.8203964233398438, "incorrect_loss_per_token": 3.006103144751655, "correct_loss_uncond": -17.047245025634766, "incorrect_loss_uncond": -11.771743138631185}, "model_output": [{"sum_logits": -24.32211685180664, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -40.357879638671875, "logits_per_token": -3.04026460647583, "logits_per_char": -0.6949176243373326, "num_chars": 35}, {"sum_logits": -22.56317138671875, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -39.610416412353516, "logits_per_token": -2.8203964233398438, "logits_per_char": -0.6636226878446692, "num_chars": 34}, {"sum_logits": -17.845111846923828, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -27.476051330566406, "logits_per_token": -2.974185307820638, "logits_per_char": -0.5407609650582978, "num_chars": 33}, {"sum_logits": -18.023157119750977, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -27.67168426513672, "logits_per_token": -3.003859519958496, "logits_per_char": -0.500643253326416, "num_chars": 36}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 628, "native_id": "Mercury_SC_400233", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 1.8272157907485962, "incorrect_loss_raw": 4.873316208521525, "correct_loss_per_char": 0.10748328180874095, "incorrect_loss_per_char": 0.32304559065625554, "correct_loss_per_token": 0.6090719302495321, "incorrect_loss_per_token": 1.4144415855407715, "correct_loss_uncond": -15.043130278587341, "incorrect_loss_uncond": -15.861250162124634}, "model_output": [{"sum_logits": -2.7540180683135986, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -17.44610595703125, "logits_per_token": -0.9180060227711996, "logits_per_char": -0.21184754371643066, "num_chars": 13}, {"sum_logits": -7.559897422790527, "num_tokens": 4, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -25.774463653564453, "logits_per_token": -1.8899743556976318, "logits_per_char": -0.5039931615193685, "num_chars": 15}, {"sum_logits": -1.8272157907485962, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": true, "sum_logits_uncond": -16.870346069335938, "logits_per_token": -0.6090719302495321, "logits_per_char": -0.10748328180874095, "num_chars": 17}, {"sum_logits": -4.306033134460449, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -18.983129501342773, "logits_per_token": -1.4353443781534831, "logits_per_char": -0.2532960667329676, "num_chars": 17}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 629, "native_id": "Mercury_7082443", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.333049774169922, "incorrect_loss_raw": 12.930718421936035, "correct_loss_per_char": 0.5982833275428185, "incorrect_loss_per_char": 0.31588906486697105, "correct_loss_per_token": 2.9166312217712402, "incorrect_loss_per_token": 1.350247178254304, "correct_loss_uncond": -7.968908309936523, "incorrect_loss_uncond": -10.086328188578287}, "model_output": [{"sum_logits": -6.78857421875, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.492069244384766, "logits_per_token": -0.7542860243055556, "logits_per_char": -0.15787381904069767, "num_chars": 43}, {"sum_logits": -23.333049774169922, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -31.301958084106445, "logits_per_token": -2.9166312217712402, "logits_per_char": -0.5982833275428185, "num_chars": 39}, {"sum_logits": -23.354814529418945, "num_tokens": 10, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -33.68431091308594, "logits_per_token": -2.3354814529418944, "logits_per_char": -0.5838703632354736, "num_chars": 40}, {"sum_logits": -8.64876651763916, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -18.874759674072266, "logits_per_token": -0.9609740575154623, "logits_per_char": -0.20592301232474192, "num_chars": 42}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 630, "native_id": "NCEOGA_2013_8_15", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.294506072998047, "incorrect_loss_raw": 12.77379035949707, "correct_loss_per_char": 0.3981161798749651, "incorrect_loss_per_char": 0.2503943031621734, "correct_loss_per_token": 2.786813259124756, "incorrect_loss_per_token": 1.6690078462873188, "correct_loss_uncond": -25.472270965576172, "incorrect_loss_uncond": -22.64680290222168}, "model_output": [{"sum_logits": -13.626949310302734, "num_tokens": 8, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -34.91337203979492, "logits_per_token": -1.7033686637878418, "logits_per_char": -0.2620567175058218, "num_chars": 52}, {"sum_logits": -22.294506072998047, "num_tokens": 8, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -47.76677703857422, "logits_per_token": -2.786813259124756, "logits_per_char": -0.3981161798749651, "num_chars": 56}, {"sum_logits": -12.143720626831055, "num_tokens": 7, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -32.626136779785156, "logits_per_token": -1.7348172324044364, "logits_per_char": -0.2381121691535501, "num_chars": 51}, {"sum_logits": -12.550701141357422, "num_tokens": 8, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -38.72227096557617, "logits_per_token": -1.5688376426696777, "logits_per_char": -0.25101402282714846, "num_chars": 50}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 631, "native_id": "Mercury_7210140", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.532840728759766, "incorrect_loss_raw": 29.684516270955402, "correct_loss_per_char": 0.5319341818491617, "incorrect_loss_per_char": 0.5771756170449996, "correct_loss_per_token": 2.8369823031955295, "incorrect_loss_per_token": 3.051330778333876, "correct_loss_uncond": -19.773632049560547, "incorrect_loss_uncond": -16.995038986206055}, "model_output": [{"sum_logits": -25.532840728759766, "num_tokens": 9, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -45.30647277832031, "logits_per_token": -2.8369823031955295, "logits_per_char": -0.5319341818491617, "num_chars": 48}, {"sum_logits": -22.377370834350586, "num_tokens": 9, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -43.68121337890625, "logits_per_token": -2.486374537150065, "logits_per_char": -0.4661952257156372, "num_chars": 48}, {"sum_logits": -20.09270477294922, "num_tokens": 10, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -41.6713981628418, "logits_per_token": -2.0092704772949217, "logits_per_char": -0.3863981687105619, "num_chars": 52}, {"sum_logits": -46.583473205566406, "num_tokens": 10, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -54.68605422973633, "logits_per_token": -4.658347320556641, "logits_per_char": -0.8789334567088001, "num_chars": 53}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 632, "native_id": "Mercury_7106593", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.68210220336914, "incorrect_loss_raw": 21.44642957051595, "correct_loss_per_char": 0.5558771292368571, "incorrect_loss_per_char": 0.49731682765155644, "correct_loss_per_token": 2.42564565485174, "incorrect_loss_per_token": 2.3999835216637813, "correct_loss_uncond": -18.51926040649414, "incorrect_loss_uncond": -18.725261688232422}, "model_output": [{"sum_logits": -16.75720977783203, "num_tokens": 8, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -29.783933639526367, "logits_per_token": -2.094651222229004, "logits_per_char": -0.4928591111127068, "num_chars": 34}, {"sum_logits": -24.712175369262695, "num_tokens": 11, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -51.0032958984375, "logits_per_token": -2.2465613972056997, "logits_per_char": -0.44128884587969097, "num_chars": 56}, {"sum_logits": -22.869903564453125, "num_tokens": 8, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -39.72784423828125, "logits_per_token": -2.8587379455566406, "logits_per_char": -0.5578025259622713, "num_chars": 41}, {"sum_logits": -26.68210220336914, "num_tokens": 11, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -45.20136260986328, "logits_per_token": -2.42564565485174, "logits_per_char": -0.5558771292368571, "num_chars": 48}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 633, "native_id": "Mercury_416536", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 4.358264923095703, "incorrect_loss_raw": 8.398685932159424, "correct_loss_per_char": 0.7263774871826172, "incorrect_loss_per_char": 1.2255802035331727, "correct_loss_per_token": 4.358264923095703, "incorrect_loss_per_token": 5.370972712834676, "correct_loss_uncond": -8.395122528076172, "incorrect_loss_uncond": -6.293668905893962}, "model_output": [{"sum_logits": -7.029778480529785, "num_tokens": 1, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -12.21864128112793, "logits_per_token": -7.029778480529785, "logits_per_char": -1.405955696105957, "num_chars": 5}, {"sum_logits": -4.358264923095703, "num_tokens": 1, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -12.753387451171875, "logits_per_token": -4.358264923095703, "logits_per_char": -0.7263774871826172, "num_chars": 6}, {"sum_logits": -7.904757022857666, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -16.360816955566406, "logits_per_token": -3.952378511428833, "logits_per_char": -0.9880946278572083, "num_chars": 8}, {"sum_logits": -10.26152229309082, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -15.49760627746582, "logits_per_token": -5.13076114654541, "logits_per_char": -1.2826902866363525, "num_chars": 8}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 634, "native_id": "Mercury_410026", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 8.403072357177734, "incorrect_loss_raw": 11.093616803487143, "correct_loss_per_char": 0.38195783441716974, "incorrect_loss_per_char": 0.5720592797404588, "correct_loss_per_token": 4.201536178588867, "incorrect_loss_per_token": 5.546808401743571, "correct_loss_uncond": -12.285903930664062, "incorrect_loss_uncond": -6.77762508392334}, "model_output": [{"sum_logits": -11.0743989944458, "num_tokens": 2, "num_tokens_all": 245, "is_greedy": false, "sum_logits_uncond": -16.164831161499023, "logits_per_token": -5.5371994972229, "logits_per_char": -0.6152443885803223, "num_chars": 18}, {"sum_logits": -9.063380241394043, "num_tokens": 2, "num_tokens_all": 245, "is_greedy": false, "sum_logits_uncond": -16.605758666992188, "logits_per_token": -4.5316901206970215, "logits_per_char": -0.5035211245218912, "num_chars": 18}, {"sum_logits": -13.143071174621582, "num_tokens": 2, "num_tokens_all": 245, "is_greedy": false, "sum_logits_uncond": -20.843135833740234, "logits_per_token": -6.571535587310791, "logits_per_char": -0.5974123261191628, "num_chars": 22}, {"sum_logits": -8.403072357177734, "num_tokens": 2, "num_tokens_all": 245, "is_greedy": false, "sum_logits_uncond": -20.688976287841797, "logits_per_token": -4.201536178588867, "logits_per_char": -0.38195783441716974, "num_chars": 22}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 635, "native_id": "ACTAAP_2011_5_1", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.589113235473633, "incorrect_loss_raw": 17.603979110717773, "correct_loss_per_char": 0.4041111572929051, "incorrect_loss_per_char": 0.3354317278371071, "correct_loss_per_token": 2.065457026163737, "incorrect_loss_per_token": 1.6450209762110852, "correct_loss_uncond": -21.302541732788086, "incorrect_loss_uncond": -19.80618731180827}, "model_output": [{"sum_logits": -14.737548828125, "num_tokens": 10, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -32.2464599609375, "logits_per_token": -1.4737548828125, "logits_per_char": -0.320381496263587, "num_chars": 46}, {"sum_logits": -18.589113235473633, "num_tokens": 9, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -39.89165496826172, "logits_per_token": -2.065457026163737, "logits_per_char": -0.4041111572929051, "num_chars": 46}, {"sum_logits": -18.522789001464844, "num_tokens": 11, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -41.46638488769531, "logits_per_token": -1.6838899092240767, "logits_per_char": -0.33677798184481533, "num_chars": 55}, {"sum_logits": -19.551599502563477, "num_tokens": 11, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -38.51765441894531, "logits_per_token": -1.7774181365966797, "logits_per_char": -0.3491357054029192, "num_chars": 56}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 636, "native_id": "Mercury_417138", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.46202850341797, "incorrect_loss_raw": 19.243778864542644, "correct_loss_per_char": 0.3846255938212077, "incorrect_loss_per_char": 0.40082443890042696, "correct_loss_per_token": 1.6783662275834517, "incorrect_loss_per_token": 1.7494344422311494, "correct_loss_uncond": -17.210289001464844, "incorrect_loss_uncond": -17.521506627400715}, "model_output": [{"sum_logits": -18.997102737426758, "num_tokens": 11, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -36.83496856689453, "logits_per_token": -1.7270093397660689, "logits_per_char": -0.4041936752643991, "num_chars": 47}, {"sum_logits": -18.46202850341797, "num_tokens": 11, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -35.67231750488281, "logits_per_token": -1.6783662275834517, "logits_per_char": -0.3846255938212077, "num_chars": 48}, {"sum_logits": -18.31049156188965, "num_tokens": 11, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -36.315589904785156, "logits_per_token": -1.664590141989968, "logits_per_char": -0.38146857420603436, "num_chars": 48}, {"sum_logits": -20.423742294311523, "num_tokens": 11, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -37.14529800415039, "logits_per_token": -1.8567038449374111, "logits_per_char": -0.41681106723084743, "num_chars": 49}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 637, "native_id": "Mercury_7138915", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 10.844552993774414, "incorrect_loss_raw": 13.040639877319336, "correct_loss_per_char": 0.18697505161680025, "incorrect_loss_per_char": 0.22483861857447132, "correct_loss_per_token": 0.9858684539794922, "incorrect_loss_per_token": 1.1855127161199397, "correct_loss_uncond": -34.964468002319336, "incorrect_loss_uncond": -34.78144137064616}, "model_output": [{"sum_logits": -10.392295837402344, "num_tokens": 11, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -45.12914276123047, "logits_per_token": -0.9447541670365767, "logits_per_char": -0.17917751443797145, "num_chars": 58}, {"sum_logits": -14.199832916259766, "num_tokens": 11, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -49.05124282836914, "logits_per_token": -1.2908939014781604, "logits_per_char": -0.2448247054527546, "num_chars": 58}, {"sum_logits": -14.529790878295898, "num_tokens": 11, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -49.285858154296875, "logits_per_token": -1.3208900798450818, "logits_per_char": -0.25051363583268793, "num_chars": 58}, {"sum_logits": -10.844552993774414, "num_tokens": 11, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -45.80902099609375, "logits_per_token": -0.9858684539794922, "logits_per_char": -0.18697505161680025, "num_chars": 58}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 638, "native_id": "NYSEDREGENTS_2008_4_11", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.464125156402588, "incorrect_loss_raw": 6.245134671529134, "correct_loss_per_char": 1.4928250312805176, "incorrect_loss_per_char": 1.0076277483077278, "correct_loss_per_token": 7.464125156402588, "incorrect_loss_per_token": 6.245134671529134, "correct_loss_uncond": -5.146998882293701, "incorrect_loss_uncond": -7.534385999043782}, "model_output": [{"sum_logits": -6.16469144821167, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -13.58199691772461, "logits_per_token": -6.16469144821167, "logits_per_char": -1.027448574701945, "num_chars": 6}, {"sum_logits": -4.186731815338135, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -13.177938461303711, "logits_per_token": -4.186731815338135, "logits_per_char": -0.598104545048305, "num_chars": 7}, {"sum_logits": -7.464125156402588, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -12.611124038696289, "logits_per_token": -7.464125156402588, "logits_per_char": -1.4928250312805176, "num_chars": 5}, {"sum_logits": -8.383980751037598, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -14.57862663269043, "logits_per_token": -8.383980751037598, "logits_per_char": -1.3973301251729329, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 639, "native_id": "Mercury_404435", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 18.758214950561523, "incorrect_loss_raw": 20.62381426493327, "correct_loss_per_char": 1.1034244088565601, "incorrect_loss_per_char": 1.543672978377142, "correct_loss_per_token": 2.3447768688201904, "incorrect_loss_per_token": 3.264639445713588, "correct_loss_uncond": -3.2404441833496094, "incorrect_loss_uncond": -5.104114532470703}, "model_output": [{"sum_logits": -21.097925186157227, "num_tokens": 7, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -26.476778030395508, "logits_per_token": -3.0139893123081754, "logits_per_char": -1.241054422715131, "num_chars": 17}, {"sum_logits": -16.714963912963867, "num_tokens": 5, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -21.475170135498047, "logits_per_token": -3.3429927825927734, "logits_per_char": -1.6714963912963867, "num_chars": 10}, {"sum_logits": -24.05855369567871, "num_tokens": 7, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -29.23183822631836, "logits_per_token": -3.436936242239816, "logits_per_char": -1.718468121119908, "num_chars": 14}, {"sum_logits": -18.758214950561523, "num_tokens": 8, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -21.998659133911133, "logits_per_token": -2.3447768688201904, "logits_per_char": -1.1034244088565601, "num_chars": 17}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 640, "native_id": "MDSA_2009_5_25", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.203651428222656, "incorrect_loss_raw": 22.012194951375324, "correct_loss_per_char": 0.29046097846880353, "incorrect_loss_per_char": 0.3738304920685597, "correct_loss_per_token": 1.766970952351888, "incorrect_loss_per_token": 1.9898206861610088, "correct_loss_uncond": -20.958274841308594, "incorrect_loss_uncond": -21.462575912475586}, "model_output": [{"sum_logits": -16.065927505493164, "num_tokens": 8, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -35.02341842651367, "logits_per_token": -2.0082409381866455, "logits_per_char": -0.36513471603393555, "num_chars": 44}, {"sum_logits": -21.203651428222656, "num_tokens": 12, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -42.16192626953125, "logits_per_token": -1.766970952351888, "logits_per_char": -0.29046097846880353, "num_chars": 73}, {"sum_logits": -18.3026065826416, "num_tokens": 12, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -43.801483154296875, "logits_per_token": -1.5252172152201335, "logits_per_char": -0.26915597915649414, "num_chars": 68}, {"sum_logits": -31.66805076599121, "num_tokens": 13, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -51.59941101074219, "logits_per_token": -2.436003905076247, "logits_per_char": -0.4872007810152494, "num_chars": 65}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 641, "native_id": "OHAT_2007_8_12", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 23.444934844970703, "incorrect_loss_raw": 23.48118527730306, "correct_loss_per_char": 0.5452310429062954, "incorrect_loss_per_char": 0.5239044617364408, "correct_loss_per_token": 3.3492764064243863, "incorrect_loss_per_token": 3.469504430558947, "correct_loss_uncond": -20.142967224121094, "incorrect_loss_uncond": -13.498093287150065}, "model_output": [{"sum_logits": -15.304647445678711, "num_tokens": 4, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -31.22287940979004, "logits_per_token": -3.8261618614196777, "logits_per_char": -0.4637771953235973, "num_chars": 33}, {"sum_logits": -16.02690887451172, "num_tokens": 6, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -30.725126266479492, "logits_per_token": -2.6711514790852866, "logits_per_char": -0.42176075985557154, "num_chars": 38}, {"sum_logits": -23.444934844970703, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -43.5879020690918, "logits_per_token": -3.3492764064243863, "logits_per_char": -0.5452310429062954, "num_chars": 43}, {"sum_logits": -39.11199951171875, "num_tokens": 10, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -48.989830017089844, "logits_per_token": -3.911199951171875, "logits_per_char": -0.6861754300301535, "num_chars": 57}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 642, "native_id": "Mercury_LBS10302", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.927069664001465, "incorrect_loss_raw": 7.390352725982666, "correct_loss_per_char": 0.35193354742867605, "incorrect_loss_per_char": 0.585666736448654, "correct_loss_per_token": 1.2317674160003662, "incorrect_loss_per_token": 2.1627570390701294, "correct_loss_uncond": -14.227019309997559, "incorrect_loss_uncond": -10.77314837773641}, "model_output": [{"sum_logits": -6.305562496185303, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -16.60904884338379, "logits_per_token": -1.5763906240463257, "logits_per_char": -0.5732329541986639, "num_chars": 11}, {"sum_logits": -6.80289363861084, "num_tokens": 2, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -16.018409729003906, "logits_per_token": -3.40144681930542, "logits_per_char": -0.680289363861084, "num_chars": 10}, {"sum_logits": -4.927069664001465, "num_tokens": 4, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -19.154088973999023, "logits_per_token": -1.2317674160003662, "logits_per_char": -0.35193354742867605, "num_chars": 14}, {"sum_logits": -9.062602043151855, "num_tokens": 6, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -21.86304473876953, "logits_per_token": -1.5104336738586426, "logits_per_char": -0.5034778912862142, "num_chars": 18}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 643, "native_id": "Mercury_7027248", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 11.657644271850586, "incorrect_loss_raw": 13.70082918802897, "correct_loss_per_char": 0.5551259177071708, "incorrect_loss_per_char": 0.7698667848231587, "correct_loss_per_token": 5.828822135925293, "incorrect_loss_per_token": 5.907077365451389, "correct_loss_uncond": -5.85896110534668, "incorrect_loss_uncond": -4.933085759480794}, "model_output": [{"sum_logits": -11.039997100830078, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -18.106124877929688, "logits_per_token": -5.519998550415039, "logits_per_char": -0.8492305462176983, "num_chars": 13}, {"sum_logits": -11.657644271850586, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -17.516605377197266, "logits_per_token": -5.828822135925293, "logits_per_char": -0.5551259177071708, "num_chars": 21}, {"sum_logits": -13.082420349121094, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -16.428049087524414, "logits_per_token": -6.541210174560547, "logits_per_char": -0.6885484394274259, "num_chars": 19}, {"sum_logits": -16.980070114135742, "num_tokens": 3, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -21.367570877075195, "logits_per_token": -5.660023371378581, "logits_per_char": -0.7718213688243519, "num_chars": 22}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 644, "native_id": "Mercury_SC_401360", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.839731216430664, "incorrect_loss_raw": 11.892422676086426, "correct_loss_per_char": 0.4126174324437192, "incorrect_loss_per_char": 0.6795963773540422, "correct_loss_per_token": 2.613243738810221, "incorrect_loss_per_token": 3.9641408920288086, "correct_loss_uncond": -12.960119247436523, "incorrect_loss_uncond": -9.128158887227377}, "model_output": [{"sum_logits": -7.839731216430664, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -20.799850463867188, "logits_per_token": -2.613243738810221, "logits_per_char": -0.4126174324437192, "num_chars": 19}, {"sum_logits": -9.669601440429688, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -22.22542953491211, "logits_per_token": -3.223200480143229, "logits_per_char": -0.5089263916015625, "num_chars": 19}, {"sum_logits": -11.945210456848145, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -19.831417083740234, "logits_per_token": -3.9817368189493814, "logits_per_char": -0.7026594386381262, "num_chars": 17}, {"sum_logits": -14.062456130981445, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -21.004898071289062, "logits_per_token": -4.687485376993815, "logits_per_char": -0.827203301822438, "num_chars": 17}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 645, "native_id": "ACTAAP_2013_5_17", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 25.968151092529297, "incorrect_loss_raw": 44.981980641682945, "correct_loss_per_char": 0.45558159811454907, "incorrect_loss_per_char": 0.7255158168013378, "correct_loss_per_token": 2.5968151092529297, "incorrect_loss_per_token": 3.749327091255573, "correct_loss_uncond": -9.673625946044922, "incorrect_loss_uncond": -11.715237935384115}, "model_output": [{"sum_logits": -25.968151092529297, "num_tokens": 10, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -35.64177703857422, "logits_per_token": -2.5968151092529297, "logits_per_char": -0.45558159811454907, "num_chars": 57}, {"sum_logits": -47.60662841796875, "num_tokens": 13, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -57.78865051269531, "logits_per_token": -3.66204833984375, "logits_per_char": -0.7678488454511089, "num_chars": 62}, {"sum_logits": -40.61069869995117, "num_tokens": 11, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -54.199005126953125, "logits_per_token": -3.691881699995561, "logits_per_char": -0.6550112693540512, "num_chars": 62}, {"sum_logits": -46.728614807128906, "num_tokens": 12, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -58.104000091552734, "logits_per_token": -3.8940512339274087, "logits_per_char": -0.7536873355988534, "num_chars": 62}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 646, "native_id": "Mercury_407125", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.120071411132812, "incorrect_loss_raw": 27.48151397705078, "correct_loss_per_char": 0.6024014282226563, "incorrect_loss_per_char": 0.6120709109600478, "correct_loss_per_token": 2.5100059509277344, "incorrect_loss_per_token": 2.9125131751551776, "correct_loss_uncond": -7.521999359130859, "incorrect_loss_uncond": -8.248036702473959}, "model_output": [{"sum_logits": -27.728885650634766, "num_tokens": 11, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -33.68971252441406, "logits_per_token": -2.5208077864213423, "logits_per_char": -0.6302019466053356, "num_chars": 44}, {"sum_logits": -29.806644439697266, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -39.87586975097656, "logits_per_token": -3.725830554962158, "logits_per_char": -0.7269913277974943, "num_chars": 41}, {"sum_logits": -30.120071411132812, "num_tokens": 12, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -37.64207077026367, "logits_per_token": -2.5100059509277344, "logits_per_char": -0.6024014282226563, "num_chars": 50}, {"sum_logits": -24.909011840820312, "num_tokens": 10, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -33.623069763183594, "logits_per_token": -2.4909011840820314, "logits_per_char": -0.4790194584773137, "num_chars": 52}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 647, "native_id": "Mercury_404820", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.345249652862549, "incorrect_loss_raw": 4.973928054173787, "correct_loss_per_char": 2.448416550954183, "incorrect_loss_per_char": 1.6579760180579293, "correct_loss_per_token": 7.345249652862549, "incorrect_loss_per_token": 4.973928054173787, "correct_loss_uncond": -1.8862557411193848, "incorrect_loss_uncond": -3.6495629151662192}, "model_output": [{"sum_logits": -3.588628053665161, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -8.302886962890625, "logits_per_token": -3.588628053665161, "logits_per_char": -1.1962093512217205, "num_chars": 3}, {"sum_logits": -5.069441795349121, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -8.570595741271973, "logits_per_token": -5.069441795349121, "logits_per_char": -1.6898139317830403, "num_chars": 3}, {"sum_logits": -6.26371431350708, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -8.996990203857422, "logits_per_token": -6.26371431350708, "logits_per_char": -2.087904771169027, "num_chars": 3}, {"sum_logits": -7.345249652862549, "num_tokens": 1, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -9.231505393981934, "logits_per_token": -7.345249652862549, "logits_per_char": -2.448416550954183, "num_chars": 3}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 648, "native_id": "Mercury_SC_416168", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.652675151824951, "incorrect_loss_raw": 10.198218822479248, "correct_loss_per_char": 0.8502972390916612, "incorrect_loss_per_char": 1.6261505895190769, "correct_loss_per_token": 7.652675151824951, "incorrect_loss_per_token": 8.277492841084799, "correct_loss_uncond": -6.499256610870361, "incorrect_loss_uncond": -4.2652866045633955}, "model_output": [{"sum_logits": -11.284584999084473, "num_tokens": 1, "num_tokens_all": 173, "is_greedy": false, "sum_logits_uncond": -14.786333084106445, "logits_per_token": -11.284584999084473, "logits_per_char": -1.8807641665140789, "num_chars": 6}, {"sum_logits": -7.652675151824951, "num_tokens": 1, "num_tokens_all": 173, "is_greedy": false, "sum_logits_uncond": -14.151931762695312, "logits_per_token": -7.652675151824951, "logits_per_char": -0.8502972390916612, "num_chars": 9}, {"sum_logits": -11.5243558883667, "num_tokens": 2, "num_tokens_all": 174, "is_greedy": false, "sum_logits_uncond": -16.082500457763672, "logits_per_token": -5.76217794418335, "logits_per_char": -1.4405444860458374, "num_chars": 8}, {"sum_logits": -7.785715579986572, "num_tokens": 1, "num_tokens_all": 173, "is_greedy": false, "sum_logits_uncond": -12.521682739257812, "logits_per_token": -7.785715579986572, "logits_per_char": -1.5571431159973144, "num_chars": 5}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 649, "native_id": "TIMSS_1995_8_K18", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.600852966308594, "incorrect_loss_raw": 21.859357198079426, "correct_loss_per_char": 0.37729211287064984, "incorrect_loss_per_char": 0.5421891642478486, "correct_loss_per_token": 2.3715504237583707, "incorrect_loss_per_token": 3.2807463085840616, "correct_loss_uncond": -17.79034423828125, "incorrect_loss_uncond": -11.73690414428711}, "model_output": [{"sum_logits": -16.600852966308594, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -34.391197204589844, "logits_per_token": -2.3715504237583707, "logits_per_char": -0.37729211287064984, "num_chars": 44}, {"sum_logits": -28.36899185180664, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -39.206703186035156, "logits_per_token": -4.052713121686663, "logits_per_char": -0.6167172141697096, "num_chars": 46}, {"sum_logits": -19.90560531616211, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -35.06666946411133, "logits_per_token": -3.317600886027018, "logits_per_char": -0.48550256868688074, "num_chars": 41}, {"sum_logits": -17.30347442626953, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -26.515411376953125, "logits_per_token": -2.4719249180385043, "logits_per_char": -0.5243477098869554, "num_chars": 33}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 650, "native_id": "Mercury_SC_405130", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 5.793153762817383, "incorrect_loss_raw": 13.386150995890299, "correct_loss_per_char": 0.24138140678405762, "incorrect_loss_per_char": 0.535446039835612, "correct_loss_per_token": 1.1586307525634765, "incorrect_loss_per_token": 2.5121948772006566, "correct_loss_uncond": -22.814159393310547, "incorrect_loss_uncond": -20.368290583292644}, "model_output": [{"sum_logits": -5.793153762817383, "num_tokens": 5, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -28.60731315612793, "logits_per_token": -1.1586307525634765, "logits_per_char": -0.24138140678405762, "num_chars": 24}, {"sum_logits": -14.853178977966309, "num_tokens": 6, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -34.653289794921875, "logits_per_token": -2.4755298296610513, "logits_per_char": -0.5941271591186523, "num_chars": 25}, {"sum_logits": -13.139898300170898, "num_tokens": 5, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -31.320236206054688, "logits_per_token": -2.6279796600341796, "logits_per_char": -0.525595932006836, "num_chars": 25}, {"sum_logits": -12.165375709533691, "num_tokens": 5, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -35.289798736572266, "logits_per_token": -2.4330751419067385, "logits_per_char": -0.4866150283813477, "num_chars": 25}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 651, "native_id": "Mercury_SC_408631", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 22.712434768676758, "incorrect_loss_raw": 25.129018783569336, "correct_loss_per_char": 0.48324329295056934, "incorrect_loss_per_char": 0.5684514431061783, "correct_loss_per_token": 2.2712434768676757, "incorrect_loss_per_token": 2.755516101695873, "correct_loss_uncond": -17.482404708862305, "incorrect_loss_uncond": -12.386172612508139}, "model_output": [{"sum_logits": -22.89339256286621, "num_tokens": 9, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -35.42502212524414, "logits_per_token": -2.5437102847629123, "logits_per_char": -0.5723348140716553, "num_chars": 40}, {"sum_logits": -18.938865661621094, "num_tokens": 8, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -30.31072998046875, "logits_per_token": -2.3673582077026367, "logits_per_char": -0.4619235527224657, "num_chars": 41}, {"sum_logits": -22.712434768676758, "num_tokens": 10, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -40.19483947753906, "logits_per_token": -2.2712434768676757, "logits_per_char": -0.48324329295056934, "num_chars": 47}, {"sum_logits": -33.5547981262207, "num_tokens": 10, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -46.80982208251953, "logits_per_token": -3.3554798126220704, "logits_per_char": -0.6710959625244141, "num_chars": 50}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 652, "native_id": "Mercury_SC_408763", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.9533538818359375, "incorrect_loss_raw": 9.191900889078775, "correct_loss_per_char": 0.6117964524489182, "incorrect_loss_per_char": 0.6868708330826817, "correct_loss_per_token": 2.651117960611979, "incorrect_loss_per_token": 4.595950444539388, "correct_loss_uncond": -15.238096237182617, "incorrect_loss_uncond": -11.423198064168295}, "model_output": [{"sum_logits": -10.793876647949219, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -20.768632888793945, "logits_per_token": -5.396938323974609, "logits_per_char": -0.8302982036884015, "num_chars": 13}, {"sum_logits": -7.5781145095825195, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -20.719501495361328, "logits_per_token": -3.7890572547912598, "logits_per_char": -0.6889195008711382, "num_chars": 11}, {"sum_logits": -7.9533538818359375, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -23.191450119018555, "logits_per_token": -2.651117960611979, "logits_per_char": -0.6117964524489182, "num_chars": 13}, {"sum_logits": -9.20371150970459, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -20.357162475585938, "logits_per_token": -4.601855754852295, "logits_per_char": -0.5413947946885053, "num_chars": 17}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 653, "native_id": "MCAS_8_2015_18", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.54766845703125, "incorrect_loss_raw": 13.482743899027506, "correct_loss_per_char": 0.5017654984085648, "incorrect_loss_per_char": 0.4466033175957644, "correct_loss_per_token": 2.70953369140625, "incorrect_loss_per_token": 2.6965487798055015, "correct_loss_uncond": -11.258699417114258, "incorrect_loss_uncond": -12.838686307271322}, "model_output": [{"sum_logits": -13.54766845703125, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -24.806367874145508, "logits_per_token": -2.70953369140625, "logits_per_char": -0.5017654984085648, "num_chars": 27}, {"sum_logits": -13.707090377807617, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.09458351135254, "logits_per_token": -2.7414180755615236, "logits_per_char": -0.4895389420645578, "num_chars": 28}, {"sum_logits": -9.554064750671387, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -21.415109634399414, "logits_per_token": -1.9108129501342774, "logits_per_char": -0.3294505086438409, "num_chars": 29}, {"sum_logits": -17.187076568603516, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.45459747314453, "logits_per_token": -3.4374153137207033, "logits_per_char": -0.5208205020788944, "num_chars": 33}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 654, "native_id": "Mercury_411729", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.447998046875, "incorrect_loss_raw": 7.886025428771973, "correct_loss_per_char": 0.8589089133522727, "incorrect_loss_per_char": 0.6469688126535126, "correct_loss_per_token": 1.889599609375, "incorrect_loss_per_token": 1.7214615980784098, "correct_loss_uncond": -10.968059539794922, "incorrect_loss_uncond": -11.852239926656088}, "model_output": [{"sum_logits": -8.655390739440918, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -20.10828971862793, "logits_per_token": -2.1638476848602295, "logits_per_char": -0.786853703585538, "num_chars": 11}, {"sum_logits": -9.447998046875, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -20.416057586669922, "logits_per_token": -1.889599609375, "logits_per_char": -0.8589089133522727, "num_chars": 11}, {"sum_logits": -7.196974754333496, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -18.901086807250977, "logits_per_token": -1.4393949508666992, "logits_per_char": -0.5536134426410382, "num_chars": 13}, {"sum_logits": -7.805710792541504, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -20.205419540405273, "logits_per_token": -1.5611421585083007, "logits_per_char": -0.6004392917339618, "num_chars": 13}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 655, "native_id": "MDSA_2012_8_6", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.732215881347656, "incorrect_loss_raw": 5.177625338236491, "correct_loss_per_char": 0.8110179901123047, "incorrect_loss_per_char": 0.45847616725497775, "correct_loss_per_token": 3.2440719604492188, "incorrect_loss_per_token": 1.9666895866394043, "correct_loss_uncond": -6.806735992431641, "incorrect_loss_uncond": -10.115462303161621}, "model_output": [{"sum_logits": -4.861330032348633, "num_tokens": 3, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -15.95531177520752, "logits_per_token": -1.620443344116211, "logits_per_char": -0.4861330032348633, "num_chars": 10}, {"sum_logits": -6.336885452270508, "num_tokens": 3, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -15.89914608001709, "logits_per_token": -2.112295150756836, "logits_per_char": -0.528073787689209, "num_chars": 12}, {"sum_logits": -9.732215881347656, "num_tokens": 3, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -16.538951873779297, "logits_per_token": -3.2440719604492188, "logits_per_char": -0.8110179901123047, "num_chars": 12}, {"sum_logits": -4.334660530090332, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -14.024805068969727, "logits_per_token": -2.167330265045166, "logits_per_char": -0.361221710840861, "num_chars": 12}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 656, "native_id": "MCAS_1999_8_5", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 6.6112470626831055, "incorrect_loss_raw": 14.194736162821451, "correct_loss_per_char": 0.36729150348239475, "incorrect_loss_per_char": 0.5909392451894456, "correct_loss_per_token": 1.322249412536621, "incorrect_loss_per_token": 2.522306940290663, "correct_loss_uncond": -15.007267951965332, "incorrect_loss_uncond": -12.407387097676596}, "model_output": [{"sum_logits": -14.551358222961426, "num_tokens": 6, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -25.467050552368164, "logits_per_token": -2.425226370493571, "logits_per_char": -0.6326677488244098, "num_chars": 23}, {"sum_logits": -6.6112470626831055, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -21.618515014648438, "logits_per_token": -1.322249412536621, "logits_per_char": -0.36729150348239475, "num_chars": 18}, {"sum_logits": -14.08658218383789, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -27.971683502197266, "logits_per_token": -2.817316436767578, "logits_per_char": -0.7043291091918945, "num_chars": 20}, {"sum_logits": -13.946268081665039, "num_tokens": 6, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -26.36763572692871, "logits_per_token": -2.32437801361084, "logits_per_char": -0.43582087755203247, "num_chars": 32}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 657, "native_id": "WASL_2004_8_17", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 27.565868377685547, "incorrect_loss_raw": 23.83578109741211, "correct_loss_per_char": 0.6723382531142816, "incorrect_loss_per_char": 0.6266502681688859, "correct_loss_per_token": 3.4457335472106934, "incorrect_loss_per_token": 3.3220324622260198, "correct_loss_uncond": -9.443843841552734, "incorrect_loss_uncond": -7.911547978719075}, "model_output": [{"sum_logits": -20.91356658935547, "num_tokens": 6, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -28.85733413696289, "logits_per_token": -3.4855944315592446, "logits_per_char": -0.5975304739815849, "num_chars": 35}, {"sum_logits": -21.316879272460938, "num_tokens": 6, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -30.66227912902832, "logits_per_token": -3.5528132120768228, "logits_per_char": -0.6459660385594224, "num_chars": 33}, {"sum_logits": -29.276897430419922, "num_tokens": 10, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -35.722373962402344, "logits_per_token": -2.9276897430419924, "logits_per_char": -0.6364542919656505, "num_chars": 46}, {"sum_logits": -27.565868377685547, "num_tokens": 8, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -37.00971221923828, "logits_per_token": -3.4457335472106934, "logits_per_char": -0.6723382531142816, "num_chars": 41}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 658, "native_id": "Mercury_414365", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.048758506774902, "incorrect_loss_raw": 10.735732396443685, "correct_loss_per_char": 0.3526691488317541, "incorrect_loss_per_char": 0.2764207183070608, "correct_loss_per_token": 1.8641083581107003, "incorrect_loss_per_token": 1.510073732446741, "correct_loss_uncond": -19.758248329162598, "incorrect_loss_uncond": -21.483692169189453}, "model_output": [{"sum_logits": -8.396363258361816, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -29.586532592773438, "logits_per_token": -1.399393876393636, "logits_per_char": -0.27085042768909084, "num_chars": 31}, {"sum_logits": -8.733223915100098, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -29.3997859954834, "logits_per_token": -1.4555373191833496, "logits_per_char": -0.25685952691470876, "num_chars": 34}, {"sum_logits": -13.048758506774902, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -32.8070068359375, "logits_per_token": -1.8641083581107003, "logits_per_char": -0.3526691488317541, "num_chars": 37}, {"sum_logits": -15.07761001586914, "num_tokens": 9, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -37.67195510864258, "logits_per_token": -1.675290001763238, "logits_per_char": -0.30155220031738283, "num_chars": 50}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 659, "native_id": "Mercury_SC_415406", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.01094388961792, "incorrect_loss_raw": 9.913761138916016, "correct_loss_per_char": 0.3338544709341867, "incorrect_loss_per_char": 0.5117438257786265, "correct_loss_per_token": 1.1684906482696533, "incorrect_loss_per_token": 1.6593783121260384, "correct_loss_uncond": -25.454765796661377, "incorrect_loss_uncond": -20.012012481689453}, "model_output": [{"sum_logits": -8.807421684265137, "num_tokens": 5, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -28.84050178527832, "logits_per_token": -1.7614843368530273, "logits_per_char": -0.4635485096981651, "num_chars": 19}, {"sum_logits": -7.01094388961792, "num_tokens": 6, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -32.4657096862793, "logits_per_token": -1.1684906482696533, "logits_per_char": -0.3338544709341867, "num_chars": 21}, {"sum_logits": -11.43770694732666, "num_tokens": 7, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -29.568159103393555, "logits_per_token": -1.63395813533238, "logits_per_char": -0.571885347366333, "num_chars": 20}, {"sum_logits": -9.49615478515625, "num_tokens": 6, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -31.36865997314453, "logits_per_token": -1.5826924641927083, "logits_per_char": -0.4997976202713816, "num_chars": 19}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 660, "native_id": "MCAS_2000_8_29", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 24.45992088317871, "incorrect_loss_raw": 34.21949450174967, "correct_loss_per_char": 0.2547908425331116, "incorrect_loss_per_char": 0.3564530677265591, "correct_loss_per_token": 1.3588844935099285, "incorrect_loss_per_token": 1.901083027874982, "correct_loss_uncond": -13.536592483520508, "incorrect_loss_uncond": -13.30399258931478}, "model_output": [{"sum_logits": -31.99962043762207, "num_tokens": 18, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -44.30856704711914, "logits_per_token": -1.777756690979004, "logits_per_char": -0.33332937955856323, "num_chars": 96}, {"sum_logits": -24.45992088317871, "num_tokens": 18, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -37.99651336669922, "logits_per_token": -1.3588844935099285, "logits_per_char": -0.2547908425331116, "num_chars": 96}, {"sum_logits": -31.14657211303711, "num_tokens": 18, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -46.60211181640625, "logits_per_token": -1.7303651173909504, "logits_per_char": -0.3244434595108032, "num_chars": 96}, {"sum_logits": -39.512290954589844, "num_tokens": 18, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -51.65978240966797, "logits_per_token": -2.1951272752549915, "logits_per_char": -0.41158636411031085, "num_chars": 96}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 661, "native_id": "Mercury_416230", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 10.112000465393066, "incorrect_loss_raw": 12.834486325581869, "correct_loss_per_char": 0.2151489460721929, "incorrect_loss_per_char": 0.27307417714003973, "correct_loss_per_token": 0.8426667054494222, "incorrect_loss_per_token": 1.0695405271318223, "correct_loss_uncond": -17.344477653503418, "incorrect_loss_uncond": -16.880794843037922}, "model_output": [{"sum_logits": -13.168304443359375, "num_tokens": 12, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -28.64220428466797, "logits_per_token": -1.0973587036132812, "logits_per_char": -0.280176690284242, "num_chars": 47}, {"sum_logits": -14.534993171691895, "num_tokens": 12, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -32.31825256347656, "logits_per_token": -1.2112494309743245, "logits_per_char": -0.309255173865785, "num_chars": 47}, {"sum_logits": -10.112000465393066, "num_tokens": 12, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -27.456478118896484, "logits_per_token": -0.8426667054494222, "logits_per_char": -0.2151489460721929, "num_chars": 47}, {"sum_logits": -10.800161361694336, "num_tokens": 12, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -28.185386657714844, "logits_per_token": -0.9000134468078613, "logits_per_char": -0.22979066727009226, "num_chars": 47}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 662, "native_id": "Mercury_7001295", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 9.897710800170898, "incorrect_loss_raw": 17.08781337738037, "correct_loss_per_char": 0.2537874564146384, "incorrect_loss_per_char": 0.4411808325801408, "correct_loss_per_token": 1.4139586857386999, "incorrect_loss_per_token": 2.576935851384723, "correct_loss_uncond": -20.882429122924805, "incorrect_loss_uncond": -20.403628985087078}, "model_output": [{"sum_logits": -11.811946868896484, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -31.61469268798828, "logits_per_token": -1.4764933586120605, "logits_per_char": -0.3028704325358073, "num_chars": 39}, {"sum_logits": -25.97223663330078, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -45.028358459472656, "logits_per_token": -4.32870610555013, "logits_per_char": -0.6659547854692508, "num_chars": 39}, {"sum_logits": -9.897710800170898, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -30.780139923095703, "logits_per_token": -1.4139586857386999, "logits_per_char": -0.2537874564146384, "num_chars": 39}, {"sum_logits": -13.479256629943848, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -35.831275939941406, "logits_per_token": -1.9256080899919783, "logits_per_char": -0.35471727973536443, "num_chars": 38}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 663, "native_id": "MSA_2012_5_2", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 4.17038631439209, "incorrect_loss_raw": 4.584361394246419, "correct_loss_per_char": 0.32079894726093, "incorrect_loss_per_char": 0.4545186720197163, "correct_loss_per_token": 4.17038631439209, "incorrect_loss_per_token": 4.584361394246419, "correct_loss_uncond": -11.650994300842285, "incorrect_loss_uncond": -9.960552215576172}, "model_output": [{"sum_logits": -4.17038631439209, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -15.821380615234375, "logits_per_token": -4.17038631439209, "logits_per_char": -0.32079894726093, "num_chars": 13}, {"sum_logits": -6.511943817138672, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -15.18421745300293, "logits_per_token": -6.511943817138672, "logits_per_char": -0.813992977142334, "num_chars": 8}, {"sum_logits": -2.7164530754089355, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -14.257400512695312, "logits_per_token": -2.7164530754089355, "logits_per_char": -0.2263710896174113, "num_chars": 12}, {"sum_logits": -4.52468729019165, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -14.193122863769531, "logits_per_token": -4.52468729019165, "logits_per_char": -0.3231919492994036, "num_chars": 14}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 664, "native_id": "MCAS_2005_8_7", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 19.240188598632812, "incorrect_loss_raw": 26.142091115315754, "correct_loss_per_char": 0.3375471683970669, "incorrect_loss_per_char": 0.5127886814058636, "correct_loss_per_token": 1.7491080544211648, "incorrect_loss_per_token": 2.3824892288599258, "correct_loss_uncond": -16.086204528808594, "incorrect_loss_uncond": -10.245712280273438}, "model_output": [{"sum_logits": -27.28089141845703, "num_tokens": 10, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -39.49253845214844, "logits_per_token": -2.728089141845703, "logits_per_char": -0.5930628569229789, "num_chars": 46}, {"sum_logits": -21.02179718017578, "num_tokens": 10, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -29.79495620727539, "logits_per_token": -2.1021797180175783, "logits_per_char": -0.4671510484483507, "num_chars": 45}, {"sum_logits": -19.240188598632812, "num_tokens": 11, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -35.326393127441406, "logits_per_token": -1.7491080544211648, "logits_per_char": -0.3375471683970669, "num_chars": 57}, {"sum_logits": -30.123584747314453, "num_tokens": 13, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -39.87591552734375, "logits_per_token": -2.3171988267164965, "logits_per_char": -0.4781521388462612, "num_chars": 63}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 665, "native_id": "Mercury_7206553", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.527396202087402, "incorrect_loss_raw": 8.714221954345703, "correct_loss_per_char": 0.7369056589463178, "incorrect_loss_per_char": 0.5296112365823574, "correct_loss_per_token": 6.263698101043701, "incorrect_loss_per_token": 4.357110977172852, "correct_loss_uncond": -7.5247697830200195, "incorrect_loss_uncond": -10.540438334147135}, "model_output": [{"sum_logits": -7.369072914123535, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -19.352182388305664, "logits_per_token": -3.6845364570617676, "logits_per_char": -0.3684536457061768, "num_chars": 20}, {"sum_logits": -7.597224235534668, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -18.841815948486328, "logits_per_token": -3.798612117767334, "logits_per_char": -0.4220680130852593, "num_chars": 18}, {"sum_logits": -11.176368713378906, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -19.569982528686523, "logits_per_token": -5.588184356689453, "logits_per_char": -0.7983120509556362, "num_chars": 14}, {"sum_logits": -12.527396202087402, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -20.052165985107422, "logits_per_token": -6.263698101043701, "logits_per_char": -0.7369056589463178, "num_chars": 17}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 666, "native_id": "VASoL_2010_3_39", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.963272094726562, "incorrect_loss_raw": 22.381264368693035, "correct_loss_per_char": 0.3893480998713796, "incorrect_loss_per_char": 0.602177753022603, "correct_loss_per_token": 2.2804674421037947, "incorrect_loss_per_token": 3.9381819195217553, "correct_loss_uncond": -22.098636627197266, "incorrect_loss_uncond": -15.051655451456705}, "model_output": [{"sum_logits": -15.963272094726562, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -38.06190872192383, "logits_per_token": -2.2804674421037947, "logits_per_char": -0.3893480998713796, "num_chars": 41}, {"sum_logits": -21.796384811401367, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -37.0767822265625, "logits_per_token": -3.632730801900228, "logits_per_char": -0.5735890739842465, "num_chars": 38}, {"sum_logits": -18.7174072265625, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -32.38064956665039, "logits_per_token": -3.7434814453125, "logits_per_char": -0.5671941583806818, "num_chars": 33}, {"sum_logits": -26.630001068115234, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -42.84132766723633, "logits_per_token": -4.438333511352539, "logits_per_char": -0.6657500267028809, "num_chars": 40}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 667, "native_id": "Mercury_416380", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 2.44384765625, "incorrect_loss_raw": 3.4771376848220825, "correct_loss_per_char": 0.34912109375, "incorrect_loss_per_char": 0.2645117728798478, "correct_loss_per_token": 2.44384765625, "incorrect_loss_per_token": 1.8607573575443694, "correct_loss_uncond": -9.932743072509766, "incorrect_loss_uncond": -9.637499690055847}, "model_output": [{"sum_logits": -2.44384765625, "num_tokens": 1, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -12.376590728759766, "logits_per_token": -2.44384765625, "logits_per_char": -0.34912109375, "num_chars": 7}, {"sum_logits": -2.784322738647461, "num_tokens": 1, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -13.184402465820312, "logits_per_token": -2.784322738647461, "logits_per_char": -0.3093691931830512, "num_chars": 9}, {"sum_logits": -1.4935153722763062, "num_tokens": 2, "num_tokens_all": 223, "is_greedy": true, "sum_logits_uncond": -12.253585815429688, "logits_per_token": -0.7467576861381531, "logits_per_char": -0.09956769148508708, "num_chars": 15}, {"sum_logits": -6.1535749435424805, "num_tokens": 3, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -13.905923843383789, "logits_per_token": -2.0511916478474936, "logits_per_char": -0.38459843397140503, "num_chars": 16}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 668, "native_id": "OHAT_2008_5_34", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 10.12002944946289, "incorrect_loss_raw": 24.025753021240234, "correct_loss_per_char": 0.2108339468638102, "incorrect_loss_per_char": 0.5365372383585094, "correct_loss_per_token": 0.9200026772238992, "incorrect_loss_per_token": 2.243869492501924, "correct_loss_uncond": -13.151784896850586, "incorrect_loss_uncond": -10.014843622843424}, "model_output": [{"sum_logits": -19.704341888427734, "num_tokens": 10, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -30.263416290283203, "logits_per_token": -1.9704341888427734, "logits_per_char": -0.5052395356007111, "num_chars": 39}, {"sum_logits": -18.433631896972656, "num_tokens": 11, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -31.79201316833496, "logits_per_token": -1.6757847179066052, "logits_per_char": -0.438895997546968, "num_chars": 42}, {"sum_logits": -10.12002944946289, "num_tokens": 11, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -23.271814346313477, "logits_per_token": -0.9200026772238992, "logits_per_char": -0.2108339468638102, "num_chars": 48}, {"sum_logits": -33.93928527832031, "num_tokens": 11, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -40.06636047363281, "logits_per_token": -3.085389570756392, "logits_per_char": -0.6654761819278493, "num_chars": 51}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 669, "native_id": "Mercury_7268328", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 20.467655181884766, "incorrect_loss_raw": 20.56651560465495, "correct_loss_per_char": 0.40935310363769534, "incorrect_loss_per_char": 0.4588775246231644, "correct_loss_per_token": 2.0467655181884767, "incorrect_loss_per_token": 2.410768738499394, "correct_loss_uncond": -18.048019409179688, "incorrect_loss_uncond": -16.279295603434246}, "model_output": [{"sum_logits": -17.806310653686523, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -34.71025848388672, "logits_per_token": -2.9677184422810874, "logits_per_char": -0.5237150192260742, "num_chars": 34}, {"sum_logits": -21.84545135498047, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -35.209434509277344, "logits_per_token": -2.4272723727756076, "logits_per_char": -0.48545447455512153, "num_chars": 45}, {"sum_logits": -20.467655181884766, "num_tokens": 10, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -38.51567459106445, "logits_per_token": -2.0467655181884767, "logits_per_char": -0.40935310363769534, "num_chars": 50}, {"sum_logits": -22.04778480529785, "num_tokens": 12, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -40.617740631103516, "logits_per_token": -1.8373154004414876, "logits_per_char": -0.36746308008829753, "num_chars": 60}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 670, "native_id": "NYSEDREGENTS_2008_8_36", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 29.251096725463867, "incorrect_loss_raw": 25.43875249226888, "correct_loss_per_char": 0.5625210908743051, "incorrect_loss_per_char": 0.7354732323526768, "correct_loss_per_token": 2.4375913937886557, "incorrect_loss_per_token": 2.926801063396313, "correct_loss_uncond": -13.911142349243164, "incorrect_loss_uncond": -10.194252014160156}, "model_output": [{"sum_logits": -21.95331573486328, "num_tokens": 9, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -27.923885345458984, "logits_per_token": -2.4392573038736978, "logits_per_char": -0.6456857569077435, "num_chars": 34}, {"sum_logits": -32.70397186279297, "num_tokens": 9, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -46.78406524658203, "logits_per_token": -3.633774651421441, "logits_per_char": -0.883891131426837, "num_chars": 37}, {"sum_logits": -29.251096725463867, "num_tokens": 12, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -43.16223907470703, "logits_per_token": -2.4375913937886557, "logits_per_char": -0.5625210908743051, "num_chars": 52}, {"sum_logits": -21.65896987915039, "num_tokens": 8, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -32.191062927246094, "logits_per_token": -2.707371234893799, "logits_per_char": -0.6768428087234497, "num_chars": 32}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 671, "native_id": "Mercury_SC_414156", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 19.156723022460938, "incorrect_loss_raw": 22.7273686726888, "correct_loss_per_char": 0.3614476041973762, "incorrect_loss_per_char": 0.4643734569362142, "correct_loss_per_token": 1.741520274769176, "incorrect_loss_per_token": 2.407964411648837, "correct_loss_uncond": -20.876117706298828, "incorrect_loss_uncond": -11.538154602050781}, "model_output": [{"sum_logits": -24.44275665283203, "num_tokens": 8, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -31.293785095214844, "logits_per_token": -3.055344581604004, "logits_per_char": -0.5431723700629341, "num_chars": 45}, {"sum_logits": -21.14685821533203, "num_tokens": 10, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -32.46915054321289, "logits_per_token": -2.114685821533203, "logits_per_char": -0.43156853500677617, "num_chars": 49}, {"sum_logits": -19.156723022460938, "num_tokens": 11, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -40.032840728759766, "logits_per_token": -1.741520274769176, "logits_per_char": -0.3614476041973762, "num_chars": 53}, {"sum_logits": -22.592491149902344, "num_tokens": 11, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -39.033634185791016, "logits_per_token": -2.053862831809304, "logits_per_char": -0.4183794657389323, "num_chars": 54}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 672, "native_id": "Mercury_7094133", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 11.189667701721191, "incorrect_loss_raw": 12.067156155904135, "correct_loss_per_char": 0.31970479147774833, "incorrect_loss_per_char": 0.5053939587304506, "correct_loss_per_token": 1.864944616953532, "incorrect_loss_per_token": 3.4280824926164413, "correct_loss_uncond": -17.809117317199707, "incorrect_loss_uncond": -15.256489117940268}, "model_output": [{"sum_logits": -14.806564331054688, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -27.74862289428711, "logits_per_token": -4.9355214436848955, "logits_per_char": -0.7403282165527344, "num_chars": 20}, {"sum_logits": -11.715890884399414, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -33.01331329345703, "logits_per_token": -2.9289727210998535, "logits_per_char": -0.35502699649695196, "num_chars": 33}, {"sum_logits": -9.6790132522583, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.208999633789062, "logits_per_token": -2.419753313064575, "logits_per_char": -0.4208266631416652, "num_chars": 23}, {"sum_logits": -11.189667701721191, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -28.9987850189209, "logits_per_token": -1.864944616953532, "logits_per_char": -0.31970479147774833, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 673, "native_id": "MEA_2013_5_15", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.280332565307617, "incorrect_loss_raw": 7.459593137105306, "correct_loss_per_char": 0.9280332565307617, "incorrect_loss_per_char": 0.7234239115859523, "correct_loss_per_token": 1.3257617950439453, "incorrect_loss_per_token": 1.0656561624436152, "correct_loss_uncond": -8.31666374206543, "incorrect_loss_uncond": -11.341755231221518}, "model_output": [{"sum_logits": -6.998100280761719, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -19.357877731323242, "logits_per_token": -0.9997286115373883, "logits_per_char": -0.6998100280761719, "num_chars": 10}, {"sum_logits": -7.43668270111084, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -18.583316802978516, "logits_per_token": -1.0623832430158342, "logits_per_char": -0.6760620637373491, "num_chars": 11}, {"sum_logits": -9.280332565307617, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -17.596996307373047, "logits_per_token": -1.3257617950439453, "logits_per_char": -0.9280332565307617, "num_chars": 10}, {"sum_logits": -7.943996429443359, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -18.46285057067871, "logits_per_token": -1.1348566327776228, "logits_per_char": -0.794399642944336, "num_chars": 10}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 674, "native_id": "OHAT_2010_8_35", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 8.489067077636719, "incorrect_loss_raw": 10.174997488657633, "correct_loss_per_char": 0.28296890258789065, "incorrect_loss_per_char": 0.4140043852993249, "correct_loss_per_token": 1.4148445129394531, "incorrect_loss_per_token": 2.20903111828698, "correct_loss_uncond": -15.032066345214844, "incorrect_loss_uncond": -14.737022240956625}, "model_output": [{"sum_logits": -8.489067077636719, "num_tokens": 6, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -23.521133422851562, "logits_per_token": -1.4148445129394531, "logits_per_char": -0.28296890258789065, "num_chars": 30}, {"sum_logits": -12.049857139587402, "num_tokens": 6, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -24.5793514251709, "logits_per_token": -2.008309523264567, "logits_per_char": -0.4303520406995501, "num_chars": 28}, {"sum_logits": -7.423089504241943, "num_tokens": 4, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -23.61892318725586, "logits_per_token": -1.8557723760604858, "logits_per_char": -0.309295396010081, "num_chars": 24}, {"sum_logits": -11.052045822143555, "num_tokens": 4, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -26.537784576416016, "logits_per_token": -2.7630114555358887, "logits_per_char": -0.5023657191883434, "num_chars": 22}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 675, "native_id": "Mercury_SC_416174", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.2698869705200195, "incorrect_loss_raw": 8.182374000549316, "correct_loss_per_char": 1.0449811617533367, "incorrect_loss_per_char": 1.1243100340986427, "correct_loss_per_token": 6.2698869705200195, "incorrect_loss_per_token": 7.285542805989583, "correct_loss_uncond": -10.000170707702637, "incorrect_loss_uncond": -7.788038571675618}, "model_output": [{"sum_logits": -5.380987167358398, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -17.670656204223633, "logits_per_token": -2.690493583679199, "logits_per_char": -0.4139220897967999, "num_chars": 13}, {"sum_logits": -9.884607315063477, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -15.251548767089844, "logits_per_token": -9.884607315063477, "logits_per_char": -1.4120867592947823, "num_chars": 7}, {"sum_logits": -6.2698869705200195, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -16.270057678222656, "logits_per_token": -6.2698869705200195, "logits_per_char": -1.0449811617533367, "num_chars": 6}, {"sum_logits": -9.281527519226074, "num_tokens": 1, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -14.989032745361328, "logits_per_token": -9.281527519226074, "logits_per_char": -1.5469212532043457, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 676, "native_id": "TIMSS_1995_8_J6", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 14.5714693069458, "incorrect_loss_raw": 21.464775721232098, "correct_loss_per_char": 0.32381042904324003, "incorrect_loss_per_char": 0.41798695977167855, "correct_loss_per_token": 1.6190521452162001, "incorrect_loss_per_token": 2.0998041359377115, "correct_loss_uncond": -21.92173671722412, "incorrect_loss_uncond": -20.288175582885742}, "model_output": [{"sum_logits": -9.80323600769043, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -25.51555633544922, "logits_per_token": -1.960647201538086, "logits_per_char": -0.42622765250827954, "num_chars": 23}, {"sum_logits": -14.5714693069458, "num_tokens": 9, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -36.49320602416992, "logits_per_token": -1.6190521452162001, "logits_per_char": -0.32381042904324003, "num_chars": 45}, {"sum_logits": -32.0351448059082, "num_tokens": 14, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -51.507415771484375, "logits_per_token": -2.288224628993443, "logits_per_char": -0.4388376000809343, "num_chars": 73}, {"sum_logits": -22.555946350097656, "num_tokens": 11, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -48.23588180541992, "logits_per_token": -2.050540577281605, "logits_per_char": -0.3888956267258217, "num_chars": 58}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 677, "native_id": "Mercury_SC_401587", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.395395278930664, "incorrect_loss_raw": 6.482744852701823, "correct_loss_per_char": 0.6279136112758091, "incorrect_loss_per_char": 0.5941577834427042, "correct_loss_per_token": 4.395395278930664, "incorrect_loss_per_token": 2.448474923769633, "correct_loss_uncond": -8.222860336303711, "incorrect_loss_uncond": -10.598944028218588}, "model_output": [{"sum_logits": -4.395395278930664, "num_tokens": 1, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -12.618255615234375, "logits_per_token": -4.395395278930664, "logits_per_char": -0.6279136112758091, "num_chars": 7}, {"sum_logits": -8.135721206665039, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -18.196617126464844, "logits_per_token": -4.0678606033325195, "logits_per_char": -0.9039690229627821, "num_chars": 9}, {"sum_logits": -5.919283390045166, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -15.740965843200684, "logits_per_token": -1.4798208475112915, "logits_per_char": -0.49327361583709717, "num_chars": 12}, {"sum_logits": -5.393229961395264, "num_tokens": 3, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -17.307483673095703, "logits_per_token": -1.797743320465088, "logits_per_char": -0.3852307115282331, "num_chars": 14}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 678, "native_id": "MDSA_2011_5_23", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 18.502361297607422, "incorrect_loss_raw": 24.946392059326172, "correct_loss_per_char": 0.4205082113092596, "incorrect_loss_per_char": 0.6600188216777763, "correct_loss_per_token": 1.8502361297607421, "incorrect_loss_per_token": 3.0995594229016987, "correct_loss_uncond": -17.701988220214844, "incorrect_loss_uncond": -13.89184824625651}, "model_output": [{"sum_logits": -26.321374893188477, "num_tokens": 7, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -36.90614700317383, "logits_per_token": -3.7601964133126393, "logits_per_char": -0.8773791631062825, "num_chars": 30}, {"sum_logits": -21.049732208251953, "num_tokens": 10, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -40.43150329589844, "logits_per_token": -2.1049732208251952, "logits_per_char": -0.47840300473299896, "num_chars": 44}, {"sum_logits": -27.468069076538086, "num_tokens": 8, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -39.17707061767578, "logits_per_token": -3.4335086345672607, "logits_per_char": -0.6242742971940474, "num_chars": 44}, {"sum_logits": -18.502361297607422, "num_tokens": 10, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -36.204349517822266, "logits_per_token": -1.8502361297607421, "logits_per_char": -0.4205082113092596, "num_chars": 44}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 679, "native_id": "AIMS_2008_8_11", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.520358085632324, "incorrect_loss_raw": 18.141552289326984, "correct_loss_per_char": 0.3031673180429559, "incorrect_loss_per_char": 0.524446053986239, "correct_loss_per_token": 1.6457654408046178, "incorrect_loss_per_token": 3.0166459522550064, "correct_loss_uncond": -15.91779613494873, "incorrect_loss_uncond": -13.028000831604004}, "model_output": [{"sum_logits": -15.850122451782227, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -26.690372467041016, "logits_per_token": -2.6416870752970376, "logits_per_char": -0.4803067409630978, "num_chars": 33}, {"sum_logits": -15.708052635192871, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -28.16714096069336, "logits_per_token": -3.141610527038574, "logits_per_char": -0.5067113753288023, "num_chars": 31}, {"sum_logits": -11.520358085632324, "num_tokens": 7, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -27.438154220581055, "logits_per_token": -1.6457654408046178, "logits_per_char": -0.3031673180429559, "num_chars": 38}, {"sum_logits": -22.86648178100586, "num_tokens": 7, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -38.651145935058594, "logits_per_token": -3.2666402544294084, "logits_per_char": -0.586320045666817, "num_chars": 39}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 680, "native_id": "Mercury_7159215", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.129045486450195, "incorrect_loss_raw": 16.74617354075114, "correct_loss_per_char": 0.4557756608532321, "incorrect_loss_per_char": 0.4679532279563179, "correct_loss_per_token": 2.825809097290039, "incorrect_loss_per_token": 2.7575008418824942, "correct_loss_uncond": -19.39398765563965, "incorrect_loss_uncond": -18.617643038431805}, "model_output": [{"sum_logits": -15.655058860778809, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -37.790985107421875, "logits_per_token": -1.956882357597351, "logits_per_char": -0.40141176566099507, "num_chars": 39}, {"sum_logits": -18.03216552734375, "num_tokens": 6, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -34.216033935546875, "logits_per_token": -3.0053609212239585, "logits_per_char": -0.5008934868706597, "num_chars": 36}, {"sum_logits": -16.55129623413086, "num_tokens": 5, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -34.08443069458008, "logits_per_token": -3.310259246826172, "logits_per_char": -0.5015544313372988, "num_chars": 33}, {"sum_logits": -14.129045486450195, "num_tokens": 5, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -33.523033142089844, "logits_per_token": -2.825809097290039, "logits_per_char": -0.4557756608532321, "num_chars": 31}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 681, "native_id": "MCAS_2006_9_30", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 10.175625801086426, "incorrect_loss_raw": 14.892294565836588, "correct_loss_per_char": 0.5653125445048014, "incorrect_loss_per_char": 0.7819606809905081, "correct_loss_per_token": 2.5439064502716064, "incorrect_loss_per_token": 4.448244624667697, "correct_loss_uncond": -12.195122718811035, "incorrect_loss_uncond": -5.114677429199219}, "model_output": [{"sum_logits": -11.400732040405273, "num_tokens": 3, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -18.8039493560791, "logits_per_token": -3.8002440134684243, "logits_per_char": -0.5182150927456942, "num_chars": 22}, {"sum_logits": -10.175625801086426, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -22.37074851989746, "logits_per_token": -2.5439064502716064, "logits_per_char": -0.5653125445048014, "num_chars": 18}, {"sum_logits": -14.705423355102539, "num_tokens": 3, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -18.592754364013672, "logits_per_token": -4.90180778503418, "logits_per_char": -0.7352711677551269, "num_chars": 20}, {"sum_logits": -18.570728302001953, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -22.62421226501465, "logits_per_token": -4.642682075500488, "logits_per_char": -1.0923957824707031, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 682, "native_id": "MCAS_1999_4_27", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 11.829716682434082, "incorrect_loss_raw": 18.90690008799235, "correct_loss_per_char": 0.49290486176808673, "incorrect_loss_per_char": 0.4778460100236295, "correct_loss_per_token": 2.3659433364868163, "incorrect_loss_per_token": 2.9868631135849726, "correct_loss_uncond": -16.53135585784912, "incorrect_loss_uncond": -11.687333742777506}, "model_output": [{"sum_logits": -11.829716682434082, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -28.361072540283203, "logits_per_token": -2.3659433364868163, "logits_per_char": -0.49290486176808673, "num_chars": 24}, {"sum_logits": -13.605402946472168, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -26.880517959594727, "logits_per_token": -2.2675671577453613, "logits_per_char": -0.33183909625541874, "num_chars": 41}, {"sum_logits": -20.700149536132812, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -34.06191635131836, "logits_per_token": -2.9571642194475447, "logits_per_char": -0.42245203134964926, "num_chars": 49}, {"sum_logits": -22.41514778137207, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -30.840267181396484, "logits_per_token": -3.7358579635620117, "logits_per_char": -0.6792469024658203, "num_chars": 33}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 683, "native_id": "Mercury_7016538", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.1933536529541016, "incorrect_loss_raw": 7.977354367574056, "correct_loss_per_char": 0.1325948503282335, "incorrect_loss_per_char": 0.9428575428482754, "correct_loss_per_token": 0.5966768264770508, "incorrect_loss_per_token": 3.46128511428833, "correct_loss_uncond": -14.306482315063477, "incorrect_loss_uncond": -8.458529472351074}, "model_output": [{"sum_logits": -8.712579727172852, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -15.836952209472656, "logits_per_token": -4.356289863586426, "logits_per_char": -1.4520966211954753, "num_chars": 6}, {"sum_logits": -5.726426124572754, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -16.540637969970703, "logits_per_token": -2.863213062286377, "logits_per_char": -0.8180608749389648, "num_chars": 7}, {"sum_logits": -1.1933536529541016, "num_tokens": 2, "num_tokens_all": 202, "is_greedy": true, "sum_logits_uncond": -15.499835968017578, "logits_per_token": -0.5966768264770508, "logits_per_char": -0.1325948503282335, "num_chars": 9}, {"sum_logits": -9.493057250976562, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -16.93006134033203, "logits_per_token": -3.1643524169921875, "logits_per_char": -0.558415132410386, "num_chars": 17}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 684, "native_id": "Mercury_SC_409266", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.340301513671875, "incorrect_loss_raw": 14.801496505737305, "correct_loss_per_char": 0.43896323756167765, "incorrect_loss_per_char": 0.582019559826444, "correct_loss_per_token": 1.668060302734375, "incorrect_loss_per_token": 2.960299301147461, "correct_loss_uncond": -14.404048919677734, "incorrect_loss_uncond": -12.035995483398438}, "model_output": [{"sum_logits": -8.340301513671875, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -22.74435043334961, "logits_per_token": -1.668060302734375, "logits_per_char": -0.43896323756167765, "num_chars": 19}, {"sum_logits": -8.360698699951172, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -22.93243408203125, "logits_per_token": -1.6721397399902345, "logits_per_char": -0.36350863912831183, "num_chars": 23}, {"sum_logits": -16.06325340270996, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -28.90212059020996, "logits_per_token": -3.212650680541992, "logits_per_char": -0.6425301361083985, "num_chars": 25}, {"sum_logits": -19.98053741455078, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -28.677921295166016, "logits_per_token": -3.996107482910156, "logits_per_char": -0.7400199042426215, "num_chars": 27}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 685, "native_id": "OHAT_2007_5_15", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.058941841125488, "incorrect_loss_raw": 13.605155944824219, "correct_loss_per_char": 1.5058941841125488, "incorrect_loss_per_char": 1.1743156850079954, "correct_loss_per_token": 3.764735460281372, "incorrect_loss_per_token": 4.027957232793172, "correct_loss_uncond": -1.4791440963745117, "incorrect_loss_uncond": -2.33648681640625}, "model_output": [{"sum_logits": -16.131694793701172, "num_tokens": 5, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -17.66492462158203, "logits_per_token": -3.2263389587402345, "logits_per_char": -1.152263913835798, "num_chars": 14}, {"sum_logits": -15.058941841125488, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -16.5380859375, "logits_per_token": -3.764735460281372, "logits_per_char": -1.5058941841125488, "num_chars": 10}, {"sum_logits": -13.93741512298584, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -17.52802276611328, "logits_per_token": -3.48435378074646, "logits_per_char": -1.393741512298584, "num_chars": 10}, {"sum_logits": -10.746357917785645, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -12.631980895996094, "logits_per_token": -5.373178958892822, "logits_per_char": -0.976941628889604, "num_chars": 11}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 686, "native_id": "Mercury_7230073", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.403021335601807, "incorrect_loss_raw": 9.271687666575113, "correct_loss_per_char": 0.45735866682870047, "incorrect_loss_per_char": 0.6621689780145629, "correct_loss_per_token": 1.6007553339004517, "incorrect_loss_per_token": 2.922556387053596, "correct_loss_uncond": -9.192588329315186, "incorrect_loss_uncond": -9.707526683807373}, "model_output": [{"sum_logits": -12.886516571044922, "num_tokens": 3, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -16.09038543701172, "logits_per_token": -4.295505523681641, "logits_per_char": -0.991270505464994, "num_chars": 13}, {"sum_logits": -6.403021335601807, "num_tokens": 4, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -15.595609664916992, "logits_per_token": -1.6007553339004517, "logits_per_char": -0.45735866682870047, "num_chars": 14}, {"sum_logits": -6.048222064971924, "num_tokens": 4, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -19.05238151550293, "logits_per_token": -1.512055516242981, "logits_per_char": -0.40321480433146156, "num_chars": 15}, {"sum_logits": -8.880324363708496, "num_tokens": 3, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -21.794876098632812, "logits_per_token": -2.9601081212361655, "logits_per_char": -0.592021624247233, "num_chars": 15}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 687, "native_id": "Mercury_7245840", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.142168998718262, "incorrect_loss_raw": 9.372156739234924, "correct_loss_per_char": 0.6190093888176812, "incorrect_loss_per_char": 0.47482991984500983, "correct_loss_per_token": 5.571084499359131, "incorrect_loss_per_token": 3.192353493637509, "correct_loss_uncond": -11.431424140930176, "incorrect_loss_uncond": -8.963613549868265}, "model_output": [{"sum_logits": -1.2294224500656128, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": true, "sum_logits_uncond": -14.031314849853516, "logits_per_token": -0.6147112250328064, "logits_per_char": -0.08196149667104086, "num_chars": 15}, {"sum_logits": -11.142168998718262, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -22.573593139648438, "logits_per_token": -5.571084499359131, "logits_per_char": -0.6190093888176812, "num_chars": 18}, {"sum_logits": -9.122519493103027, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -15.314483642578125, "logits_per_token": -3.0408398310343423, "logits_per_char": -0.5701574683189392, "num_chars": 16}, {"sum_logits": -17.764528274536133, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -25.66151237487793, "logits_per_token": -5.921509424845378, "logits_per_char": -0.7723707945450492, "num_chars": 23}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 688, "native_id": "Mercury_SC_401788", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.050660610198975, "incorrect_loss_raw": 11.431310653686523, "correct_loss_per_char": 0.47004404067993166, "incorrect_loss_per_char": 0.4840558219258407, "correct_loss_per_token": 2.350220203399658, "incorrect_loss_per_token": 3.5070155196719703, "correct_loss_uncond": -8.110442638397217, "incorrect_loss_uncond": -10.808952967325846}, "model_output": [{"sum_logits": -14.341800689697266, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -21.93781280517578, "logits_per_token": -4.780600229899089, "logits_per_char": -0.6829428899855841, "num_chars": 21}, {"sum_logits": -7.050660610198975, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -15.161103248596191, "logits_per_token": -2.350220203399658, "logits_per_char": -0.47004404067993166, "num_chars": 15}, {"sum_logits": -9.028962135314941, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -21.164819717407227, "logits_per_token": -3.0096540451049805, "logits_per_char": -0.3925635711006496, "num_chars": 23}, {"sum_logits": -10.923169136047363, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -23.6181583404541, "logits_per_token": -2.730792284011841, "logits_per_char": -0.3766610046912884, "num_chars": 29}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 689, "native_id": "ACTAAP_2014_7_5", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 3.799862861633301, "incorrect_loss_raw": 6.42633072535197, "correct_loss_per_char": 0.14073566154197412, "incorrect_loss_per_char": 0.2582574690712823, "correct_loss_per_token": 0.6333104769388834, "incorrect_loss_per_token": 1.0680449743119498, "correct_loss_uncond": -18.241721153259277, "incorrect_loss_uncond": -21.358770211537678}, "model_output": [{"sum_logits": -6.815432548522949, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -27.673480987548828, "logits_per_token": -1.135905424753825, "logits_per_char": -0.28397635618845624, "num_chars": 24}, {"sum_logits": -7.428442478179932, "num_tokens": 7, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -30.983964920043945, "logits_per_token": -1.0612060683114188, "logits_per_char": -0.2971376991271973, "num_chars": 25}, {"sum_logits": -5.035117149353027, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -24.697856903076172, "logits_per_token": -1.0070234298706056, "logits_per_char": -0.19365835189819336, "num_chars": 26}, {"sum_logits": -3.799862861633301, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -22.041584014892578, "logits_per_token": -0.6333104769388834, "logits_per_char": -0.14073566154197412, "num_chars": 27}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 690, "native_id": "MCAS_2004_5_11", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 16.988311767578125, "incorrect_loss_raw": 20.049584070841473, "correct_loss_per_char": 0.5480100570186492, "incorrect_loss_per_char": 0.6896549139370104, "correct_loss_per_token": 2.4269016810825894, "incorrect_loss_per_token": 3.1504613634139766, "correct_loss_uncond": -17.055992126464844, "incorrect_loss_uncond": -16.857580184936523}, "model_output": [{"sum_logits": -16.36497688293457, "num_tokens": 6, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -37.40004348754883, "logits_per_token": -2.7274961471557617, "logits_per_char": -0.6294221878051758, "num_chars": 26}, {"sum_logits": -19.700641632080078, "num_tokens": 6, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -39.19573974609375, "logits_per_token": -3.283440272013346, "logits_per_char": -0.5794306362376493, "num_chars": 34}, {"sum_logits": -24.083133697509766, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -34.125709533691406, "logits_per_token": -3.440447671072824, "logits_per_char": -0.860111917768206, "num_chars": 28}, {"sum_logits": -16.988311767578125, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -34.04430389404297, "logits_per_token": -2.4269016810825894, "logits_per_char": -0.5480100570186492, "num_chars": 31}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 691, "native_id": "NCEOGA_2013_8_7", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.0533447265625, "incorrect_loss_raw": 8.918764273325602, "correct_loss_per_char": 0.61407470703125, "incorrect_loss_per_char": 0.45094164195078196, "correct_loss_per_token": 3.6844482421875, "incorrect_loss_per_token": 2.621885225507948, "correct_loss_uncond": -13.690832138061523, "incorrect_loss_uncond": -15.232171853383383}, "model_output": [{"sum_logits": -11.0533447265625, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -24.744176864624023, "logits_per_token": -3.6844482421875, "logits_per_char": -0.61407470703125, "num_chars": 18}, {"sum_logits": -7.898314476013184, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -27.07478904724121, "logits_per_token": -1.5796628952026368, "logits_per_char": -0.2820826598576137, "num_chars": 28}, {"sum_logits": -6.61072301864624, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -21.21538734436035, "logits_per_token": -2.2035743395487466, "logits_per_char": -0.25425857764024, "num_chars": 26}, {"sum_logits": -12.247255325317383, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -24.16263198852539, "logits_per_token": -4.082418441772461, "logits_per_char": -0.8164836883544921, "num_chars": 15}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 692, "native_id": "LEAP__7_10339", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 56.708499908447266, "incorrect_loss_raw": 61.29372914632162, "correct_loss_per_char": 0.4686652884995642, "incorrect_loss_per_char": 0.5019954739703418, "correct_loss_per_token": 2.3628541628519693, "incorrect_loss_per_token": 2.5900370694588926, "correct_loss_uncond": -11.731555938720703, "incorrect_loss_uncond": -13.005226135253906}, "model_output": [{"sum_logits": -56.708499908447266, "num_tokens": 24, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -68.44005584716797, "logits_per_token": -2.3628541628519693, "logits_per_char": -0.4686652884995642, "num_chars": 121}, {"sum_logits": -68.4823226928711, "num_tokens": 24, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -77.71117401123047, "logits_per_token": -2.8534301122029624, "logits_per_char": -0.5522767959102508, "num_chars": 124}, {"sum_logits": -55.564788818359375, "num_tokens": 24, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -70.61862182617188, "logits_per_token": -2.315199534098307, "logits_per_char": -0.4592131307302428, "num_chars": 121}, {"sum_logits": -59.834075927734375, "num_tokens": 23, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -74.56707000732422, "logits_per_token": -2.6014815620754077, "logits_per_char": -0.494496495270532, "num_chars": 121}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 693, "native_id": "Mercury_7018270", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.910832405090332, "incorrect_loss_raw": 8.116309324900309, "correct_loss_per_char": 1.1737120368263938, "incorrect_loss_per_char": 0.8843714370929376, "correct_loss_per_token": 4.303610801696777, "incorrect_loss_per_token": 3.090461174647013, "correct_loss_uncond": -8.216670036315918, "incorrect_loss_uncond": -10.536564350128174}, "model_output": [{"sum_logits": -7.915746688842773, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -16.226333618164062, "logits_per_token": -3.9578733444213867, "logits_per_char": -1.319291114807129, "num_chars": 6}, {"sum_logits": -4.820859432220459, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -15.283318519592285, "logits_per_token": -2.4104297161102295, "logits_per_char": -0.6886942046029227, "num_chars": 7}, {"sum_logits": -12.910832405090332, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -21.12750244140625, "logits_per_token": -4.303610801696777, "logits_per_char": -1.1737120368263938, "num_chars": 11}, {"sum_logits": -11.612321853637695, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -24.4489688873291, "logits_per_token": -2.903080463409424, "logits_per_char": -0.6451289918687608, "num_chars": 18}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 694, "native_id": "Mercury_7034808", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.75330638885498, "incorrect_loss_raw": 15.322703043619791, "correct_loss_per_char": 0.5501322555541992, "incorrect_loss_per_char": 0.47130176135542395, "correct_loss_per_token": 2.750661277770996, "incorrect_loss_per_token": 2.7036679343571737, "correct_loss_uncond": -15.762375831604004, "incorrect_loss_uncond": -15.742653528849283}, "model_output": [{"sum_logits": -13.75330638885498, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -29.515682220458984, "logits_per_token": -2.750661277770996, "logits_per_char": -0.5501322555541992, "num_chars": 25}, {"sum_logits": -19.231216430664062, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -32.12030792236328, "logits_per_token": -3.8462432861328124, "logits_per_char": -0.620361820344002, "num_chars": 31}, {"sum_logits": -8.038307189941406, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -27.824541091918945, "logits_per_token": -1.148329598563058, "logits_per_char": -0.2435850663618608, "num_chars": 33}, {"sum_logits": -18.698585510253906, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -33.251220703125, "logits_per_token": -3.116430918375651, "logits_per_char": -0.549958397360409, "num_chars": 34}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 695, "native_id": "Mercury_7216300", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.884273529052734, "incorrect_loss_raw": 11.09819491704305, "correct_loss_per_char": 0.8801643665020282, "incorrect_loss_per_char": 0.5941316181881658, "correct_loss_per_token": 5.721068382263184, "incorrect_loss_per_token": 3.017265796661377, "correct_loss_uncond": -4.682353973388672, "incorrect_loss_uncond": -11.478934446970621}, "model_output": [{"sum_logits": -7.458707332611084, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -19.8535213470459, "logits_per_token": -2.486235777537028, "logits_per_char": -0.5327648094722203, "num_chars": 14}, {"sum_logits": -10.487895965576172, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -20.749418258666992, "logits_per_token": -3.495965321858724, "logits_per_char": -0.5519945245040091, "num_chars": 19}, {"sum_logits": -22.884273529052734, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -27.566627502441406, "logits_per_token": -5.721068382263184, "logits_per_char": -0.8801643665020282, "num_chars": 26}, {"sum_logits": -15.347981452941895, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -27.128448486328125, "logits_per_token": -3.069596290588379, "logits_per_char": -0.6976355205882679, "num_chars": 22}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 696, "native_id": "Mercury_SC_400985", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 34.945068359375, "incorrect_loss_raw": 27.234669367472332, "correct_loss_per_char": 0.7435120927526596, "incorrect_loss_per_char": 0.6255967102564431, "correct_loss_per_token": 3.1768243963068183, "incorrect_loss_per_token": 2.849356748439648, "correct_loss_uncond": -7.977046966552734, "incorrect_loss_uncond": -9.753292719523111}, "model_output": [{"sum_logits": -28.72981834411621, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -36.035457611083984, "logits_per_token": -3.5912272930145264, "logits_per_char": -0.756047851160953, "num_chars": 38}, {"sum_logits": -19.52377700805664, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -33.06000518798828, "logits_per_token": -2.169308556450738, "logits_per_char": -0.46485183352515813, "num_chars": 42}, {"sum_logits": -34.945068359375, "num_tokens": 11, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -42.922115325927734, "logits_per_token": -3.1768243963068183, "logits_per_char": -0.7435120927526596, "num_chars": 47}, {"sum_logits": -33.45041275024414, "num_tokens": 12, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -41.86842346191406, "logits_per_token": -2.7875343958536782, "logits_per_char": -0.6558904460832184, "num_chars": 51}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 697, "native_id": "Mercury_7188528", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 5.168160438537598, "incorrect_loss_raw": 10.830561002095541, "correct_loss_per_char": 0.30400943756103516, "incorrect_loss_per_char": 0.5568878640706277, "correct_loss_per_token": 2.584080219268799, "incorrect_loss_per_token": 4.468378331926133, "correct_loss_uncond": -11.410197257995605, "incorrect_loss_uncond": -11.604125022888184}, "model_output": [{"sum_logits": -5.168160438537598, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -16.578357696533203, "logits_per_token": -2.584080219268799, "logits_per_char": -0.30400943756103516, "num_chars": 17}, {"sum_logits": -7.582159042358398, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -16.40916633605957, "logits_per_token": -3.791079521179199, "logits_per_char": -0.3990610022293894, "num_chars": 19}, {"sum_logits": -17.044239044189453, "num_tokens": 3, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -27.49833869934082, "logits_per_token": -5.681413014729817, "logits_per_char": -0.8970652128520765, "num_chars": 19}, {"sum_logits": -7.8652849197387695, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -23.39655303955078, "logits_per_token": -3.9326424598693848, "logits_per_char": -0.37453737713041757, "num_chars": 21}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 698, "native_id": "TIMSS_1995_8_R2", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 25.028396606445312, "incorrect_loss_raw": 23.441818237304688, "correct_loss_per_char": 0.3627303856006567, "incorrect_loss_per_char": 0.5019831140155482, "correct_loss_per_token": 1.564274787902832, "incorrect_loss_per_token": 2.081589633768255, "correct_loss_uncond": -24.729656219482422, "incorrect_loss_uncond": -12.145609537760416}, "model_output": [{"sum_logits": -30.897777557373047, "num_tokens": 16, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -43.69412612915039, "logits_per_token": -1.9311110973358154, "logits_per_char": -0.44779387764308765, "num_chars": 69}, {"sum_logits": -25.028396606445312, "num_tokens": 16, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -49.758052825927734, "logits_per_token": -1.564274787902832, "logits_per_char": -0.3627303856006567, "num_chars": 69}, {"sum_logits": -18.03418731689453, "num_tokens": 11, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -31.967647552490234, "logits_per_token": -1.6394715742631392, "logits_per_char": -0.4098678935657848, "num_chars": 44}, {"sum_logits": -21.393489837646484, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -31.100509643554688, "logits_per_token": -2.6741862297058105, "logits_per_char": -0.6482875708377722, "num_chars": 33}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 699, "native_id": "Mercury_SC_400032", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 1.88960862159729, "incorrect_loss_raw": 9.026283780733744, "correct_loss_per_char": 0.1574673851331075, "incorrect_loss_per_char": 0.654513987312969, "correct_loss_per_token": 1.88960862159729, "incorrect_loss_per_token": 9.026283780733744, "correct_loss_uncond": -12.367791891098022, "incorrect_loss_uncond": -5.8008939027786255}, "model_output": [{"sum_logits": -1.152305245399475, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": true, "sum_logits_uncond": -15.821380615234375, "logits_per_token": -1.152305245399475, "logits_per_char": -0.08863886503072885, "num_chars": 13}, {"sum_logits": -11.64604377746582, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -13.986494064331055, "logits_per_token": -11.64604377746582, "logits_per_char": -0.7764029184977214, "num_chars": 15}, {"sum_logits": -1.88960862159729, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -14.257400512695312, "logits_per_token": -1.88960862159729, "logits_per_char": -0.1574673851331075, "num_chars": 12}, {"sum_logits": -14.280502319335938, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -14.67365837097168, "logits_per_token": -14.280502319335938, "logits_per_char": -1.0985001784104567, "num_chars": 13}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 700, "native_id": "Mercury_7252245", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.591426372528076, "incorrect_loss_raw": 5.25722599029541, "correct_loss_per_char": 0.583955874809852, "incorrect_loss_per_char": 0.5626363295775193, "correct_loss_per_token": 7.591426372528076, "incorrect_loss_per_token": 4.274250030517578, "correct_loss_uncond": -7.0793328285217285, "incorrect_loss_uncond": -9.09954579671224}, "model_output": [{"sum_logits": -3.837785243988037, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -12.686746597290039, "logits_per_token": -3.837785243988037, "logits_per_char": -0.47972315549850464, "num_chars": 8}, {"sum_logits": -6.036036968231201, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -14.770700454711914, "logits_per_token": -6.036036968231201, "logits_per_char": -0.7545046210289001, "num_chars": 8}, {"sum_logits": -5.897855758666992, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -15.612868309020996, "logits_per_token": -2.948927879333496, "logits_per_char": -0.45368121220515323, "num_chars": 13}, {"sum_logits": -7.591426372528076, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -14.670759201049805, "logits_per_token": -7.591426372528076, "logits_per_char": -0.583955874809852, "num_chars": 13}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 701, "native_id": "MCAS_2002_8_17", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.2186279296875, "incorrect_loss_raw": 21.25548775990804, "correct_loss_per_char": 0.33788808186848956, "incorrect_loss_per_char": 0.5797967294563993, "correct_loss_per_token": 2.0273284912109375, "incorrect_loss_per_token": 2.924278488865605, "correct_loss_uncond": -17.515384674072266, "incorrect_loss_uncond": -8.949872652689615}, "model_output": [{"sum_logits": -24.14284896850586, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.364871978759766, "logits_per_token": -3.0178561210632324, "logits_per_char": -0.5748297373453776, "num_chars": 42}, {"sum_logits": -24.40961456298828, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -31.662200927734375, "logits_per_token": -2.7121793958875866, "logits_per_char": -0.5306437948475713, "num_chars": 46}, {"sum_logits": -15.21399974822998, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -25.589008331298828, "logits_per_token": -3.042799949645996, "logits_per_char": -0.6339166561762491, "num_chars": 24}, {"sum_logits": -16.2186279296875, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.734012603759766, "logits_per_token": -2.0273284912109375, "logits_per_char": -0.33788808186848956, "num_chars": 48}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 702, "native_id": "MDSA_2007_8_30", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.503185272216797, "incorrect_loss_raw": 13.986339569091797, "correct_loss_per_char": 0.4822566168648856, "incorrect_loss_per_char": 0.5412673612585044, "correct_loss_per_token": 3.375796318054199, "incorrect_loss_per_token": 3.6664405398898654, "correct_loss_uncond": -13.372243881225586, "incorrect_loss_uncond": -10.354064305623373}, "model_output": [{"sum_logits": -12.134039878845215, "num_tokens": 3, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -20.855098724365234, "logits_per_token": -4.044679959615071, "logits_per_char": -0.6386336778339586, "num_chars": 19}, {"sum_logits": -11.902871131896973, "num_tokens": 3, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -20.736835479736328, "logits_per_token": -3.967623710632324, "logits_per_char": -0.4251025404248919, "num_chars": 28}, {"sum_logits": -13.503185272216797, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -26.875429153442383, "logits_per_token": -3.375796318054199, "logits_per_char": -0.4822566168648856, "num_chars": 28}, {"sum_logits": -17.922107696533203, "num_tokens": 6, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -31.429277420043945, "logits_per_token": -2.9870179494222007, "logits_per_char": -0.5600658655166626, "num_chars": 32}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 703, "native_id": "NCEOGA_2013_5_35", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 8.357437133789062, "incorrect_loss_raw": 18.30547587076823, "correct_loss_per_char": 0.26959474625126006, "incorrect_loss_per_char": 0.7998874521186686, "correct_loss_per_token": 1.0446796417236328, "incorrect_loss_per_token": 3.2755848778618706, "correct_loss_uncond": -15.028068542480469, "incorrect_loss_uncond": -6.498385747273763}, "model_output": [{"sum_logits": -20.220500946044922, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -26.01824188232422, "logits_per_token": -4.044100189208985, "logits_per_char": -0.9191136793656782, "num_chars": 22}, {"sum_logits": -18.476106643676758, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -26.24298667907715, "logits_per_token": -3.0793511072794595, "logits_per_char": -0.8798146020798456, "num_chars": 21}, {"sum_logits": -16.219820022583008, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -22.15035629272461, "logits_per_token": -2.703303337097168, "logits_per_char": -0.6007340749104818, "num_chars": 27}, {"sum_logits": -8.357437133789062, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -23.38550567626953, "logits_per_token": -1.0446796417236328, "logits_per_char": -0.26959474625126006, "num_chars": 31}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 704, "native_id": "Mercury_7082758", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.875802993774414, "incorrect_loss_raw": 17.605390866597492, "correct_loss_per_char": 0.29926675738710345, "incorrect_loss_per_char": 0.6049043514110424, "correct_loss_per_token": 1.645967165629069, "incorrect_loss_per_token": 3.7648610273996987, "correct_loss_uncond": -16.340784072875977, "incorrect_loss_uncond": -9.697913805643717}, "model_output": [{"sum_logits": -20.788230895996094, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -27.978364944458008, "logits_per_token": -4.157646179199219, "logits_per_char": -0.6929410298665365, "num_chars": 30}, {"sum_logits": -9.875802993774414, "num_tokens": 6, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -26.21658706665039, "logits_per_token": -1.645967165629069, "logits_per_char": -0.29926675738710345, "num_chars": 33}, {"sum_logits": -14.626971244812012, "num_tokens": 4, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -25.18874740600586, "logits_per_token": -3.656742811203003, "logits_per_char": -0.5417396757337782, "num_chars": 27}, {"sum_logits": -17.400970458984375, "num_tokens": 5, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -28.742801666259766, "logits_per_token": -3.480194091796875, "logits_per_char": -0.5800323486328125, "num_chars": 30}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 705, "native_id": "Mercury_7094308", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.916894912719727, "incorrect_loss_raw": 17.31663449605306, "correct_loss_per_char": 0.43376653622358274, "incorrect_loss_per_char": 0.4469360599552403, "correct_loss_per_token": 2.4166992732456754, "incorrect_loss_per_token": 1.805503155968406, "correct_loss_uncond": -13.848442077636719, "incorrect_loss_uncond": -8.876960754394531}, "model_output": [{"sum_logits": -16.290563583374023, "num_tokens": 10, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -25.15289306640625, "logits_per_token": -1.6290563583374023, "logits_per_char": -0.49365344192042493, "num_chars": 33}, {"sum_logits": -16.007051467895508, "num_tokens": 8, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -25.152685165405273, "logits_per_token": -2.0008814334869385, "logits_per_char": -0.41043721712552583, "num_chars": 39}, {"sum_logits": -16.916894912719727, "num_tokens": 7, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -30.765336990356445, "logits_per_token": -2.4166992732456754, "logits_per_char": -0.43376653622358274, "num_chars": 39}, {"sum_logits": -19.65228843688965, "num_tokens": 11, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -28.27520751953125, "logits_per_token": -1.7865716760808772, "logits_per_char": -0.43671752081976994, "num_chars": 45}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 706, "native_id": "Mercury_7136028", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.711841583251953, "incorrect_loss_raw": 7.914484818776448, "correct_loss_per_char": 0.40385660631903286, "incorrect_loss_per_char": 0.33721506182312866, "correct_loss_per_token": 2.9279603958129883, "incorrect_loss_per_token": 1.9032712618509928, "correct_loss_uncond": -16.41509246826172, "incorrect_loss_uncond": -16.01899067560832}, "model_output": [{"sum_logits": -11.711841583251953, "num_tokens": 4, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -28.126934051513672, "logits_per_token": -2.9279603958129883, "logits_per_char": -0.40385660631903286, "num_chars": 29}, {"sum_logits": -9.526461601257324, "num_tokens": 5, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -24.26117706298828, "logits_per_token": -1.9052923202514649, "logits_per_char": -0.34023077147347586, "num_chars": 28}, {"sum_logits": -7.008571147918701, "num_tokens": 5, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -24.33212661743164, "logits_per_token": -1.4017142295837401, "logits_per_char": -0.29202379782994586, "num_chars": 24}, {"sum_logits": -7.20842170715332, "num_tokens": 3, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -23.207122802734375, "logits_per_token": -2.4028072357177734, "logits_per_char": -0.3793906161659642, "num_chars": 19}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 707, "native_id": "Mercury_7159075", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 9.797658920288086, "incorrect_loss_raw": 10.013941764831543, "correct_loss_per_char": 0.46655518668038504, "incorrect_loss_per_char": 0.5051219051514242, "correct_loss_per_token": 4.898829460144043, "incorrect_loss_per_token": 4.535900645785861, "correct_loss_uncond": -10.891904830932617, "incorrect_loss_uncond": -9.466986656188965}, "model_output": [{"sum_logits": -9.83431625366211, "num_tokens": 2, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -19.737829208374023, "logits_per_token": -4.917158126831055, "logits_per_char": -0.5784891913918888, "num_chars": 17}, {"sum_logits": -9.797658920288086, "num_tokens": 2, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -20.689563751220703, "logits_per_token": -4.898829460144043, "logits_per_char": -0.46655518668038504, "num_chars": 21}, {"sum_logits": -8.479264259338379, "num_tokens": 3, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -17.820594787597656, "logits_per_token": -2.8264214197794595, "logits_per_char": -0.4037744885399228, "num_chars": 21}, {"sum_logits": -11.72824478149414, "num_tokens": 2, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -20.884361267089844, "logits_per_token": -5.86412239074707, "logits_per_char": -0.5331020355224609, "num_chars": 22}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 708, "native_id": "MCAS_2015_5_19", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 8.691961288452148, "incorrect_loss_raw": 11.197139422098795, "correct_loss_per_char": 0.7901782989501953, "incorrect_loss_per_char": 0.9697327860585458, "correct_loss_per_token": 4.345980644226074, "incorrect_loss_per_token": 5.03016832139757, "correct_loss_uncond": -8.314804077148438, "incorrect_loss_uncond": -6.969370206197103}, "model_output": [{"sum_logits": -15.46706771850586, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -22.243633270263672, "logits_per_token": -7.73353385925293, "logits_per_char": -1.189774439885066, "num_chars": 13}, {"sum_logits": -8.691961288452148, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -17.006765365600586, "logits_per_token": -4.345980644226074, "logits_per_char": -0.7901782989501953, "num_chars": 11}, {"sum_logits": -7.893125534057617, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -15.379566192626953, "logits_per_token": -3.9465627670288086, "logits_per_char": -0.7893125534057617, "num_chars": 10}, {"sum_logits": -10.23122501373291, "num_tokens": 3, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -16.87632942199707, "logits_per_token": -3.41040833791097, "logits_per_char": -0.93011136488481, "num_chars": 11}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 709, "native_id": "MSA_2012_5_12", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.478482246398926, "incorrect_loss_raw": 16.155891100565594, "correct_loss_per_char": 0.351054898014775, "incorrect_loss_per_char": 0.42027670093891456, "correct_loss_per_token": 1.5797470410664876, "incorrect_loss_per_token": 2.0345004399617515, "correct_loss_uncond": -10.105786323547363, "incorrect_loss_uncond": -9.26497999827067}, "model_output": [{"sum_logits": -9.478482246398926, "num_tokens": 6, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -19.58426856994629, "logits_per_token": -1.5797470410664876, "logits_per_char": -0.351054898014775, "num_chars": 27}, {"sum_logits": -10.051209449768066, "num_tokens": 7, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -17.90290641784668, "logits_per_token": -1.435887064252581, "logits_per_char": -0.3242325628957441, "num_chars": 31}, {"sum_logits": -12.572225570678711, "num_tokens": 7, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -24.28839874267578, "logits_per_token": -1.796032224382673, "logits_per_char": -0.34922848807440865, "num_chars": 36}, {"sum_logits": -25.84423828125, "num_tokens": 9, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -34.07130813598633, "logits_per_token": -2.87158203125, "logits_per_char": -0.5873690518465909, "num_chars": 44}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 710, "native_id": "MCAS_2014_5_13", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 20.10323715209961, "incorrect_loss_raw": 24.80102475484212, "correct_loss_per_char": 0.4188174406687419, "incorrect_loss_per_char": 0.5741802186676951, "correct_loss_per_token": 2.2336930168999567, "incorrect_loss_per_token": 2.6649349989714444, "correct_loss_uncond": -22.869571685791016, "incorrect_loss_uncond": -19.27594820658366}, "model_output": [{"sum_logits": -19.33121109008789, "num_tokens": 9, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -38.427337646484375, "logits_per_token": -2.147912343343099, "logits_per_char": -0.48328027725219724, "num_chars": 40}, {"sum_logits": -24.498292922973633, "num_tokens": 10, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -43.50719451904297, "logits_per_token": -2.4498292922973635, "logits_per_char": -0.544406509399414, "num_chars": 45}, {"sum_logits": -30.573570251464844, "num_tokens": 9, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -50.29638671875, "logits_per_token": -3.3970633612738714, "logits_per_char": -0.6948538693514738, "num_chars": 44}, {"sum_logits": -20.10323715209961, "num_tokens": 9, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -42.972808837890625, "logits_per_token": -2.2336930168999567, "logits_per_char": -0.4188174406687419, "num_chars": 48}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 711, "native_id": "Mercury_SC_400392", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 38.43963623046875, "incorrect_loss_raw": 26.37958017985026, "correct_loss_per_char": 1.0677676730685763, "incorrect_loss_per_char": 0.7628500766675416, "correct_loss_per_token": 4.804954528808594, "incorrect_loss_per_token": 3.59152474857512, "correct_loss_uncond": -4.537971496582031, "incorrect_loss_uncond": -6.579800923665364}, "model_output": [{"sum_logits": -28.82939910888672, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -33.24629592895508, "logits_per_token": -4.118485586983817, "logits_per_char": -0.90091872215271, "num_chars": 32}, {"sum_logits": -20.57557487487793, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -31.082843780517578, "logits_per_token": -2.9393678392682756, "logits_per_char": -0.6051639669081744, "num_chars": 34}, {"sum_logits": -38.43963623046875, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -42.97760772705078, "logits_per_token": -4.804954528808594, "logits_per_char": -1.0677676730685763, "num_chars": 36}, {"sum_logits": -29.733766555786133, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -34.54900360107422, "logits_per_token": -3.7167208194732666, "logits_per_char": -0.7824675409417403, "num_chars": 38}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 712, "native_id": "Mercury_7159320", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.786170959472656, "incorrect_loss_raw": 9.334099451700846, "correct_loss_per_char": 0.799135684967041, "incorrect_loss_per_char": 0.5548183034967493, "correct_loss_per_token": 4.262056986490886, "incorrect_loss_per_token": 3.9815398322211366, "correct_loss_uncond": -11.076833724975586, "incorrect_loss_uncond": -11.58149528503418}, "model_output": [{"sum_logits": -12.786170959472656, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -23.863004684448242, "logits_per_token": -4.262056986490886, "logits_per_char": -0.799135684967041, "num_chars": 16}, {"sum_logits": -4.781633377075195, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -19.02826690673828, "logits_per_token": -2.3908166885375977, "logits_per_char": -0.2988520860671997, "num_chars": 16}, {"sum_logits": -10.881486892700195, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -20.048511505126953, "logits_per_token": -5.440743446350098, "logits_per_char": -0.6800929307937622, "num_chars": 16}, {"sum_logits": -12.339178085327148, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -23.670005798339844, "logits_per_token": -4.113059361775716, "logits_per_char": -0.685509893629286, "num_chars": 18}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 713, "native_id": "Mercury_7218365", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.5377588272094727, "incorrect_loss_raw": 3.0509671370188394, "correct_loss_per_char": 0.2196798324584961, "incorrect_loss_per_char": 0.43830899638358994, "correct_loss_per_token": 1.5377588272094727, "incorrect_loss_per_token": 3.0509671370188394, "correct_loss_uncond": -13.8490571975708, "incorrect_loss_uncond": -11.2846200466156}, "model_output": [{"sum_logits": -2.465989589691162, "num_tokens": 1, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -12.625795364379883, "logits_per_token": -2.465989589691162, "logits_per_char": -0.4931979179382324, "num_chars": 5}, {"sum_logits": -1.5377588272094727, "num_tokens": 1, "num_tokens_all": 215, "is_greedy": true, "sum_logits_uncond": -15.386816024780273, "logits_per_token": -1.5377588272094727, "logits_per_char": -0.2196798324584961, "num_chars": 7}, {"sum_logits": -2.8225295543670654, "num_tokens": 1, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -13.745674133300781, "logits_per_token": -2.8225295543670654, "logits_per_char": -0.4704215923945109, "num_chars": 6}, {"sum_logits": -3.864382266998291, "num_tokens": 1, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -16.635292053222656, "logits_per_token": -3.864382266998291, "logits_per_char": -0.35130747881802643, "num_chars": 11}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 714, "native_id": "MCAS_2004_9_10-v1", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 6.699636936187744, "incorrect_loss_raw": 12.433757305145264, "correct_loss_per_char": 2.2332123120625815, "incorrect_loss_per_char": 2.6078418777102517, "correct_loss_per_token": 6.699636936187744, "incorrect_loss_per_token": 8.399804592132568, "correct_loss_uncond": -3.1726126670837402, "incorrect_loss_uncond": -1.5441702206929524}, "model_output": [{"sum_logits": -5.922712802886963, "num_tokens": 1, "num_tokens_all": 260, "is_greedy": false, "sum_logits_uncond": -8.377655029296875, "logits_per_token": -5.922712802886963, "logits_per_char": -1.974237600962321, "num_chars": 3}, {"sum_logits": -7.174842834472656, "num_tokens": 1, "num_tokens_all": 260, "is_greedy": false, "sum_logits_uncond": -9.133262634277344, "logits_per_token": -7.174842834472656, "logits_per_char": -2.3916142781575522, "num_chars": 3}, {"sum_logits": -6.699636936187744, "num_tokens": 1, "num_tokens_all": 260, "is_greedy": false, "sum_logits_uncond": -9.872249603271484, "logits_per_token": -6.699636936187744, "logits_per_char": -2.2332123120625815, "num_chars": 3}, {"sum_logits": -24.203716278076172, "num_tokens": 2, "num_tokens_all": 261, "is_greedy": false, "sum_logits_uncond": -24.42286491394043, "logits_per_token": -12.101858139038086, "logits_per_char": -3.4576737540108815, "num_chars": 7}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 715, "native_id": "AIMS_2009_4_12", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 20.484710693359375, "incorrect_loss_raw": 19.822923342386883, "correct_loss_per_char": 0.35938088935718204, "incorrect_loss_per_char": 0.4881133142143789, "correct_loss_per_token": 1.8622464266690342, "incorrect_loss_per_token": 2.1813675059212576, "correct_loss_uncond": -19.25072479248047, "incorrect_loss_uncond": -14.684926668802897}, "model_output": [{"sum_logits": -15.287348747253418, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -29.603708267211914, "logits_per_token": -1.9109185934066772, "logits_per_char": -0.4632529923410127, "num_chars": 33}, {"sum_logits": -19.353761672973633, "num_tokens": 9, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -31.796886444091797, "logits_per_token": -2.150417963663737, "logits_per_char": -0.4838440418243408, "num_chars": 40}, {"sum_logits": -24.827659606933594, "num_tokens": 10, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -42.122955322265625, "logits_per_token": -2.482765960693359, "logits_per_char": -0.5172429084777832, "num_chars": 48}, {"sum_logits": -20.484710693359375, "num_tokens": 11, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -39.735435485839844, "logits_per_token": -1.8622464266690342, "logits_per_char": -0.35938088935718204, "num_chars": 57}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 716, "native_id": "Mercury_SC_414274", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 14.1051025390625, "incorrect_loss_raw": 24.877750396728516, "correct_loss_per_char": 0.3616692958733974, "incorrect_loss_per_char": 0.6200086138069928, "correct_loss_per_token": 1.5672336154513888, "incorrect_loss_per_token": 2.636414307135123, "correct_loss_uncond": -11.368627548217773, "incorrect_loss_uncond": -9.256771087646484}, "model_output": [{"sum_logits": -23.74213218688965, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -30.71407699584961, "logits_per_token": -2.967766523361206, "logits_per_char": -0.7658752318351499, "num_chars": 31}, {"sum_logits": -14.1051025390625, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -25.473730087280273, "logits_per_token": -1.5672336154513888, "logits_per_char": -0.3616692958733974, "num_chars": 39}, {"sum_logits": -25.21979331970215, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -35.96354293823242, "logits_per_token": -2.8021992577446833, "logits_per_char": -0.6004712695167178, "num_chars": 42}, {"sum_logits": -25.67132568359375, "num_tokens": 12, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -35.72594451904297, "logits_per_token": -2.139277140299479, "logits_per_char": -0.4936793400691106, "num_chars": 52}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 717, "native_id": "MCAS_2005_9_6", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 6.535670757293701, "incorrect_loss_raw": 7.703032811482747, "correct_loss_per_char": 0.9336672510419574, "incorrect_loss_per_char": 1.1583052211337619, "correct_loss_per_token": 3.2678353786468506, "incorrect_loss_per_token": 3.421407381693522, "correct_loss_uncond": -11.535824298858643, "incorrect_loss_uncond": -10.308791478474935}, "model_output": [{"sum_logits": -7.291867256164551, "num_tokens": 2, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -17.409976959228516, "logits_per_token": -3.6459336280822754, "logits_per_char": -1.2153112093607585, "num_chars": 6}, {"sum_logits": -8.075268745422363, "num_tokens": 2, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -17.87670135498047, "logits_per_token": -4.037634372711182, "logits_per_char": -1.1536098207746233, "num_chars": 7}, {"sum_logits": -6.535670757293701, "num_tokens": 2, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -18.071495056152344, "logits_per_token": -3.2678353786468506, "logits_per_char": -0.9336672510419574, "num_chars": 7}, {"sum_logits": -7.741962432861328, "num_tokens": 3, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -18.748794555664062, "logits_per_token": -2.5806541442871094, "logits_per_char": -1.105994633265904, "num_chars": 7}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 718, "native_id": "MCAS_1998_4_23", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.77857780456543, "incorrect_loss_raw": 19.920228322347004, "correct_loss_per_char": 0.7891943026811649, "incorrect_loss_per_char": 0.5172213109334309, "correct_loss_per_token": 3.4198419782850475, "incorrect_loss_per_token": 2.397155216761998, "correct_loss_uncond": -9.153684616088867, "incorrect_loss_uncond": -9.47689692179362}, "model_output": [{"sum_logits": -30.77857780456543, "num_tokens": 9, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -39.9322624206543, "logits_per_token": -3.4198419782850475, "logits_per_char": -0.7891943026811649, "num_chars": 39}, {"sum_logits": -25.906169891357422, "num_tokens": 11, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -33.84318161010742, "logits_per_token": -2.3551063537597656, "logits_per_char": -0.5181233978271484, "num_chars": 50}, {"sum_logits": -16.764720916748047, "num_tokens": 7, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -29.773204803466797, "logits_per_token": -2.3949601309640065, "logits_per_char": -0.5588240305582682, "num_chars": 30}, {"sum_logits": -17.089794158935547, "num_tokens": 7, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -24.574989318847656, "logits_per_token": -2.441399165562221, "logits_per_char": -0.4747165044148763, "num_chars": 36}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 719, "native_id": "Mercury_7075023", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 28.251689910888672, "incorrect_loss_raw": 22.96227518717448, "correct_loss_per_char": 0.6420838616111062, "incorrect_loss_per_char": 0.5745296257692059, "correct_loss_per_token": 4.035955701555524, "incorrect_loss_per_token": 3.2598200071425665, "correct_loss_uncond": -5.549968719482422, "incorrect_loss_uncond": -4.801217397054036}, "model_output": [{"sum_logits": -24.544477462768555, "num_tokens": 7, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -29.1664981842041, "logits_per_token": -3.5063539232526506, "logits_per_char": -0.5986457917748428, "num_chars": 41}, {"sum_logits": -17.527502059936523, "num_tokens": 6, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -22.25421905517578, "logits_per_token": -2.921250343322754, "logits_per_char": -0.5155147664687213, "num_chars": 34}, {"sum_logits": -26.81484603881836, "num_tokens": 8, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -31.869760513305664, "logits_per_token": -3.351855754852295, "logits_per_char": -0.6094283190640536, "num_chars": 44}, {"sum_logits": -28.251689910888672, "num_tokens": 7, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -33.801658630371094, "logits_per_token": -4.035955701555524, "logits_per_char": -0.6420838616111062, "num_chars": 44}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 720, "native_id": "Mercury_SC_400182", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.134476661682129, "incorrect_loss_raw": 7.59617551167806, "correct_loss_per_char": 1.3557461102803547, "incorrect_loss_per_char": 0.8086061911149458, "correct_loss_per_token": 8.134476661682129, "incorrect_loss_per_token": 7.59617551167806, "correct_loss_uncond": -4.037688255310059, "incorrect_loss_uncond": -5.314761479695638}, "model_output": [{"sum_logits": -4.657878875732422, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -13.264041900634766, "logits_per_token": -4.657878875732422, "logits_per_char": -0.5822348594665527, "num_chars": 8}, {"sum_logits": -8.134476661682129, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -12.172164916992188, "logits_per_token": -8.134476661682129, "logits_per_char": -1.3557461102803547, "num_chars": 6}, {"sum_logits": -8.46116828918457, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -13.054412841796875, "logits_per_token": -8.46116828918457, "logits_per_char": -0.7691971171985973, "num_chars": 11}, {"sum_logits": -9.669479370117188, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -12.414356231689453, "logits_per_token": -9.669479370117188, "logits_per_char": -1.0743865966796875, "num_chars": 9}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 721, "native_id": "Mercury_SC_400133", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.146198272705078, "incorrect_loss_raw": 7.921815872192383, "correct_loss_per_char": 0.47641321818033855, "incorrect_loss_per_char": 0.48168206744723857, "correct_loss_per_token": 1.7865495681762695, "incorrect_loss_per_token": 2.33879288037618, "correct_loss_uncond": -15.54638671875, "incorrect_loss_uncond": -12.988925298055014}, "model_output": [{"sum_logits": -4.541182518005371, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -16.36111831665039, "logits_per_token": -1.5137275060017903, "logits_per_char": -0.3027455012003581, "num_chars": 15}, {"sum_logits": -7.146198272705078, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -22.692584991455078, "logits_per_token": -1.7865495681762695, "logits_per_char": -0.47641321818033855, "num_chars": 15}, {"sum_logits": -10.865246772766113, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -23.741975784301758, "logits_per_token": -2.7163116931915283, "logits_per_char": -0.7243497848510743, "num_chars": 15}, {"sum_logits": -8.359018325805664, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -22.62912940979004, "logits_per_token": -2.786339441935221, "logits_per_char": -0.4179509162902832, "num_chars": 20}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 722, "native_id": "MSA_2013_5_11", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.61562967300415, "incorrect_loss_raw": 5.590022881825765, "correct_loss_per_char": 0.5077086448669433, "incorrect_loss_per_char": 0.43846639703821255, "correct_loss_per_token": 2.538543224334717, "incorrect_loss_per_token": 2.1950092580583362, "correct_loss_uncond": -14.60178518295288, "incorrect_loss_uncond": -13.62149985631307}, "model_output": [{"sum_logits": -3.905441999435425, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -17.83965492248535, "logits_per_token": -1.9527209997177124, "logits_per_char": -0.43393799993726945, "num_chars": 9}, {"sum_logits": -2.064587354660034, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -15.407942771911621, "logits_per_token": -1.032293677330017, "logits_per_char": -0.2064587354660034, "num_chars": 10}, {"sum_logits": -10.800039291381836, "num_tokens": 3, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -24.38697052001953, "logits_per_token": -3.600013097127279, "logits_per_char": -0.6750024557113647, "num_chars": 16}, {"sum_logits": -7.61562967300415, "num_tokens": 3, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -22.21741485595703, "logits_per_token": -2.538543224334717, "logits_per_char": -0.5077086448669433, "num_chars": 15}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 723, "native_id": "Mercury_SC_408706", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 1.81304132938385, "incorrect_loss_raw": 2.893392562866211, "correct_loss_per_char": 0.36260826587677003, "incorrect_loss_per_char": 0.3665971076046979, "correct_loss_per_token": 1.81304132938385, "incorrect_loss_per_token": 2.893392562866211, "correct_loss_uncond": -8.26965081691742, "incorrect_loss_uncond": -7.773975690205892}, "model_output": [{"sum_logits": -3.04065203666687, "num_tokens": 1, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -10.553168296813965, "logits_per_token": -3.04065203666687, "logits_per_char": -0.5067753394444784, "num_chars": 6}, {"sum_logits": -1.81304132938385, "num_tokens": 1, "num_tokens_all": 190, "is_greedy": true, "sum_logits_uncond": -10.08269214630127, "logits_per_token": -1.81304132938385, "logits_per_char": -0.36260826587677003, "num_chars": 5}, {"sum_logits": -3.023818016052246, "num_tokens": 1, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -10.294790267944336, "logits_per_token": -3.023818016052246, "logits_per_char": -0.3023818016052246, "num_chars": 10}, {"sum_logits": -2.6157076358795166, "num_tokens": 1, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -11.154146194458008, "logits_per_token": -2.6157076358795166, "logits_per_char": -0.2906341817643907, "num_chars": 9}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 724, "native_id": "Mercury_7213325", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 8.314695358276367, "incorrect_loss_raw": 13.769046783447266, "correct_loss_per_char": 0.30795167993616174, "incorrect_loss_per_char": 0.537684488051977, "correct_loss_per_token": 1.6629390716552734, "incorrect_loss_per_token": 3.2661283016204834, "correct_loss_uncond": -18.903791427612305, "incorrect_loss_uncond": -12.062718073527018}, "model_output": [{"sum_logits": -14.932893753051758, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -24.2480525970459, "logits_per_token": -3.7332234382629395, "logits_per_char": -0.5743420674250677, "num_chars": 26}, {"sum_logits": -15.806242942810059, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -27.637157440185547, "logits_per_token": -3.9515607357025146, "logits_per_char": -0.6322497177124023, "num_chars": 25}, {"sum_logits": -10.56800365447998, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -25.610084533691406, "logits_per_token": -2.113600730895996, "logits_per_char": -0.4064616790184608, "num_chars": 26}, {"sum_logits": -8.314695358276367, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -27.218486785888672, "logits_per_token": -1.6629390716552734, "logits_per_char": -0.30795167993616174, "num_chars": 27}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 725, "native_id": "Mercury_SC_LBS10932", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.459920883178711, "incorrect_loss_raw": 19.127047856648762, "correct_loss_per_char": 0.4381794207023852, "incorrect_loss_per_char": 0.5061868579299361, "correct_loss_per_token": 2.4099868138631186, "incorrect_loss_per_token": 2.3964121594000116, "correct_loss_uncond": -18.343538284301758, "incorrect_loss_uncond": -17.31158383687337}, "model_output": [{"sum_logits": -19.58399200439453, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -35.71218490600586, "logits_per_token": -2.797713143484933, "logits_per_char": -0.6119997501373291, "num_chars": 32}, {"sum_logits": -14.459920883178711, "num_tokens": 6, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -32.80345916748047, "logits_per_token": -2.4099868138631186, "logits_per_char": -0.4381794207023852, "num_chars": 33}, {"sum_logits": -23.984683990478516, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -39.52252197265625, "logits_per_token": -2.664964887830946, "logits_per_char": -0.5996170997619629, "num_chars": 40}, {"sum_logits": -13.812467575073242, "num_tokens": 8, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -34.0811882019043, "logits_per_token": -1.7265584468841553, "logits_per_char": -0.3069437238905165, "num_chars": 45}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 726, "native_id": "Mercury_192220", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 2.051225185394287, "incorrect_loss_raw": 8.074725985527039, "correct_loss_per_char": 0.2564031481742859, "incorrect_loss_per_char": 0.6759723732905218, "correct_loss_per_token": 2.051225185394287, "incorrect_loss_per_token": 3.391222139199575, "correct_loss_uncond": -10.839428424835205, "incorrect_loss_uncond": -6.756238420804341}, "model_output": [{"sum_logits": -2.7457478046417236, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -12.376590728759766, "logits_per_token": -2.7457478046417236, "logits_per_char": -0.3922496863773891, "num_chars": 7}, {"sum_logits": -1.6106513738632202, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": true, "sum_logits_uncond": -12.253585815429688, "logits_per_token": -0.8053256869316101, "logits_per_char": -0.10737675825754801, "num_chars": 15}, {"sum_logits": -19.867778778076172, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -19.862716674804688, "logits_per_token": -6.622592926025391, "logits_per_char": -1.5282906752366285, "num_chars": 13}, {"sum_logits": -2.051225185394287, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -12.890653610229492, "logits_per_token": -2.051225185394287, "logits_per_char": -0.2564031481742859, "num_chars": 8}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 727, "native_id": "Mercury_SC_407247", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.395179748535156, "incorrect_loss_raw": 11.712845166524252, "correct_loss_per_char": 0.3929372327081088, "incorrect_loss_per_char": 0.4065386643508804, "correct_loss_per_token": 2.848794937133789, "incorrect_loss_per_token": 2.34256903330485, "correct_loss_uncond": -8.982648849487305, "incorrect_loss_uncond": -10.327823321024576}, "model_output": [{"sum_logits": -13.951094627380371, "num_tokens": 5, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -24.104063034057617, "logits_per_token": -2.7902189254760743, "logits_per_char": -0.48107222853035764, "num_chars": 29}, {"sum_logits": -9.662384033203125, "num_tokens": 5, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -21.027023315429688, "logits_per_token": -1.932476806640625, "logits_per_char": -0.31168980752268144, "num_chars": 31}, {"sum_logits": -11.395179748535156, "num_tokens": 4, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -20.37782859802246, "logits_per_token": -2.848794937133789, "logits_per_char": -0.3929372327081088, "num_chars": 29}, {"sum_logits": -11.525056838989258, "num_tokens": 5, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -20.99091911315918, "logits_per_token": -2.3050113677978517, "logits_per_char": -0.42685395699960216, "num_chars": 27}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 728, "native_id": "Mercury_7024798", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.8904266357421875, "incorrect_loss_raw": 4.49224050839742, "correct_loss_per_char": 0.48904266357421877, "incorrect_loss_per_char": 0.5656258647059981, "correct_loss_per_token": 2.4452133178710938, "incorrect_loss_per_token": 3.7595587571461997, "correct_loss_uncond": -11.866409301757812, "incorrect_loss_uncond": -10.336392482121786}, "model_output": [{"sum_logits": -3.530748128890991, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -14.572683334350586, "logits_per_token": -3.530748128890991, "logits_per_char": -0.5043925898415702, "num_chars": 7}, {"sum_logits": -5.549882888793945, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -13.00870132446289, "logits_per_token": -5.549882888793945, "logits_per_char": -0.7928404126848493, "num_chars": 7}, {"sum_logits": -4.396090507507324, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -16.90451431274414, "logits_per_token": -2.198045253753662, "logits_per_char": -0.39964459159157495, "num_chars": 11}, {"sum_logits": -4.8904266357421875, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -16.7568359375, "logits_per_token": -2.4452133178710938, "logits_per_char": -0.48904266357421877, "num_chars": 10}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 729, "native_id": "Mercury_7180810", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.140426635742188, "incorrect_loss_raw": 15.277434666951498, "correct_loss_per_char": 0.4247480693616365, "incorrect_loss_per_char": 0.3644311493408608, "correct_loss_per_token": 2.6900711059570312, "incorrect_loss_per_token": 2.01890162059239, "correct_loss_uncond": -18.831527709960938, "incorrect_loss_uncond": -21.120181401570637}, "model_output": [{"sum_logits": -11.451126098632812, "num_tokens": 8, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -36.93325424194336, "logits_per_token": -1.4313907623291016, "logits_per_char": -0.2245318842869179, "num_chars": 51}, {"sum_logits": -16.03183364868164, "num_tokens": 8, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -36.956233978271484, "logits_per_token": -2.003979206085205, "logits_per_char": -0.3728333406670149, "num_chars": 43}, {"sum_logits": -16.140426635742188, "num_tokens": 6, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -34.971954345703125, "logits_per_token": -2.6900711059570312, "logits_per_char": -0.4247480693616365, "num_chars": 38}, {"sum_logits": -18.34934425354004, "num_tokens": 7, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -35.30335998535156, "logits_per_token": -2.621334893362863, "logits_per_char": -0.4959282230686497, "num_chars": 37}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 730, "native_id": "Mercury_412780", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.742780685424805, "incorrect_loss_raw": 36.4543342590332, "correct_loss_per_char": 0.435496677051891, "incorrect_loss_per_char": 0.5100492568359533, "correct_loss_per_token": 2.2109831296480618, "incorrect_loss_per_token": 2.5180997473561866, "correct_loss_uncond": -17.89317512512207, "incorrect_loss_uncond": -13.759686787923178}, "model_output": [{"sum_logits": -25.68465805053711, "num_tokens": 13, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -43.7322998046875, "logits_per_token": -1.9757429269643931, "logits_per_char": -0.40769298492916045, "num_chars": 63}, {"sum_logits": -28.742780685424805, "num_tokens": 13, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -46.635955810546875, "logits_per_token": -2.2109831296480618, "logits_per_char": -0.435496677051891, "num_chars": 66}, {"sum_logits": -39.6199951171875, "num_tokens": 15, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -51.81446838378906, "logits_per_token": -2.6413330078125, "logits_per_char": -0.5427396591395548, "num_chars": 73}, {"sum_logits": -44.058349609375, "num_tokens": 15, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -55.09529495239258, "logits_per_token": -2.9372233072916667, "logits_per_char": -0.5797151264391447, "num_chars": 76}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 731, "native_id": "LEAP_2011_8_10434", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 46.1012077331543, "incorrect_loss_raw": 31.485636393229168, "correct_loss_per_char": 0.7317652021135602, "incorrect_loss_per_char": 0.5232628581756981, "correct_loss_per_token": 3.8417673110961914, "incorrect_loss_per_token": 2.5517486507057123, "correct_loss_uncond": -8.132274627685547, "incorrect_loss_uncond": -9.452568054199219}, "model_output": [{"sum_logits": -33.72145080566406, "num_tokens": 13, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -43.275299072265625, "logits_per_token": -2.593957754281851, "logits_per_char": -0.5438943678332914, "num_chars": 62}, {"sum_logits": -46.1012077331543, "num_tokens": 12, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -54.233482360839844, "logits_per_token": -3.8417673110961914, "logits_per_char": -0.7317652021135602, "num_chars": 63}, {"sum_logits": -37.00783157348633, "num_tokens": 12, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -43.284149169921875, "logits_per_token": -3.083985964457194, "logits_per_char": -0.6167971928914388, "num_chars": 60}, {"sum_logits": -23.72762680053711, "num_tokens": 12, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -36.255165100097656, "logits_per_token": -1.9773022333780925, "logits_per_char": -0.40909701380236396, "num_chars": 58}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 732, "native_id": "Mercury_7200340", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.921899795532227, "incorrect_loss_raw": 13.618229230244955, "correct_loss_per_char": 0.34804749488830566, "incorrect_loss_per_char": 0.4296481271281319, "correct_loss_per_token": 1.5468777550591364, "incorrect_loss_per_token": 1.9065806464543422, "correct_loss_uncond": -18.084508895874023, "incorrect_loss_uncond": -17.312254905700684}, "model_output": [{"sum_logits": -5.997856140136719, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -21.730066299438477, "logits_per_token": -1.1995712280273438, "logits_per_char": -0.2607763539189878, "num_chars": 23}, {"sum_logits": -20.38646697998047, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -38.31245803833008, "logits_per_token": -2.9123524257114957, "logits_per_char": -0.6370770931243896, "num_chars": 32}, {"sum_logits": -14.470364570617676, "num_tokens": 9, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -32.74892807006836, "logits_per_token": -1.6078182856241863, "logits_per_char": -0.39109093434101827, "num_chars": 37}, {"sum_logits": -13.921899795532227, "num_tokens": 9, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -32.00640869140625, "logits_per_token": -1.5468777550591364, "logits_per_char": -0.34804749488830566, "num_chars": 40}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 733, "native_id": "Mercury_7056525", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 20.493667602539062, "incorrect_loss_raw": 26.947739283243816, "correct_loss_per_char": 0.6831222534179687, "incorrect_loss_per_char": 0.6284460093532996, "correct_loss_per_token": 3.4156112670898438, "incorrect_loss_per_token": 3.6520501545497353, "correct_loss_uncond": -12.175342559814453, "incorrect_loss_uncond": -13.32710075378418}, "model_output": [{"sum_logits": -33.201316833496094, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -52.297122955322266, "logits_per_token": -4.150164604187012, "logits_per_char": -0.6775778945611448, "num_chars": 49}, {"sum_logits": -26.07166862487793, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.548439025878906, "logits_per_token": -3.7245240892682756, "logits_per_char": -0.6685043237148187, "num_chars": 39}, {"sum_logits": -20.493667602539062, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -32.669010162353516, "logits_per_token": -3.4156112670898438, "logits_per_char": -0.6831222534179687, "num_chars": 30}, {"sum_logits": -21.570232391357422, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -34.97895812988281, "logits_per_token": -3.0814617701939175, "logits_per_char": -0.5392558097839355, "num_chars": 40}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 734, "native_id": "Mercury_7085278", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 17.049955368041992, "incorrect_loss_raw": 16.76104958852132, "correct_loss_per_char": 0.6314798284459997, "incorrect_loss_per_char": 0.6575907900976756, "correct_loss_per_token": 2.4357079097202847, "incorrect_loss_per_token": 3.1834671550326874, "correct_loss_uncond": -18.775007247924805, "incorrect_loss_uncond": -13.716944694519043}, "model_output": [{"sum_logits": -15.919748306274414, "num_tokens": 5, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -27.496868133544922, "logits_per_token": -3.183949661254883, "logits_per_char": -0.6633228460947672, "num_chars": 24}, {"sum_logits": -19.176551818847656, "num_tokens": 5, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -34.78403091430664, "logits_per_token": -3.835310363769531, "logits_per_char": -0.7670620727539063, "num_chars": 25}, {"sum_logits": -15.186848640441895, "num_tokens": 6, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -29.15308380126953, "logits_per_token": -2.531141440073649, "logits_per_char": -0.5423874514443534, "num_chars": 28}, {"sum_logits": -17.049955368041992, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -35.8249626159668, "logits_per_token": -2.4357079097202847, "logits_per_char": -0.6314798284459997, "num_chars": 27}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 735, "native_id": "AKDE&ED_2008_4_35", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.251670837402344, "incorrect_loss_raw": 15.300432205200195, "correct_loss_per_char": 0.45399133782637746, "incorrect_loss_per_char": 0.42414606137383254, "correct_loss_per_token": 2.464524405343192, "incorrect_loss_per_token": 2.1857760293143134, "correct_loss_uncond": -14.50252914428711, "incorrect_loss_uncond": -13.976378122965494}, "model_output": [{"sum_logits": -15.88661003112793, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -28.215116500854492, "logits_per_token": -2.269515718732561, "logits_per_char": -0.45390314374651225, "num_chars": 35}, {"sum_logits": -12.7125244140625, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -29.29906463623047, "logits_per_token": -1.8160749162946428, "logits_per_char": -0.3632149832589286, "num_chars": 35}, {"sum_logits": -17.302162170410156, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -30.31624984741211, "logits_per_token": -2.4717374529157365, "logits_per_char": -0.4553200571160567, "num_chars": 38}, {"sum_logits": -17.251670837402344, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -31.754199981689453, "logits_per_token": -2.464524405343192, "logits_per_char": -0.45399133782637746, "num_chars": 38}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 736, "native_id": "MCAS_1999_8_16", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 30.578060150146484, "incorrect_loss_raw": 27.994656880696613, "correct_loss_per_char": 0.6370429197947184, "incorrect_loss_per_char": 0.7564473310978901, "correct_loss_per_token": 3.8222575187683105, "incorrect_loss_per_token": 3.500521392418594, "correct_loss_uncond": -20.70345687866211, "incorrect_loss_uncond": -19.469991048177082}, "model_output": [{"sum_logits": -23.510536193847656, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -39.722110748291016, "logits_per_token": -3.358648027692522, "logits_per_char": -0.8107081446154364, "num_chars": 29}, {"sum_logits": -29.970947265625, "num_tokens": 9, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -50.32599639892578, "logits_per_token": -3.330105251736111, "logits_per_char": -0.749273681640625, "num_chars": 40}, {"sum_logits": -30.502487182617188, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -52.3458366394043, "logits_per_token": -3.8128108978271484, "logits_per_char": -0.709360167037609, "num_chars": 43}, {"sum_logits": -30.578060150146484, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -51.281517028808594, "logits_per_token": -3.8222575187683105, "logits_per_char": -0.6370429197947184, "num_chars": 48}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 737, "native_id": "Mercury_SC_400063", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.334150314331055, "incorrect_loss_raw": 8.258989810943604, "correct_loss_per_char": 1.121286392211914, "incorrect_loss_per_char": 0.7000565439948946, "correct_loss_per_token": 6.167075157165527, "incorrect_loss_per_token": 3.0576408439212375, "correct_loss_uncond": -3.9550838470458984, "incorrect_loss_uncond": -8.36817185084025}, "model_output": [{"sum_logits": -5.094060897827148, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -17.391294479370117, "logits_per_token": -1.698020299275716, "logits_per_char": -0.5094060897827148, "num_chars": 10}, {"sum_logits": -5.483596324920654, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -15.731874465942383, "logits_per_token": -2.741798162460327, "logits_per_char": -0.49850875681096857, "num_chars": 11}, {"sum_logits": -14.199312210083008, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -16.758316040039062, "logits_per_token": -4.733104070027669, "logits_per_char": -1.0922547853910005, "num_chars": 13}, {"sum_logits": -12.334150314331055, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -16.289234161376953, "logits_per_token": -6.167075157165527, "logits_per_char": -1.121286392211914, "num_chars": 11}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 738, "native_id": "Mercury_SC_401666", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.06794548034668, "incorrect_loss_raw": 19.820178349812824, "correct_loss_per_char": 0.43426879676612645, "incorrect_loss_per_char": 0.6651641692950095, "correct_loss_per_token": 2.6779909133911133, "incorrect_loss_per_token": 2.9819155042133634, "correct_loss_uncond": -14.068912506103516, "incorrect_loss_uncond": -10.046720504760742}, "model_output": [{"sum_logits": -16.829875946044922, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -27.242631912231445, "logits_per_token": -2.4042679922921315, "logits_per_char": -0.6010669980730329, "num_chars": 28}, {"sum_logits": -18.95814323425293, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -26.68186378479004, "logits_per_token": -3.159690539042155, "logits_per_char": -0.6770765440804618, "num_chars": 28}, {"sum_logits": -23.672515869140625, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -35.67620086669922, "logits_per_token": -3.3817879813058034, "logits_per_char": -0.7173489657315341, "num_chars": 33}, {"sum_logits": -16.06794548034668, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -30.136857986450195, "logits_per_token": -2.6779909133911133, "logits_per_char": -0.43426879676612645, "num_chars": 37}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 739, "native_id": "TIMSS_2011_8_pg31", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 10.619351387023926, "incorrect_loss_raw": 14.15293025970459, "correct_loss_per_char": 0.3933093106305158, "incorrect_loss_per_char": 0.4723864312605424, "correct_loss_per_token": 1.769891897837321, "incorrect_loss_per_token": 2.3394545555114745, "correct_loss_uncond": -13.257498741149902, "incorrect_loss_uncond": -13.843938509623209}, "model_output": [{"sum_logits": -10.658061027526855, "num_tokens": 5, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -24.822025299072266, "logits_per_token": -2.131612205505371, "logits_per_char": -0.4844573194330389, "num_chars": 22}, {"sum_logits": -12.15546989440918, "num_tokens": 5, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -27.70379066467285, "logits_per_token": -2.4310939788818358, "logits_per_char": -0.4862187957763672, "num_chars": 25}, {"sum_logits": -10.619351387023926, "num_tokens": 6, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -23.876850128173828, "logits_per_token": -1.769891897837321, "logits_per_char": -0.3933093106305158, "num_chars": 27}, {"sum_logits": -19.645259857177734, "num_tokens": 8, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -31.46479034423828, "logits_per_token": -2.455657482147217, "logits_per_char": -0.44648317857222125, "num_chars": 44}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 740, "native_id": "Mercury_412673", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.643874168395996, "incorrect_loss_raw": 15.716763496398926, "correct_loss_per_char": 0.6643874168395996, "incorrect_loss_per_char": 0.9262307988437314, "correct_loss_per_token": 2.2146247227986655, "incorrect_loss_per_token": 2.6587574534946015, "correct_loss_uncond": -7.79927921295166, "incorrect_loss_uncond": -11.081665992736816}, "model_output": [{"sum_logits": -5.7439374923706055, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -13.516294479370117, "logits_per_token": -2.8719687461853027, "logits_per_char": -0.5743937492370605, "num_chars": 10}, {"sum_logits": -6.643874168395996, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -14.443153381347656, "logits_per_token": -2.2146247227986655, "logits_per_char": -0.6643874168395996, "num_chars": 10}, {"sum_logits": -14.455024719238281, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.16781997680664, "logits_per_token": -2.4091707865397134, "logits_per_char": -1.0325017656598772, "num_chars": 14}, {"sum_logits": -26.95132827758789, "num_tokens": 10, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -33.71117401123047, "logits_per_token": -2.695132827758789, "logits_per_char": -1.171796881634256, "num_chars": 23}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 741, "native_id": "Mercury_7130655", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.985607147216797, "incorrect_loss_raw": 27.414371490478516, "correct_loss_per_char": 0.39040011167526245, "incorrect_loss_per_char": 0.4676136745799357, "correct_loss_per_token": 2.4985607147216795, "incorrect_loss_per_token": 2.5626254804206616, "correct_loss_uncond": -9.303535461425781, "incorrect_loss_uncond": -14.876778920491537}, "model_output": [{"sum_logits": -23.23526382446289, "num_tokens": 10, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -38.24528121948242, "logits_per_token": -2.323526382446289, "logits_per_char": -0.528074177828702, "num_chars": 44}, {"sum_logits": -21.359657287597656, "num_tokens": 11, "num_tokens_all": 234, "is_greedy": false, "sum_logits_uncond": -42.62361145019531, "logits_per_token": -1.9417870261452415, "logits_per_char": -0.34451060141286544, "num_chars": 62}, {"sum_logits": -37.648193359375, "num_tokens": 11, "num_tokens_all": 234, "is_greedy": false, "sum_logits_uncond": -46.00455856323242, "logits_per_token": -3.4225630326704546, "logits_per_char": -0.5302562444982394, "num_chars": 71}, {"sum_logits": -24.985607147216797, "num_tokens": 10, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -34.28914260864258, "logits_per_token": -2.4985607147216795, "logits_per_char": -0.39040011167526245, "num_chars": 64}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 742, "native_id": "MCAS_2004_5_7", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.320727348327637, "incorrect_loss_raw": 9.413907686869303, "correct_loss_per_char": 0.28156643647413987, "incorrect_loss_per_char": 0.36207337257189626, "correct_loss_per_token": 1.0458181926182337, "incorrect_loss_per_token": 1.3448439552670433, "correct_loss_uncond": -18.239327430725098, "incorrect_loss_uncond": -15.172167778015137}, "model_output": [{"sum_logits": -10.10055923461914, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -23.755294799804688, "logits_per_token": -1.4429370335170202, "logits_per_char": -0.38848304748535156, "num_chars": 26}, {"sum_logits": -7.320727348327637, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -25.560054779052734, "logits_per_token": -1.0458181926182337, "logits_per_char": -0.28156643647413987, "num_chars": 26}, {"sum_logits": -8.730001449584961, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -25.536882400512695, "logits_per_token": -1.247143064226423, "logits_per_char": -0.3357692865224985, "num_chars": 26}, {"sum_logits": -9.411162376403809, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -24.466049194335938, "logits_per_token": -1.3444517680576868, "logits_per_char": -0.3619677837078388, "num_chars": 26}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 743, "native_id": "Mercury_7187373", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.757638931274414, "incorrect_loss_raw": 16.464483896891277, "correct_loss_per_char": 0.6860933059301132, "incorrect_loss_per_char": 0.49683222387652853, "correct_loss_per_token": 3.3447048664093018, "incorrect_loss_per_token": 2.9792278713650178, "correct_loss_uncond": -17.286821365356445, "incorrect_loss_uncond": -16.307335535685223}, "model_output": [{"sum_logits": -13.262163162231445, "num_tokens": 6, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -27.808950424194336, "logits_per_token": -2.210360527038574, "logits_per_char": -0.4278117149106918, "num_chars": 31}, {"sum_logits": -14.968038558959961, "num_tokens": 6, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -33.9312744140625, "logits_per_token": -2.4946730931599936, "logits_per_char": -0.4402364282047047, "num_chars": 34}, {"sum_logits": -21.163249969482422, "num_tokens": 5, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -36.575233459472656, "logits_per_token": -4.2326499938964846, "logits_per_char": -0.6224485285141889, "num_chars": 34}, {"sum_logits": -26.757638931274414, "num_tokens": 8, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -44.04446029663086, "logits_per_token": -3.3447048664093018, "logits_per_char": -0.6860933059301132, "num_chars": 39}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 744, "native_id": "Mercury_SC_401361", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.665249347686768, "incorrect_loss_raw": 6.515600840250651, "correct_loss_per_char": 0.708156168460846, "incorrect_loss_per_char": 0.8387761286326817, "correct_loss_per_token": 5.665249347686768, "incorrect_loss_per_token": 6.515600840250651, "correct_loss_uncond": -7.357018947601318, "incorrect_loss_uncond": -8.364208221435547}, "model_output": [{"sum_logits": -4.0867719650268555, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -13.428329467773438, "logits_per_token": -4.0867719650268555, "logits_per_char": -0.5838245664324079, "num_chars": 7}, {"sum_logits": -6.695059776306152, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.878938674926758, "logits_per_token": -6.695059776306152, "logits_per_char": -0.836882472038269, "num_chars": 8}, {"sum_logits": -5.665249347686768, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -13.022268295288086, "logits_per_token": -5.665249347686768, "logits_per_char": -0.708156168460846, "num_chars": 8}, {"sum_logits": -8.764970779418945, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.3321590423584, "logits_per_token": -8.764970779418945, "logits_per_char": -1.0956213474273682, "num_chars": 8}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 745, "native_id": "MCAS_2006_8_12", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 22.310482025146484, "incorrect_loss_raw": 18.38187567392985, "correct_loss_per_char": 0.46480170885721844, "incorrect_loss_per_char": 0.47498973189202504, "correct_loss_per_token": 2.478942447238498, "incorrect_loss_per_token": 2.4834099610646567, "correct_loss_uncond": -14.473445892333984, "incorrect_loss_uncond": -12.026225090026855}, "model_output": [{"sum_logits": -23.9521427154541, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -34.74396514892578, "logits_per_token": -2.9940178394317627, "logits_per_char": -0.6141575055244641, "num_chars": 39}, {"sum_logits": -14.92296314239502, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -29.052539825439453, "logits_per_token": -2.1318518774850026, "logits_per_char": -0.38264008057423127, "num_chars": 39}, {"sum_logits": -16.27052116394043, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -27.427797317504883, "logits_per_token": -2.3243601662772044, "logits_per_char": -0.4281716095773797, "num_chars": 38}, {"sum_logits": -22.310482025146484, "num_tokens": 9, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -36.78392791748047, "logits_per_token": -2.478942447238498, "logits_per_char": -0.46480170885721844, "num_chars": 48}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 746, "native_id": "Mercury_7233765", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 1.7144001722335815, "incorrect_loss_raw": 2.3699145317077637, "correct_loss_per_char": 0.17144001722335817, "incorrect_loss_per_char": 0.2032531439655959, "correct_loss_per_token": 0.8572000861167908, "incorrect_loss_per_token": 1.565134843190511, "correct_loss_uncond": -13.865272641181946, "incorrect_loss_uncond": -13.91369358698527}, "model_output": [{"sum_logits": -2.2810654640197754, "num_tokens": 1, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -14.164134979248047, "logits_per_token": -2.2810654640197754, "logits_per_char": -0.2073695876381614, "num_chars": 11}, {"sum_logits": -1.7144001722335815, "num_tokens": 2, "num_tokens_all": 215, "is_greedy": true, "sum_logits_uncond": -15.579672813415527, "logits_per_token": -0.8572000861167908, "logits_per_char": -0.17144001722335817, "num_chars": 10}, {"sum_logits": -2.716322898864746, "num_tokens": 2, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -17.323625564575195, "logits_per_token": -1.358161449432373, "logits_per_char": -0.22636024157206217, "num_chars": 12}, {"sum_logits": -2.1123552322387695, "num_tokens": 2, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -17.36306381225586, "logits_per_token": -1.0561776161193848, "logits_per_char": -0.17602960268656412, "num_chars": 12}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 747, "native_id": "Mercury_SC_407613", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.624216079711914, "incorrect_loss_raw": 19.19152577718099, "correct_loss_per_char": 0.7312108039855957, "incorrect_loss_per_char": 0.7741120036174504, "correct_loss_per_token": 3.6560540199279785, "incorrect_loss_per_token": 4.456537246704102, "correct_loss_uncond": -8.49697494506836, "incorrect_loss_uncond": -9.103349685668945}, "model_output": [{"sum_logits": -20.48065185546875, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -33.60041809082031, "logits_per_token": -4.09613037109375, "logits_per_char": -0.6606661888860887, "num_chars": 31}, {"sum_logits": -17.59087371826172, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -26.579822540283203, "logits_per_token": -4.39771842956543, "logits_per_char": -0.7329530715942383, "num_chars": 24}, {"sum_logits": -14.624216079711914, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -23.121191024780273, "logits_per_token": -3.6560540199279785, "logits_per_char": -0.7312108039855957, "num_chars": 20}, {"sum_logits": -19.5030517578125, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -24.70438575744629, "logits_per_token": -4.875762939453125, "logits_per_char": -0.9287167503720238, "num_chars": 21}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 748, "native_id": "MCAS_2005_5_24", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 4.195866584777832, "incorrect_loss_raw": 10.712603569030762, "correct_loss_per_char": 0.2797244389851888, "incorrect_loss_per_char": 0.521659301233224, "correct_loss_per_token": 1.048966646194458, "incorrect_loss_per_token": 2.36786109606425, "correct_loss_uncond": -12.755467414855957, "incorrect_loss_uncond": -11.29834270477295}, "model_output": [{"sum_logits": -4.195866584777832, "num_tokens": 4, "num_tokens_all": 205, "is_greedy": true, "sum_logits_uncond": -16.95133399963379, "logits_per_token": -1.048966646194458, "logits_per_char": -0.2797244389851888, "num_chars": 15}, {"sum_logits": -13.52042293548584, "num_tokens": 4, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -20.74639129638672, "logits_per_token": -3.38010573387146, "logits_per_char": -0.7116012071308336, "num_chars": 19}, {"sum_logits": -10.607900619506836, "num_tokens": 5, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -23.47452735900879, "logits_per_token": -2.121580123901367, "logits_per_char": -0.5051381247384208, "num_chars": 21}, {"sum_logits": -8.00948715209961, "num_tokens": 5, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -21.811920166015625, "logits_per_token": -1.6018974304199218, "logits_per_char": -0.3482385718304178, "num_chars": 23}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 749, "native_id": "Mercury_405778", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.29433536529541, "incorrect_loss_raw": 3.4979304472605386, "correct_loss_per_char": 0.552955691019694, "incorrect_loss_per_char": 0.46203506787618, "correct_loss_per_token": 4.147167682647705, "incorrect_loss_per_token": 2.81017271677653, "correct_loss_uncond": -9.558926582336426, "incorrect_loss_uncond": -9.686651945114136}, "model_output": [{"sum_logits": -3.939283609390259, "num_tokens": 1, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -12.463665008544922, "logits_per_token": -3.939283609390259, "logits_per_char": -0.7878567218780518, "num_chars": 5}, {"sum_logits": -2.4279613494873047, "num_tokens": 1, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -11.986562728881836, "logits_per_token": -2.4279613494873047, "logits_per_char": -0.3034951686859131, "num_chars": 8}, {"sum_logits": -8.29433536529541, "num_tokens": 2, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -17.853261947631836, "logits_per_token": -4.147167682647705, "logits_per_char": -0.552955691019694, "num_chars": 15}, {"sum_logits": -4.126546382904053, "num_tokens": 2, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -15.103519439697266, "logits_per_token": -2.0632731914520264, "logits_per_char": -0.2947533130645752, "num_chars": 14}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 750, "native_id": "Mercury_7263060", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.265323638916016, "incorrect_loss_raw": 9.246317704518637, "correct_loss_per_char": 0.9245792886485225, "incorrect_loss_per_char": 0.47097619827727827, "correct_loss_per_token": 4.253064727783203, "incorrect_loss_per_token": 2.311579426129659, "correct_loss_uncond": -11.102989196777344, "incorrect_loss_uncond": -12.80842955907186}, "model_output": [{"sum_logits": -4.287887096405029, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": true, "sum_logits_uncond": -21.73403549194336, "logits_per_token": -1.0719717741012573, "logits_per_char": -0.17866196235020956, "num_chars": 24}, {"sum_logits": -10.951847076416016, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -22.999103546142578, "logits_per_token": -2.737961769104004, "logits_per_char": -0.5764130040218955, "num_chars": 19}, {"sum_logits": -12.499218940734863, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -21.431102752685547, "logits_per_token": -3.124804735183716, "logits_per_char": -0.6578536284597296, "num_chars": 19}, {"sum_logits": -21.265323638916016, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -32.36831283569336, "logits_per_token": -4.253064727783203, "logits_per_char": -0.9245792886485225, "num_chars": 23}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 751, "native_id": "Mercury_SC_401668", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 24.486225128173828, "incorrect_loss_raw": 22.982763926188152, "correct_loss_per_char": 0.5830053601946149, "incorrect_loss_per_char": 0.6722759281326018, "correct_loss_per_token": 2.720691680908203, "incorrect_loss_per_token": 3.639949616931734, "correct_loss_uncond": -17.57109832763672, "incorrect_loss_uncond": -6.027463277180989}, "model_output": [{"sum_logits": -18.726625442504883, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -24.710853576660156, "logits_per_token": -3.7453250885009766, "logits_per_char": -0.6242208480834961, "num_chars": 30}, {"sum_logits": -22.293304443359375, "num_tokens": 7, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -30.642452239990234, "logits_per_token": -3.184757777622768, "logits_per_char": -0.7431101481119792, "num_chars": 30}, {"sum_logits": -27.928361892700195, "num_tokens": 7, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -31.67737579345703, "logits_per_token": -3.9897659846714566, "logits_per_char": -0.6494967882023301, "num_chars": 43}, {"sum_logits": -24.486225128173828, "num_tokens": 9, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -42.05732345581055, "logits_per_token": -2.720691680908203, "logits_per_char": -0.5830053601946149, "num_chars": 42}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 752, "native_id": "Mercury_7230388", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 27.371858596801758, "incorrect_loss_raw": 24.703975041707356, "correct_loss_per_char": 0.48020804555792557, "incorrect_loss_per_char": 0.5984811936121753, "correct_loss_per_token": 3.4214823246002197, "incorrect_loss_per_token": 4.2628964469546355, "correct_loss_uncond": -7.390752792358398, "incorrect_loss_uncond": -8.127967834472656}, "model_output": [{"sum_logits": -20.54520034790039, "num_tokens": 4, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -29.389001846313477, "logits_per_token": -5.136300086975098, "logits_per_char": -0.6420375108718872, "num_chars": 32}, {"sum_logits": -23.97214126586914, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -35.44158172607422, "logits_per_token": -3.424591609409877, "logits_per_char": -0.5100455588482796, "num_chars": 47}, {"sum_logits": -29.59458351135254, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -33.665245056152344, "logits_per_token": -4.227797644478934, "logits_per_char": -0.6433605111163595, "num_chars": 46}, {"sum_logits": -27.371858596801758, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -34.762611389160156, "logits_per_token": -3.4214823246002197, "logits_per_char": -0.48020804555792557, "num_chars": 57}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 753, "native_id": "Mercury_7041650", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 13.155441284179688, "incorrect_loss_raw": 14.139006614685059, "correct_loss_per_char": 0.2684783935546875, "incorrect_loss_per_char": 0.45045918774983235, "correct_loss_per_token": 1.3155441284179688, "incorrect_loss_per_token": 2.7985582692282542, "correct_loss_uncond": -22.325275421142578, "incorrect_loss_uncond": -12.473301887512207}, "model_output": [{"sum_logits": -17.52728271484375, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -26.415267944335938, "logits_per_token": -4.3818206787109375, "logits_per_char": -0.5842427571614583, "num_chars": 30}, {"sum_logits": -20.613414764404297, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -33.53757095336914, "logits_per_token": -2.9447735377720425, "logits_per_char": -0.5889547075544085, "num_chars": 35}, {"sum_logits": -4.276322364807129, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -19.88408660888672, "logits_per_token": -1.0690805912017822, "logits_per_char": -0.17818009853363037, "num_chars": 24}, {"sum_logits": -13.155441284179688, "num_tokens": 10, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -35.480716705322266, "logits_per_token": -1.3155441284179688, "logits_per_char": -0.2684783935546875, "num_chars": 49}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 754, "native_id": "Mercury_SC_409009", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.35159969329834, "incorrect_loss_raw": 14.713666280110678, "correct_loss_per_char": 0.4221058733323041, "incorrect_loss_per_char": 0.36189996969132193, "correct_loss_per_token": 2.0502285276140486, "incorrect_loss_per_token": 1.962172720167372, "correct_loss_uncond": -11.788430213928223, "incorrect_loss_uncond": -16.36021677652995}, "model_output": [{"sum_logits": -14.35159969329834, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -26.140029907226562, "logits_per_token": -2.0502285276140486, "logits_per_char": -0.4221058733323041, "num_chars": 34}, {"sum_logits": -14.39122200012207, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -29.612857818603516, "logits_per_token": -2.398537000020345, "logits_per_char": -0.41117777143205914, "num_chars": 35}, {"sum_logits": -13.136428833007812, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -28.682296752929688, "logits_per_token": -1.6420536041259766, "logits_per_char": -0.3284107208251953, "num_chars": 40}, {"sum_logits": -16.61334800720215, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -34.92649459838867, "logits_per_token": -1.8459275563557942, "logits_per_char": -0.3461114168167114, "num_chars": 48}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 755, "native_id": "Mercury_7223143", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 27.692493438720703, "incorrect_loss_raw": 14.028545697530111, "correct_loss_per_char": 0.5325479507446289, "incorrect_loss_per_char": 0.5373026333525696, "correct_loss_per_token": 2.517499403520064, "incorrect_loss_per_token": 2.480690598487854, "correct_loss_uncond": -20.502811431884766, "incorrect_loss_uncond": -11.361790657043457}, "model_output": [{"sum_logits": -10.387561798095703, "num_tokens": 4, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -19.73174285888672, "logits_per_token": -2.596890449523926, "logits_per_char": -0.5467137788471422, "num_chars": 19}, {"sum_logits": -21.190126419067383, "num_tokens": 6, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -31.721519470214844, "logits_per_token": -3.5316877365112305, "logits_per_char": -0.8150048622718225, "num_chars": 26}, {"sum_logits": -10.507948875427246, "num_tokens": 8, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -24.71774673461914, "logits_per_token": -1.3134936094284058, "logits_per_char": -0.25018925893874394, "num_chars": 42}, {"sum_logits": -27.692493438720703, "num_tokens": 11, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -48.19530487060547, "logits_per_token": -2.517499403520064, "logits_per_char": -0.5325479507446289, "num_chars": 52}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 756, "native_id": "ACTAAP_2007_7_3", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 27.944904327392578, "incorrect_loss_raw": 17.654444694519043, "correct_loss_per_char": 0.6815830323754287, "incorrect_loss_per_char": 0.5914698200629501, "correct_loss_per_token": 3.9921291896275113, "incorrect_loss_per_token": 2.649909897456093, "correct_loss_uncond": -10.05722427368164, "incorrect_loss_uncond": -8.900367418924967}, "model_output": [{"sum_logits": -16.108642578125, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -22.00761604309082, "logits_per_token": -2.6847737630208335, "logits_per_char": -0.644345703125, "num_chars": 25}, {"sum_logits": -13.396613121032715, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -21.940961837768555, "logits_per_token": -1.913801874433245, "logits_per_char": -0.47845046860831125, "num_chars": 28}, {"sum_logits": -23.458078384399414, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -35.715858459472656, "logits_per_token": -3.351154054914202, "logits_per_char": -0.6516132884555392, "num_chars": 36}, {"sum_logits": -27.944904327392578, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -38.00212860107422, "logits_per_token": -3.9921291896275113, "logits_per_char": -0.6815830323754287, "num_chars": 41}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 757, "native_id": "Mercury_7215670", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.425548553466797, "incorrect_loss_raw": 22.06048583984375, "correct_loss_per_char": 0.41063871383666994, "incorrect_loss_per_char": 0.462362978768635, "correct_loss_per_token": 2.3465069362095425, "incorrect_loss_per_token": 2.4824152859774506, "correct_loss_uncond": -16.94522476196289, "incorrect_loss_uncond": -10.75030771891276}, "model_output": [{"sum_logits": -22.587646484375, "num_tokens": 8, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -33.865535736083984, "logits_per_token": -2.823455810546875, "logits_per_char": -0.4805882230718085, "num_chars": 47}, {"sum_logits": -19.381011962890625, "num_tokens": 8, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -31.844112396240234, "logits_per_token": -2.422626495361328, "logits_per_char": -0.41236195665724734, "num_chars": 47}, {"sum_logits": -16.425548553466797, "num_tokens": 7, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -33.37077331542969, "logits_per_token": -2.3465069362095425, "logits_per_char": -0.41063871383666994, "num_chars": 40}, {"sum_logits": -24.212799072265625, "num_tokens": 11, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -32.72273254394531, "logits_per_token": -2.201163552024148, "logits_per_char": -0.49413875657684947, "num_chars": 49}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 758, "native_id": "MEA_2010_8_15", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.807113647460938, "incorrect_loss_raw": 18.132144292195637, "correct_loss_per_char": 0.6617208627554086, "incorrect_loss_per_char": 0.5610358855134326, "correct_loss_per_token": 3.686730521065848, "incorrect_loss_per_token": 3.023556771101775, "correct_loss_uncond": -12.989395141601562, "incorrect_loss_uncond": -15.99167569478353}, "model_output": [{"sum_logits": -25.807113647460938, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -38.7965087890625, "logits_per_token": -3.686730521065848, "logits_per_char": -0.6617208627554086, "num_chars": 39}, {"sum_logits": -18.648794174194336, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -36.42715072631836, "logits_per_token": -2.331099271774292, "logits_per_char": -0.5040214641674144, "num_chars": 37}, {"sum_logits": -23.29338836669922, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -39.885467529296875, "logits_per_token": -2.58815426296658, "logits_per_char": -0.6129839043868216, "num_chars": 38}, {"sum_logits": -12.45425033569336, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -26.058841705322266, "logits_per_token": -4.151416778564453, "logits_per_char": -0.5661022879860618, "num_chars": 22}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 759, "native_id": "Mercury_7270515", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 33.89572525024414, "incorrect_loss_raw": 31.877225240071613, "correct_loss_per_char": 0.6276986157452619, "incorrect_loss_per_char": 0.5511476489555701, "correct_loss_per_token": 3.389572525024414, "incorrect_loss_per_token": 2.9054192937985817, "correct_loss_uncond": -10.45694351196289, "incorrect_loss_uncond": -7.66278076171875}, "model_output": [{"sum_logits": -26.47216033935547, "num_tokens": 11, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -36.5667724609375, "logits_per_token": -2.406560030850497, "logits_per_char": -0.48131200617009945, "num_chars": 55}, {"sum_logits": -33.89572525024414, "num_tokens": 10, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -44.35266876220703, "logits_per_token": -3.389572525024414, "logits_per_char": -0.6276986157452619, "num_chars": 54}, {"sum_logits": -36.375221252441406, "num_tokens": 12, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -41.095306396484375, "logits_per_token": -3.0312684377034507, "logits_per_char": -0.5866971169748614, "num_chars": 62}, {"sum_logits": -32.78429412841797, "num_tokens": 10, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -40.95793914794922, "logits_per_token": -3.278429412841797, "logits_per_char": -0.5854338237217495, "num_chars": 56}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 760, "native_id": "Mercury_7006160", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 22.162315368652344, "incorrect_loss_raw": 18.372433344523113, "correct_loss_per_char": 0.5405442772842035, "incorrect_loss_per_char": 0.4470590919413895, "correct_loss_per_token": 2.770289421081543, "incorrect_loss_per_token": 2.74589294857449, "correct_loss_uncond": -20.269664764404297, "incorrect_loss_uncond": -15.412772496541342}, "model_output": [{"sum_logits": -15.278711318969727, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -24.035083770751953, "logits_per_token": -2.5464518864949546, "logits_per_char": -0.4244086477491591, "num_chars": 36}, {"sum_logits": -19.823993682861328, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -37.19711685180664, "logits_per_token": -2.8319990975516185, "logits_per_char": -0.47199984959193636, "num_chars": 42}, {"sum_logits": -22.162315368652344, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -42.43198013305664, "logits_per_token": -2.770289421081543, "logits_per_char": -0.5405442772842035, "num_chars": 41}, {"sum_logits": -20.01459503173828, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -40.123416900634766, "logits_per_token": -2.859227861676897, "logits_per_char": -0.4447687784830729, "num_chars": 45}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 761, "native_id": "Mercury_SC_410630", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.887765884399414, "incorrect_loss_raw": 11.857359250386557, "correct_loss_per_char": 0.5786569118499756, "incorrect_loss_per_char": 0.5036906662485958, "correct_loss_per_token": 3.4719414710998535, "incorrect_loss_per_token": 3.0214120229085286, "correct_loss_uncond": -13.606576919555664, "incorrect_loss_uncond": -11.331287701924643}, "model_output": [{"sum_logits": -10.700075149536133, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -24.562591552734375, "logits_per_token": -2.1400150299072265, "logits_per_char": -0.35666917165120443, "num_chars": 30}, {"sum_logits": -16.397357940673828, "num_tokens": 4, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -24.4005184173584, "logits_per_token": -4.099339485168457, "logits_per_char": -0.6558943176269532, "num_chars": 25}, {"sum_logits": -13.887765884399414, "num_tokens": 4, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -27.494342803955078, "logits_per_token": -3.4719414710998535, "logits_per_char": -0.5786569118499756, "num_chars": 24}, {"sum_logits": -8.474644660949707, "num_tokens": 3, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -20.60283088684082, "logits_per_token": -2.8248815536499023, "logits_per_char": -0.4985085094676298, "num_chars": 17}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 762, "native_id": "Mercury_7082320", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.357772827148438, "incorrect_loss_raw": 15.746854464213053, "correct_loss_per_char": 0.44525909423828125, "incorrect_loss_per_char": 0.4700640372083573, "correct_loss_per_token": 1.6697216033935547, "incorrect_loss_per_token": 2.1824110424707808, "correct_loss_uncond": -26.613155364990234, "incorrect_loss_uncond": -23.188969612121582}, "model_output": [{"sum_logits": -13.357772827148438, "num_tokens": 8, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -39.97092819213867, "logits_per_token": -1.6697216033935547, "logits_per_char": -0.44525909423828125, "num_chars": 30}, {"sum_logits": -8.613146781921387, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -39.305419921875, "logits_per_token": -1.2304495402744837, "logits_per_char": -0.2778434445781092, "num_chars": 31}, {"sum_logits": -18.4472713470459, "num_tokens": 6, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -37.5455322265625, "logits_per_token": -3.07454522450765, "logits_per_char": -0.61490904490153, "num_chars": 30}, {"sum_logits": -20.180145263671875, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -39.956520080566406, "logits_per_token": -2.2422383626302085, "logits_per_char": -0.5174396221454327, "num_chars": 39}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 763, "native_id": "MEA_2013_8_1", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 12.606757164001465, "incorrect_loss_raw": 22.033283869425457, "correct_loss_per_char": 0.46691693200005424, "incorrect_loss_per_char": 0.5965034050411648, "correct_loss_per_token": 2.101126194000244, "incorrect_loss_per_token": 2.7023716162121487, "correct_loss_uncond": -12.34233570098877, "incorrect_loss_uncond": -9.710932413736979}, "model_output": [{"sum_logits": -12.606757164001465, "num_tokens": 6, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -24.949092864990234, "logits_per_token": -2.101126194000244, "logits_per_char": -0.46691693200005424, "num_chars": 27}, {"sum_logits": -21.29600715637207, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -30.56528663635254, "logits_per_token": -2.662000894546509, "logits_per_char": -0.5324001789093018, "num_chars": 40}, {"sum_logits": -14.707633972167969, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -24.174365997314453, "logits_per_token": -2.101090567452567, "logits_per_char": -0.5883053588867188, "num_chars": 25}, {"sum_logits": -30.096210479736328, "num_tokens": 9, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -40.49299621582031, "logits_per_token": -3.3440233866373696, "logits_per_char": -0.668804677327474, "num_chars": 45}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 764, "native_id": "Mercury_7033845", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.332643508911133, "incorrect_loss_raw": 20.094662030537922, "correct_loss_per_char": 0.21984347891300282, "incorrect_loss_per_char": 0.4617154120924225, "correct_loss_per_token": 1.4760919298444475, "incorrect_loss_per_token": 2.576625210898263, "correct_loss_uncond": -26.13454246520996, "incorrect_loss_uncond": -16.50446669260661}, "model_output": [{"sum_logits": -25.113292694091797, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -41.22376251220703, "logits_per_token": -3.1391615867614746, "logits_per_char": -0.5125161774304449, "num_chars": 49}, {"sum_logits": -10.332643508911133, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -36.467185974121094, "logits_per_token": -1.4760919298444475, "logits_per_char": -0.21984347891300282, "num_chars": 47}, {"sum_logits": -24.285560607910156, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -36.756866455078125, "logits_per_token": -3.0356950759887695, "logits_per_char": -0.6071390151977539, "num_chars": 40}, {"sum_logits": -10.885132789611816, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -31.816757202148438, "logits_per_token": -1.5550189699445451, "logits_per_char": -0.26549104364906867, "num_chars": 41}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 765, "native_id": "Mercury_7221620", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.72113037109375, "incorrect_loss_raw": 16.656620343526203, "correct_loss_per_char": 0.2700313991970486, "incorrect_loss_per_char": 0.5916779112612081, "correct_loss_per_token": 1.6201883951822917, "incorrect_loss_per_token": 4.164155085881551, "correct_loss_uncond": -21.455368041992188, "incorrect_loss_uncond": -10.245864550272623}, "model_output": [{"sum_logits": -9.72113037109375, "num_tokens": 6, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -31.176498413085938, "logits_per_token": -1.6201883951822917, "logits_per_char": -0.2700313991970486, "num_chars": 36}, {"sum_logits": -21.02098274230957, "num_tokens": 4, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -27.172794342041016, "logits_per_token": -5.255245685577393, "logits_per_char": -0.6182641983032227, "num_chars": 34}, {"sum_logits": -15.423322677612305, "num_tokens": 4, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -28.642261505126953, "logits_per_token": -3.855830669403076, "logits_per_char": -0.593204718369704, "num_chars": 26}, {"sum_logits": -13.525555610656738, "num_tokens": 4, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -24.892398834228516, "logits_per_token": -3.3813889026641846, "logits_per_char": -0.5635648171106974, "num_chars": 24}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 766, "native_id": "LEAP__7_10352", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 35.544490814208984, "incorrect_loss_raw": 37.95756276448568, "correct_loss_per_char": 0.3905988001561427, "incorrect_loss_per_char": 0.5297291330190806, "correct_loss_per_token": 2.2215306758880615, "incorrect_loss_per_token": 2.726061162494478, "correct_loss_uncond": -32.5703010559082, "incorrect_loss_uncond": -22.86243438720703}, "model_output": [{"sum_logits": -36.8192138671875, "num_tokens": 12, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -57.32500457763672, "logits_per_token": -3.068267822265625, "logits_per_char": -0.5844319661458334, "num_chars": 63}, {"sum_logits": -32.93623352050781, "num_tokens": 14, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -57.44153594970703, "logits_per_token": -2.352588108607701, "logits_per_char": -0.4391497802734375, "num_chars": 75}, {"sum_logits": -44.11724090576172, "num_tokens": 16, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -67.69345092773438, "logits_per_token": -2.7573275566101074, "logits_per_char": -0.5656056526379708, "num_chars": 78}, {"sum_logits": -35.544490814208984, "num_tokens": 16, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -68.11479187011719, "logits_per_token": -2.2215306758880615, "logits_per_char": -0.3905988001561427, "num_chars": 91}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 767, "native_id": "Mercury_412605", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.424403190612793, "incorrect_loss_raw": 23.71111996968587, "correct_loss_per_char": 0.3505546179684726, "incorrect_loss_per_char": 0.5350151601558765, "correct_loss_per_token": 1.9280503988265991, "incorrect_loss_per_token": 3.0502950703656233, "correct_loss_uncond": -17.961522102355957, "incorrect_loss_uncond": -11.985565821329752}, "model_output": [{"sum_logits": -15.986045837402344, "num_tokens": 6, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -29.279083251953125, "logits_per_token": -2.6643409729003906, "logits_per_char": -0.48442563143643463, "num_chars": 33}, {"sum_logits": -25.852672576904297, "num_tokens": 8, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -37.307830810546875, "logits_per_token": -3.231584072113037, "logits_per_char": -0.6155398232596261, "num_chars": 42}, {"sum_logits": -15.424403190612793, "num_tokens": 8, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -33.38592529296875, "logits_per_token": -1.9280503988265991, "logits_per_char": -0.3505546179684726, "num_chars": 44}, {"sum_logits": -29.294641494750977, "num_tokens": 9, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -40.503143310546875, "logits_per_token": -3.2549601660834417, "logits_per_char": -0.5050800257715685, "num_chars": 58}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 768, "native_id": "Mercury_416638", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.39221954345703, "incorrect_loss_raw": 13.237539609273275, "correct_loss_per_char": 0.683009147644043, "incorrect_loss_per_char": 0.7385103658393577, "correct_loss_per_token": 4.098054885864258, "incorrect_loss_per_token": 3.589822048611111, "correct_loss_uncond": -9.981220245361328, "incorrect_loss_uncond": -10.569880485534668}, "model_output": [{"sum_logits": -16.39221954345703, "num_tokens": 4, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -26.37343978881836, "logits_per_token": -4.098054885864258, "logits_per_char": -0.683009147644043, "num_chars": 24}, {"sum_logits": -10.251309394836426, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -25.705623626708984, "logits_per_token": -2.050261878967285, "logits_per_char": -0.46596860885620117, "num_chars": 22}, {"sum_logits": -16.246522903442383, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -21.85455322265625, "logits_per_token": -5.415507634480794, "logits_per_char": -1.015407681465149, "num_chars": 16}, {"sum_logits": -13.214786529541016, "num_tokens": 4, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -23.862083435058594, "logits_per_token": -3.303696632385254, "logits_per_char": -0.7341548071967231, "num_chars": 18}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 769, "native_id": "MCAS_2011_8_17694", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 14.8455228805542, "incorrect_loss_raw": 19.45002619425456, "correct_loss_per_char": 0.37113807201385496, "incorrect_loss_per_char": 0.4619617164511883, "correct_loss_per_token": 2.9691045761108397, "incorrect_loss_per_token": 3.9722342067294645, "correct_loss_uncond": -13.44184398651123, "incorrect_loss_uncond": -12.354856491088867}, "model_output": [{"sum_logits": -14.8455228805542, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -28.28736686706543, "logits_per_token": -2.9691045761108397, "logits_per_char": -0.37113807201385496, "num_chars": 40}, {"sum_logits": -19.559419631958008, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -30.995014190673828, "logits_per_token": -4.889854907989502, "logits_per_char": -0.47705901541360995, "num_chars": 41}, {"sum_logits": -16.852136611938477, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -32.855464935302734, "logits_per_token": -3.3704273223876955, "logits_per_char": -0.42130341529846194, "num_chars": 40}, {"sum_logits": -21.938522338867188, "num_tokens": 6, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -31.56416893005371, "logits_per_token": -3.6564203898111978, "logits_per_char": -0.48752271864149305, "num_chars": 45}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 770, "native_id": "Mercury_SC_400012", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.742847442626953, "incorrect_loss_raw": 20.714192708333332, "correct_loss_per_char": 0.40836213274699884, "incorrect_loss_per_char": 0.6011751538231259, "correct_loss_per_token": 1.8603163825141058, "incorrect_loss_per_token": 2.7377650548541355, "correct_loss_uncond": -26.973377227783203, "incorrect_loss_uncond": -22.301158905029297}, "model_output": [{"sum_logits": -22.525604248046875, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -41.529170989990234, "logits_per_token": -3.2179434640066966, "logits_per_char": -0.7039251327514648, "num_chars": 32}, {"sum_logits": -18.694169998168945, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -41.07601547241211, "logits_per_token": -2.670595714024135, "logits_per_char": -0.534119142804827, "num_chars": 35}, {"sum_logits": -20.92280387878418, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -46.44086837768555, "logits_per_token": -2.3247559865315757, "logits_per_char": -0.5654811859130859, "num_chars": 37}, {"sum_logits": -16.742847442626953, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -43.716224670410156, "logits_per_token": -1.8603163825141058, "logits_per_char": -0.40836213274699884, "num_chars": 41}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 771, "native_id": "Mercury_SC_413458", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.389385938644409, "incorrect_loss_raw": 6.9103624025980634, "correct_loss_per_char": 0.17838873361286364, "incorrect_loss_per_char": 0.5825016180674235, "correct_loss_per_token": 1.1297953128814697, "incorrect_loss_per_token": 2.473288827472263, "correct_loss_uncond": -13.826220750808716, "incorrect_loss_uncond": -10.094255606333414}, "model_output": [{"sum_logits": -7.401090621948242, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -18.024757385253906, "logits_per_token": -2.4670302073160806, "logits_per_char": -0.5286493301391602, "num_chars": 14}, {"sum_logits": -3.389385938644409, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": true, "sum_logits_uncond": -17.215606689453125, "logits_per_token": -1.1297953128814697, "logits_per_char": -0.17838873361286364, "num_chars": 19}, {"sum_logits": -6.481348514556885, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -14.796025276184082, "logits_per_token": -3.2406742572784424, "logits_per_char": -0.6481348514556885, "num_chars": 10}, {"sum_logits": -6.8486480712890625, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -18.193071365356445, "logits_per_token": -1.7121620178222656, "logits_per_char": -0.5707206726074219, "num_chars": 12}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 772, "native_id": "Mercury_7139545", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.996232032775879, "incorrect_loss_raw": 11.238189538319906, "correct_loss_per_char": 0.36822273856715154, "incorrect_loss_per_char": 0.6495237705531686, "correct_loss_per_token": 3.4981160163879395, "incorrect_loss_per_token": 4.983162800470988, "correct_loss_uncond": -11.164916038513184, "incorrect_loss_uncond": -7.9646735191345215}, "model_output": [{"sum_logits": -11.446775436401367, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -17.648284912109375, "logits_per_token": -3.815591812133789, "logits_per_char": -0.7154234647750854, "num_chars": 16}, {"sum_logits": -15.420324325561523, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -19.361614227294922, "logits_per_token": -7.710162162780762, "logits_per_char": -0.907077901503619, "num_chars": 17}, {"sum_logits": -6.996232032775879, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -18.161148071289062, "logits_per_token": -3.4981160163879395, "logits_per_char": -0.36822273856715154, "num_chars": 19}, {"sum_logits": -6.847468852996826, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -20.598690032958984, "logits_per_token": -3.423734426498413, "logits_per_char": -0.32606994538080125, "num_chars": 21}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 773, "native_id": "NYSEDREGENTS_2015_4_5", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.84832763671875, "incorrect_loss_raw": 11.476213137308756, "correct_loss_per_char": 0.3498479669744318, "incorrect_loss_per_char": 0.9587597724718925, "correct_loss_per_token": 3.84832763671875, "incorrect_loss_per_token": 11.476213137308756, "correct_loss_uncond": -10.564767837524414, "incorrect_loss_uncond": -3.2785170873006186}, "model_output": [{"sum_logits": -11.861531257629395, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -15.821380615234375, "logits_per_token": -11.861531257629395, "logits_per_char": -0.9124254813561072, "num_chars": 13}, {"sum_logits": -3.84832763671875, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -14.413095474243164, "logits_per_token": -3.84832763671875, "logits_per_char": -0.3498479669744318, "num_chars": 11}, {"sum_logits": -12.231059074401855, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -14.029659271240234, "logits_per_token": -12.231059074401855, "logits_per_char": -0.815403938293457, "num_chars": 15}, {"sum_logits": -10.33604907989502, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -14.413150787353516, "logits_per_token": -10.33604907989502, "logits_per_char": -1.1484498977661133, "num_chars": 9}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 774, "native_id": "TIMSS_2003_8_pg16", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 10.667613983154297, "incorrect_loss_raw": 12.999688466389975, "correct_loss_per_char": 0.2539908091227214, "incorrect_loss_per_char": 0.5205785369873047, "correct_loss_per_token": 1.5239448547363281, "incorrect_loss_per_token": 2.8128695942106705, "correct_loss_uncond": -25.801315307617188, "incorrect_loss_uncond": -16.68300501505534}, "model_output": [{"sum_logits": -14.099658966064453, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -21.511089324951172, "logits_per_token": -3.5249147415161133, "logits_per_char": -0.7833143870035807, "num_chars": 18}, {"sum_logits": -12.661935806274414, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -27.93710708618164, "logits_per_token": -3.1654839515686035, "logits_per_char": -0.5064774322509765, "num_chars": 25}, {"sum_logits": -10.667613983154297, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -36.468929290771484, "logits_per_token": -1.5239448547363281, "logits_per_char": -0.2539908091227214, "num_chars": 42}, {"sum_logits": -12.237470626831055, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -39.599884033203125, "logits_per_token": -1.7482100895472936, "logits_per_char": -0.2719437917073568, "num_chars": 45}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 775, "native_id": "Mercury_SC_415073", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.095319747924805, "incorrect_loss_raw": 6.623545328776042, "correct_loss_per_char": 1.1825532913208008, "incorrect_loss_per_char": 1.1863690429263645, "correct_loss_per_token": 7.095319747924805, "incorrect_loss_per_token": 5.188127358754476, "correct_loss_uncond": -6.836706161499023, "incorrect_loss_uncond": -5.522496859232585}, "model_output": [{"sum_logits": -7.095319747924805, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -13.932025909423828, "logits_per_token": -7.095319747924805, "logits_per_char": -1.1825532913208008, "num_chars": 6}, {"sum_logits": -3.8380942344665527, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -3.8380942344665527, "logits_per_char": -0.6396823724110922, "num_chars": 6}, {"sum_logits": -8.612507820129395, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -12.770979881286621, "logits_per_token": -4.306253910064697, "logits_per_char": -1.4354179700215657, "num_chars": 6}, {"sum_logits": -7.420033931732178, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -13.778152465820312, "logits_per_token": -7.420033931732178, "logits_per_char": -1.4840067863464355, "num_chars": 5}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 776, "native_id": "Mercury_7012880", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 5.366202354431152, "incorrect_loss_raw": 10.565862655639648, "correct_loss_per_char": 0.2981223530239529, "incorrect_loss_per_char": 0.41963722604829234, "correct_loss_per_token": 1.7887341181437175, "incorrect_loss_per_token": 2.0789804155864413, "correct_loss_uncond": -16.21382999420166, "incorrect_loss_uncond": -17.295690536499023}, "model_output": [{"sum_logits": -5.366202354431152, "num_tokens": 3, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -21.580032348632812, "logits_per_token": -1.7887341181437175, "logits_per_char": -0.2981223530239529, "num_chars": 18}, {"sum_logits": -7.929636001586914, "num_tokens": 3, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -24.112396240234375, "logits_per_token": -2.643212000528971, "logits_per_char": -0.4405353334214952, "num_chars": 18}, {"sum_logits": -8.328916549682617, "num_tokens": 6, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -30.985673904418945, "logits_per_token": -1.3881527582804363, "logits_per_char": -0.3203429442185622, "num_chars": 26}, {"sum_logits": -15.439035415649414, "num_tokens": 7, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -28.486589431762695, "logits_per_token": -2.2055764879499162, "logits_per_char": -0.4980334005048198, "num_chars": 31}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 777, "native_id": "Mercury_191625", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 20.625221252441406, "incorrect_loss_raw": 15.219974199930826, "correct_loss_per_char": 0.644538164138794, "incorrect_loss_per_char": 0.5185675020382224, "correct_loss_per_token": 4.125044250488282, "incorrect_loss_per_token": 3.7697362475925025, "correct_loss_uncond": -8.504180908203125, "incorrect_loss_uncond": -12.091496467590332}, "model_output": [{"sum_logits": -14.194518089294434, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.361164093017578, "logits_per_token": -2.838903617858887, "logits_per_char": -0.430136911796801, "num_chars": 33}, {"sum_logits": -15.136222839355469, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.708454132080078, "logits_per_token": -3.0272445678710938, "logits_per_char": -0.44518302468692555, "num_chars": 34}, {"sum_logits": -16.329181671142578, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -23.86479377746582, "logits_per_token": -5.443060557047526, "logits_per_char": -0.6803825696309408, "num_chars": 24}, {"sum_logits": -20.625221252441406, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.12940216064453, "logits_per_token": -4.125044250488282, "logits_per_char": -0.644538164138794, "num_chars": 32}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 778, "native_id": "Mercury_SC_402985", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 6.654515743255615, "incorrect_loss_raw": 12.805057843526205, "correct_loss_per_char": 0.350237670697664, "incorrect_loss_per_char": 0.7544624368308703, "correct_loss_per_token": 3.3272578716278076, "incorrect_loss_per_token": 4.589098824395074, "correct_loss_uncond": -15.47029161453247, "incorrect_loss_uncond": -9.205958684285482}, "model_output": [{"sum_logits": -22.260265350341797, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -27.004623413085938, "logits_per_token": -7.420088450113933, "logits_per_char": -1.171592913175884, "num_chars": 19}, {"sum_logits": -6.654515743255615, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -22.124807357788086, "logits_per_token": -3.3272578716278076, "logits_per_char": -0.350237670697664, "num_chars": 19}, {"sum_logits": -10.381476402282715, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -23.550289154052734, "logits_per_token": -3.4604921340942383, "logits_per_char": -0.6106750824872185, "num_chars": 17}, {"sum_logits": -5.773431777954102, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -15.478137016296387, "logits_per_token": -2.886715888977051, "logits_per_char": -0.4811193148295085, "num_chars": 12}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 779, "native_id": "Mercury_7005425", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 11.667409896850586, "incorrect_loss_raw": 17.799585342407227, "correct_loss_per_char": 0.5303368134932085, "incorrect_loss_per_char": 0.6204014923275308, "correct_loss_per_token": 1.9445683161417644, "incorrect_loss_per_token": 2.810582675631084, "correct_loss_uncond": -20.037981033325195, "incorrect_loss_uncond": -12.87929598490397}, "model_output": [{"sum_logits": -11.667409896850586, "num_tokens": 6, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -31.70539093017578, "logits_per_token": -1.9445683161417644, "logits_per_char": -0.5303368134932085, "num_chars": 22}, {"sum_logits": -19.657875061035156, "num_tokens": 7, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -33.53404235839844, "logits_per_token": -2.8082678658621654, "logits_per_char": -0.6778577607253502, "num_chars": 29}, {"sum_logits": -15.860620498657227, "num_tokens": 6, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -29.939332962036133, "logits_per_token": -2.6434367497762046, "logits_per_char": -0.49564439058303833, "num_chars": 32}, {"sum_logits": -17.880260467529297, "num_tokens": 6, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -28.563268661499023, "logits_per_token": -2.980043411254883, "logits_per_char": -0.6877023256742038, "num_chars": 26}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 780, "native_id": "MDSA_2013_8_40", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 29.75930404663086, "incorrect_loss_raw": 26.760350545247395, "correct_loss_per_char": 0.5951860809326172, "incorrect_loss_per_char": 0.5150009596727574, "correct_loss_per_token": 3.3065893385145397, "incorrect_loss_per_token": 2.973372282805266, "correct_loss_uncond": -12.921024322509766, "incorrect_loss_uncond": -15.287919362386068}, "model_output": [{"sum_logits": -20.570919036865234, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -36.58567428588867, "logits_per_token": -2.285657670762804, "logits_per_char": -0.4675208872014826, "num_chars": 44}, {"sum_logits": -29.156320571899414, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -42.40117645263672, "logits_per_token": -3.2395911746554904, "logits_per_char": -0.6074233452479044, "num_chars": 48}, {"sum_logits": -29.75930404663086, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -42.680328369140625, "logits_per_token": -3.3065893385145397, "logits_per_char": -0.5951860809326172, "num_chars": 50}, {"sum_logits": -30.55381202697754, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -47.157958984375, "logits_per_token": -3.3948680029975042, "logits_per_char": -0.47005864656888524, "num_chars": 65}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 781, "native_id": "Mercury_401684", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 11.189501762390137, "incorrect_loss_raw": 13.876946449279785, "correct_loss_per_char": 0.6582059860229492, "incorrect_loss_per_char": 0.8121643472600866, "correct_loss_per_token": 3.7298339207967124, "incorrect_loss_per_token": 4.258248779508803, "correct_loss_uncond": -9.818366050720215, "incorrect_loss_uncond": -8.093358675638834}, "model_output": [{"sum_logits": -12.420364379882812, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -22.89102554321289, "logits_per_token": -4.1401214599609375, "logits_per_char": -0.8871688842773438, "num_chars": 14}, {"sum_logits": -11.189501762390137, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -21.00786781311035, "logits_per_token": -3.7298339207967124, "logits_per_char": -0.6582059860229492, "num_chars": 17}, {"sum_logits": -13.226401329040527, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -23.748245239257812, "logits_per_token": -3.306600332260132, "logits_per_char": -0.6613200664520263, "num_chars": 20}, {"sum_logits": -15.984073638916016, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -19.271644592285156, "logits_per_token": -5.328024546305339, "logits_per_char": -0.8880040910508897, "num_chars": 18}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 782, "native_id": "NCEOGA_2013_5_17", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.466350555419922, "incorrect_loss_raw": 7.546921094258626, "correct_loss_per_char": 0.49626117282443577, "incorrect_loss_per_char": 1.2672759665383233, "correct_loss_per_token": 4.466350555419922, "incorrect_loss_per_token": 7.546921094258626, "correct_loss_uncond": -7.154205322265625, "incorrect_loss_uncond": -4.009065945943196}, "model_output": [{"sum_logits": -8.35289478302002, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -8.35289478302002, "logits_per_char": -1.3921491305033367, "num_chars": 6}, {"sum_logits": -4.466350555419922, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -11.620555877685547, "logits_per_token": -4.466350555419922, "logits_per_char": -0.49626117282443577, "num_chars": 9}, {"sum_logits": -8.315936088562012, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -12.072425842285156, "logits_per_token": -8.315936088562012, "logits_per_char": -1.6631872177124023, "num_chars": 5}, {"sum_logits": -5.971932411193848, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -12.706541061401367, "logits_per_token": -5.971932411193848, "logits_per_char": -0.746491551399231, "num_chars": 8}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 783, "native_id": "Mercury_7116183", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 11.878995895385742, "incorrect_loss_raw": 17.50781758626302, "correct_loss_per_char": 0.3493822322172277, "incorrect_loss_per_char": 0.48050418738104844, "correct_loss_per_token": 2.3757991790771484, "incorrect_loss_per_token": 3.0737092759874134, "correct_loss_uncond": -14.836523056030273, "incorrect_loss_uncond": -12.785989125569662}, "model_output": [{"sum_logits": -14.016571044921875, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -27.688053131103516, "logits_per_token": -2.803314208984375, "logits_per_char": -0.41225208955652576, "num_chars": 34}, {"sum_logits": -11.878995895385742, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -26.715518951416016, "logits_per_token": -2.3757991790771484, "logits_per_char": -0.3493822322172277, "num_chars": 34}, {"sum_logits": -22.385601043701172, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -32.21165466308594, "logits_per_token": -3.730933507283529, "logits_per_char": -0.605016244424356, "num_chars": 37}, {"sum_logits": -16.121280670166016, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -30.981712341308594, "logits_per_token": -2.686880111694336, "logits_per_char": -0.4242442281622636, "num_chars": 38}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 784, "native_id": "Mercury_7106628", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 13.988451957702637, "incorrect_loss_raw": 17.022090911865234, "correct_loss_per_char": 0.5180908132482458, "incorrect_loss_per_char": 0.5820845405260722, "correct_loss_per_token": 2.7976903915405273, "incorrect_loss_per_token": 4.6234887970818415, "correct_loss_uncond": -15.108201026916504, "incorrect_loss_uncond": -11.087871551513672}, "model_output": [{"sum_logits": -20.97353172302246, "num_tokens": 4, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -31.14881134033203, "logits_per_token": -5.243382930755615, "logits_per_char": -0.6554228663444519, "num_chars": 32}, {"sum_logits": -10.528726577758789, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -24.858856201171875, "logits_per_token": -2.105745315551758, "logits_per_char": -0.4386969407399495, "num_chars": 24}, {"sum_logits": -19.564014434814453, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -28.322219848632812, "logits_per_token": -6.521338144938151, "logits_per_char": -0.6521338144938151, "num_chars": 30}, {"sum_logits": -13.988451957702637, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -29.09665298461914, "logits_per_token": -2.7976903915405273, "logits_per_char": -0.5180908132482458, "num_chars": 27}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 785, "native_id": "Mercury_7203473", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 20.44823455810547, "incorrect_loss_raw": 23.014474232991535, "correct_loss_per_char": 0.33521695996894213, "incorrect_loss_per_char": 0.49123332190816793, "correct_loss_per_token": 1.8589304143732244, "incorrect_loss_per_token": 2.646844740267153, "correct_loss_uncond": -15.429920196533203, "incorrect_loss_uncond": -16.752615610758465}, "model_output": [{"sum_logits": -19.371082305908203, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -36.595680236816406, "logits_per_token": -2.4213852882385254, "logits_per_char": -0.47246542209532205, "num_chars": 41}, {"sum_logits": -28.7022705078125, "num_tokens": 9, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -40.080352783203125, "logits_per_token": -3.1891411675347223, "logits_per_char": -0.5979639689127604, "num_chars": 48}, {"sum_logits": -20.970069885253906, "num_tokens": 9, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -42.62523651123047, "logits_per_token": -2.3300077650282116, "logits_per_char": -0.40327057471642125, "num_chars": 52}, {"sum_logits": -20.44823455810547, "num_tokens": 11, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -35.87815475463867, "logits_per_token": -1.8589304143732244, "logits_per_char": -0.33521695996894213, "num_chars": 61}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 786, "native_id": "Mercury_SC_416108", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.730001449584961, "incorrect_loss_raw": 7.123436292012532, "correct_loss_per_char": 0.51210533945184, "incorrect_loss_per_char": 0.3634387939601596, "correct_loss_per_token": 3.2433338165283203, "incorrect_loss_per_token": 2.3744787640041776, "correct_loss_uncond": -11.257568359375, "incorrect_loss_uncond": -13.744043032328287}, "model_output": [{"sum_logits": -5.2537407875061035, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -19.44900131225586, "logits_per_token": -1.7512469291687012, "logits_per_char": -0.276512673026637, "num_chars": 19}, {"sum_logits": -4.136566638946533, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -19.311609268188477, "logits_per_token": -1.378855546315511, "logits_per_char": -0.24332744934979608, "num_chars": 17}, {"sum_logits": -9.730001449584961, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -20.98756980895996, "logits_per_token": -3.2433338165283203, "logits_per_char": -0.51210533945184, "num_chars": 19}, {"sum_logits": -11.980001449584961, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -23.841827392578125, "logits_per_token": -3.9933338165283203, "logits_per_char": -0.5704762595040458, "num_chars": 21}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 787, "native_id": "LEAP_2007_8_10418", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.313590049743652, "incorrect_loss_raw": 19.019766489664715, "correct_loss_per_char": 0.34137410383958083, "incorrect_loss_per_char": 0.45996376988749027, "correct_loss_per_token": 1.9019414356776647, "incorrect_loss_per_token": 2.7733666594066317, "correct_loss_uncond": -11.5051908493042, "incorrect_loss_uncond": -14.071358998616537}, "model_output": [{"sum_logits": -15.261007308959961, "num_tokens": 7, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -30.320396423339844, "logits_per_token": -2.1801439012799944, "logits_per_char": -0.3391334957546658, "num_chars": 45}, {"sum_logits": -21.964069366455078, "num_tokens": 6, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -34.66569519042969, "logits_per_token": -3.660678227742513, "logits_per_char": -0.6275448390415737, "num_chars": 35}, {"sum_logits": -13.313590049743652, "num_tokens": 7, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -24.81878089904785, "logits_per_token": -1.9019414356776647, "logits_per_char": -0.34137410383958083, "num_chars": 39}, {"sum_logits": -19.8342227935791, "num_tokens": 8, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -34.28728485107422, "logits_per_token": -2.4792778491973877, "logits_per_char": -0.41321297486623126, "num_chars": 48}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 788, "native_id": "Mercury_7111178", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 38.1442985534668, "incorrect_loss_raw": 34.06533686319987, "correct_loss_per_char": 0.6253163697289639, "incorrect_loss_per_char": 0.5919099654300689, "correct_loss_per_token": 2.724592753819057, "incorrect_loss_per_token": 2.8553644564562224, "correct_loss_uncond": -15.533802032470703, "incorrect_loss_uncond": -17.22742462158203}, "model_output": [{"sum_logits": -37.64698791503906, "num_tokens": 10, "num_tokens_all": 253, "is_greedy": false, "sum_logits_uncond": -50.76150894165039, "logits_per_token": -3.764698791503906, "logits_per_char": -0.7103205266988503, "num_chars": 53}, {"sum_logits": -29.832504272460938, "num_tokens": 14, "num_tokens_all": 257, "is_greedy": false, "sum_logits_uncond": -50.60155487060547, "logits_per_token": -2.1308931623186385, "logits_per_char": -0.5143535219389817, "num_chars": 58}, {"sum_logits": -38.1442985534668, "num_tokens": 14, "num_tokens_all": 257, "is_greedy": false, "sum_logits_uncond": -53.6781005859375, "logits_per_token": -2.724592753819057, "logits_per_char": -0.6253163697289639, "num_chars": 61}, {"sum_logits": -34.71651840209961, "num_tokens": 13, "num_tokens_all": 256, "is_greedy": false, "sum_logits_uncond": -52.515220642089844, "logits_per_token": -2.670501415546124, "logits_per_char": -0.5510558476523747, "num_chars": 63}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 789, "native_id": "Mercury_7203560", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 24.395408630371094, "incorrect_loss_raw": 23.743650436401367, "correct_loss_per_char": 0.5190512474547041, "incorrect_loss_per_char": 0.5089749302526917, "correct_loss_per_token": 2.0329507191975913, "incorrect_loss_per_token": 2.5726525574822214, "correct_loss_uncond": -25.60169219970703, "incorrect_loss_uncond": -19.00463803609212}, "model_output": [{"sum_logits": -24.395408630371094, "num_tokens": 12, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -49.997100830078125, "logits_per_token": -2.0329507191975913, "logits_per_char": -0.5190512474547041, "num_chars": 47}, {"sum_logits": -22.675825119018555, "num_tokens": 8, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -42.58758544921875, "logits_per_token": -2.8344781398773193, "logits_per_char": -0.5153596617958762, "num_chars": 44}, {"sum_logits": -25.32095718383789, "num_tokens": 11, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -44.848812103271484, "logits_per_token": -2.3019051985307173, "logits_per_char": -0.527519941329956, "num_chars": 48}, {"sum_logits": -23.234169006347656, "num_tokens": 9, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -40.808467864990234, "logits_per_token": -2.5815743340386286, "logits_per_char": -0.48404518763224286, "num_chars": 48}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 790, "native_id": "ACTAAP_2013_7_2", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 18.12127113342285, "incorrect_loss_raw": 24.81254514058431, "correct_loss_per_char": 0.3294776569713246, "incorrect_loss_per_char": 0.4407496778986906, "correct_loss_per_token": 1.3939439333402193, "incorrect_loss_per_token": 2.1324310969639493, "correct_loss_uncond": -14.537927627563477, "incorrect_loss_uncond": -16.0034974416097}, "model_output": [{"sum_logits": -18.12127113342285, "num_tokens": 13, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -32.65919876098633, "logits_per_token": -1.3939439333402193, "logits_per_char": -0.3294776569713246, "num_chars": 55}, {"sum_logits": -25.673171997070312, "num_tokens": 11, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -42.28661346435547, "logits_per_token": -2.333924727006392, "logits_per_char": -0.45844949994768414, "num_chars": 56}, {"sum_logits": -26.438159942626953, "num_tokens": 13, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -42.6965217590332, "logits_per_token": -2.033704610971304, "logits_per_char": -0.4721099989754813, "num_chars": 56}, {"sum_logits": -22.326303482055664, "num_tokens": 11, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -37.46499252319336, "logits_per_token": -2.0296639529141514, "logits_per_char": -0.3916895347729064, "num_chars": 57}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 791, "native_id": "MCAS_2012_8_23640", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 1.9550938606262207, "incorrect_loss_raw": 2.7095173597335815, "correct_loss_per_char": 0.19550938606262208, "incorrect_loss_per_char": 0.23770507778784244, "correct_loss_per_token": 1.9550938606262207, "incorrect_loss_per_token": 2.7095173597335815, "correct_loss_uncond": -11.057742595672607, "incorrect_loss_uncond": -12.1911491950353}, "model_output": [{"sum_logits": -1.8831013441085815, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": true, "sum_logits_uncond": -16.4089298248291, "logits_per_token": -1.8831013441085815, "logits_per_char": -0.17119103128259833, "num_chars": 11}, {"sum_logits": -2.8340365886688232, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -14.035669326782227, "logits_per_token": -2.8340365886688232, "logits_per_char": -0.25763968987898395, "num_chars": 11}, {"sum_logits": -3.41141414642334, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -14.257400512695312, "logits_per_token": -3.41141414642334, "logits_per_char": -0.284284512201945, "num_chars": 12}, {"sum_logits": -1.9550938606262207, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -13.012836456298828, "logits_per_token": -1.9550938606262207, "logits_per_char": -0.19550938606262208, "num_chars": 10}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 792, "native_id": "Mercury_404272", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.14515686035156, "incorrect_loss_raw": 20.097524007161457, "correct_loss_per_char": 0.4945408747746394, "incorrect_loss_per_char": 0.3400467075817138, "correct_loss_per_token": 2.6787630716959634, "incorrect_loss_per_token": 1.6915953060937305, "correct_loss_uncond": -11.086135864257812, "incorrect_loss_uncond": -11.090136845906576}, "model_output": [{"sum_logits": -17.599031448364258, "num_tokens": 14, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -26.340465545654297, "logits_per_token": -1.2570736748831612, "logits_per_char": -0.2550584267878878, "num_chars": 69}, {"sum_logits": -32.14515686035156, "num_tokens": 12, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -43.231292724609375, "logits_per_token": -2.6787630716959634, "logits_per_char": -0.4945408747746394, "num_chars": 65}, {"sum_logits": -27.098508834838867, "num_tokens": 12, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -40.46141052246094, "logits_per_token": -2.2582090695699057, "logits_per_char": -0.45929675991252317, "num_chars": 59}, {"sum_logits": -15.59503173828125, "num_tokens": 10, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -26.761106491088867, "logits_per_token": -1.559503173828125, "logits_per_char": -0.3057849360447304, "num_chars": 51}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 793, "native_id": "MCAS_2009_8_17", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 5.877846717834473, "incorrect_loss_raw": 8.208781878153482, "correct_loss_per_char": 0.3457556892843807, "incorrect_loss_per_char": 0.3792405227378563, "correct_loss_per_token": 1.9592822392781575, "incorrect_loss_per_token": 2.0521954695383706, "correct_loss_uncond": -16.815186500549316, "incorrect_loss_uncond": -10.70989958445231}, "model_output": [{"sum_logits": -9.81435489654541, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -20.388933181762695, "logits_per_token": -2.4535887241363525, "logits_per_char": -0.5452419386969672, "num_chars": 18}, {"sum_logits": -5.877846717834473, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -22.69303321838379, "logits_per_token": -1.9592822392781575, "logits_per_char": -0.3457556892843807, "num_chars": 17}, {"sum_logits": -11.790042877197266, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -19.687755584716797, "logits_per_token": -2.9475107192993164, "logits_per_char": -0.4716017150878906, "num_chars": 25}, {"sum_logits": -3.0219478607177734, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": true, "sum_logits_uncond": -16.67935562133789, "logits_per_token": -0.7554869651794434, "logits_per_char": -0.12087791442871093, "num_chars": 25}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 794, "native_id": "AIMS_2008_4_5", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.3964786529541, "incorrect_loss_raw": 13.3280242284139, "correct_loss_per_char": 0.4977974181479596, "incorrect_loss_per_char": 0.2910744040739591, "correct_loss_per_token": 3.8994131088256836, "incorrect_loss_per_token": 1.6730483726218894, "correct_loss_uncond": -11.668561935424805, "incorrect_loss_uncond": -16.47235902150472}, "model_output": [{"sum_logits": -9.89013957977295, "num_tokens": 7, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -27.03151512145996, "logits_per_token": -1.412877082824707, "logits_per_char": -0.2247758995402943, "num_chars": 44}, {"sum_logits": -18.89983367919922, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -30.569705963134766, "logits_per_token": -2.3624792098999023, "logits_per_char": -0.4199963039822049, "num_chars": 45}, {"sum_logits": -23.3964786529541, "num_tokens": 6, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -35.065040588378906, "logits_per_token": -3.8994131088256836, "logits_per_char": -0.4977974181479596, "num_chars": 47}, {"sum_logits": -11.194099426269531, "num_tokens": 9, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -31.799928665161133, "logits_per_token": -1.243788825141059, "logits_per_char": -0.22845100869937818, "num_chars": 49}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 795, "native_id": "Mercury_7236513", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.555027008056641, "incorrect_loss_raw": 7.364046414693196, "correct_loss_per_char": 0.7283363342285156, "incorrect_loss_per_char": 0.6588459213574728, "correct_loss_per_token": 6.555027008056641, "incorrect_loss_per_token": 4.533651510874431, "correct_loss_uncond": -5.787111282348633, "incorrect_loss_uncond": -9.422858874003092}, "model_output": [{"sum_logits": -8.545143127441406, "num_tokens": 2, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -19.55876922607422, "logits_per_token": -4.272571563720703, "logits_per_char": -0.4272571563720703, "num_chars": 20}, {"sum_logits": -8.437226295471191, "num_tokens": 2, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -18.809370040893555, "logits_per_token": -4.218613147735596, "logits_per_char": -0.5273266434669495, "num_chars": 16}, {"sum_logits": -5.109769821166992, "num_tokens": 1, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -11.992576599121094, "logits_per_token": -5.109769821166992, "logits_per_char": -1.0219539642333983, "num_chars": 5}, {"sum_logits": -6.555027008056641, "num_tokens": 1, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -12.342138290405273, "logits_per_token": -6.555027008056641, "logits_per_char": -0.7283363342285156, "num_chars": 9}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 796, "native_id": "Mercury_SC_LBS10027", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 12.673665046691895, "incorrect_loss_raw": 17.11328188578288, "correct_loss_per_char": 0.27551445753678033, "incorrect_loss_per_char": 0.344324745214789, "correct_loss_per_token": 1.2673665046691895, "incorrect_loss_per_token": 1.7113281885782878, "correct_loss_uncond": -17.756916999816895, "incorrect_loss_uncond": -12.947301864624023}, "model_output": [{"sum_logits": -14.403627395629883, "num_tokens": 10, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -28.861099243164062, "logits_per_token": -1.4403627395629883, "logits_per_char": -0.3273551680824973, "num_chars": 44}, {"sum_logits": -12.673665046691895, "num_tokens": 10, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -30.43058204650879, "logits_per_token": -1.2673665046691895, "logits_per_char": -0.27551445753678033, "num_chars": 46}, {"sum_logits": -18.04914665222168, "num_tokens": 10, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -30.86637306213379, "logits_per_token": -1.804914665222168, "logits_per_char": -0.36834993167799346, "num_chars": 49}, {"sum_logits": -18.88707160949707, "num_tokens": 10, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -30.45427894592285, "logits_per_token": -1.888707160949707, "logits_per_char": -0.33726913588387625, "num_chars": 56}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 797, "native_id": "Mercury_189053", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 22.43267059326172, "incorrect_loss_raw": 16.530142148335774, "correct_loss_per_char": 0.4578096039441167, "incorrect_loss_per_char": 0.43745279406125964, "correct_loss_per_token": 1.8693892161051433, "incorrect_loss_per_token": 2.004601676054675, "correct_loss_uncond": -8.50088119506836, "incorrect_loss_uncond": -7.170863787333171}, "model_output": [{"sum_logits": -11.250710487365723, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -17.030994415283203, "logits_per_token": -1.8751184145609539, "logits_per_char": -0.4166929810135453, "num_chars": 27}, {"sum_logits": -12.575214385986328, "num_tokens": 7, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -20.575708389282227, "logits_per_token": -1.7964591979980469, "logits_per_char": -0.3698592466466567, "num_chars": 34}, {"sum_logits": -25.764501571655273, "num_tokens": 11, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -33.496315002441406, "logits_per_token": -2.342227415605025, "logits_per_char": -0.525806154523577, "num_chars": 49}, {"sum_logits": -22.43267059326172, "num_tokens": 12, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -30.933551788330078, "logits_per_token": -1.8693892161051433, "logits_per_char": -0.4578096039441167, "num_chars": 49}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 798, "native_id": "Mercury_SC_414271", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.637901306152344, "incorrect_loss_raw": 21.65706507364909, "correct_loss_per_char": 0.5521439342963986, "incorrect_loss_per_char": 0.5237348654969335, "correct_loss_per_token": 2.2637901306152344, "incorrect_loss_per_token": 2.496896814416956, "correct_loss_uncond": -16.666362762451172, "incorrect_loss_uncond": -16.93517303466797}, "model_output": [{"sum_logits": -19.560150146484375, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -36.35298156738281, "logits_per_token": -2.445018768310547, "logits_per_char": -0.48900375366210935, "num_chars": 40}, {"sum_logits": -23.033618927001953, "num_tokens": 9, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -40.33317947387695, "logits_per_token": -2.559290991889106, "logits_per_char": -0.5617955835854135, "num_chars": 41}, {"sum_logits": -22.637901306152344, "num_tokens": 10, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -39.304264068603516, "logits_per_token": -2.2637901306152344, "logits_per_char": -0.5521439342963986, "num_chars": 41}, {"sum_logits": -22.377426147460938, "num_tokens": 9, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -39.090553283691406, "logits_per_token": -2.486380683051215, "logits_per_char": -0.5204052592432776, "num_chars": 43}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 799, "native_id": "Mercury_408922", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 33.55693817138672, "incorrect_loss_raw": 32.6846071879069, "correct_loss_per_char": 0.5592823028564453, "incorrect_loss_per_char": 0.5620241153263296, "correct_loss_per_token": 3.050630742853338, "incorrect_loss_per_token": 3.4847199651930065, "correct_loss_uncond": -13.247840881347656, "incorrect_loss_uncond": -10.747571309407553}, "model_output": [{"sum_logits": -31.727771759033203, "num_tokens": 8, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -44.45886993408203, "logits_per_token": -3.9659714698791504, "logits_per_char": -0.6101494569044846, "num_chars": 52}, {"sum_logits": -34.59663391113281, "num_tokens": 9, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -42.02039337158203, "logits_per_token": -3.8440704345703125, "logits_per_char": -0.6290297074751421, "num_chars": 55}, {"sum_logits": -33.55693817138672, "num_tokens": 11, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -46.804779052734375, "logits_per_token": -3.050630742853338, "logits_per_char": -0.5592823028564453, "num_chars": 60}, {"sum_logits": -31.729415893554688, "num_tokens": 12, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -43.8172721862793, "logits_per_token": -2.644117991129557, "logits_per_char": -0.4468931815993618, "num_chars": 71}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 800, "native_id": "Mercury_7264093", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.289944648742676, "incorrect_loss_raw": 2.5980700254440308, "correct_loss_per_char": 0.7557063783918109, "incorrect_loss_per_char": 0.33839657508506976, "correct_loss_per_token": 5.289944648742676, "incorrect_loss_per_token": 2.5980700254440308, "correct_loss_uncond": -6.211440086364746, "incorrect_loss_uncond": -9.486960848172506}, "model_output": [{"sum_logits": -1.8422385454177856, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": true, "sum_logits_uncond": -11.02944564819336, "logits_per_token": -1.8422385454177856, "logits_per_char": -0.26317693505968365, "num_chars": 7}, {"sum_logits": -3.0954689979553223, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -11.72271728515625, "logits_per_token": -3.0954689979553223, "logits_per_char": -0.34394099977281356, "num_chars": 9}, {"sum_logits": -5.289944648742676, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -11.501384735107422, "logits_per_token": -5.289944648742676, "logits_per_char": -0.7557063783918109, "num_chars": 7}, {"sum_logits": -2.8565025329589844, "num_tokens": 1, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -13.5029296875, "logits_per_token": -2.8565025329589844, "logits_per_char": -0.4080717904227121, "num_chars": 7}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 801, "native_id": "Mercury_SC_LBS11009", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.88563346862793, "incorrect_loss_raw": 8.93389368057251, "correct_loss_per_char": 0.821114498635997, "incorrect_loss_per_char": 0.375333187753359, "correct_loss_per_token": 3.777126693725586, "incorrect_loss_per_token": 2.0946163415908816, "correct_loss_uncond": -10.406450271606445, "incorrect_loss_uncond": -14.921951452891031}, "model_output": [{"sum_logits": -10.838456153869629, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -28.604782104492188, "logits_per_token": -2.7096140384674072, "logits_per_char": -0.416863698225755, "num_chars": 26}, {"sum_logits": -7.631800174713135, "num_tokens": 4, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.275569915771484, "logits_per_token": -1.9079500436782837, "logits_per_char": -0.3469000079415061, "num_chars": 22}, {"sum_logits": -18.88563346862793, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.292083740234375, "logits_per_token": -3.777126693725586, "logits_per_char": -0.821114498635997, "num_chars": 23}, {"sum_logits": -8.331424713134766, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -21.687183380126953, "logits_per_token": -1.666284942626953, "logits_per_char": -0.3622358570928159, "num_chars": 23}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 802, "native_id": "Mercury_7191433", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.89063549041748, "incorrect_loss_raw": 14.391565958658854, "correct_loss_per_char": 0.4540181568690709, "incorrect_loss_per_char": 0.50601647357725, "correct_loss_per_token": 2.270090784345354, "incorrect_loss_per_token": 2.730776362948948, "correct_loss_uncond": -18.92747974395752, "incorrect_loss_uncond": -11.859118143717447}, "model_output": [{"sum_logits": -11.958553314208984, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -21.25379180908203, "logits_per_token": -2.989638328552246, "logits_per_char": -0.519937100617782, "num_chars": 23}, {"sum_logits": -10.740272521972656, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -23.992938995361328, "logits_per_token": -1.7900454203287761, "logits_per_char": -0.413087404691256, "num_chars": 26}, {"sum_logits": -15.89063549041748, "num_tokens": 7, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -34.818115234375, "logits_per_token": -2.270090784345354, "logits_per_char": -0.4540181568690709, "num_chars": 35}, {"sum_logits": -20.475872039794922, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -33.50532150268555, "logits_per_token": -3.4126453399658203, "logits_per_char": -0.5850249154227121, "num_chars": 35}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 803, "native_id": "MEAP_2005_5_14", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.465557098388672, "incorrect_loss_raw": 24.383087793986004, "correct_loss_per_char": 0.8233934350915857, "incorrect_loss_per_char": 0.7731377513238972, "correct_loss_per_token": 3.808194637298584, "incorrect_loss_per_token": 3.1595093522753035, "correct_loss_uncond": -20.214584350585938, "incorrect_loss_uncond": -17.771615982055664}, "model_output": [{"sum_logits": -26.871490478515625, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -43.30961608886719, "logits_per_token": -3.358936309814453, "logits_per_char": -0.8957163492838541, "num_chars": 30}, {"sum_logits": -27.52504539489746, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -43.44009780883789, "logits_per_token": -3.4406306743621826, "logits_per_char": -0.8879046901579826, "num_chars": 31}, {"sum_logits": -18.752727508544922, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -39.71439743041992, "logits_per_token": -2.6789610726492747, "logits_per_char": -0.5357922145298549, "num_chars": 35}, {"sum_logits": -30.465557098388672, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -50.68014144897461, "logits_per_token": -3.808194637298584, "logits_per_char": -0.8233934350915857, "num_chars": 37}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 804, "native_id": "Mercury_416683", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.598926544189453, "incorrect_loss_raw": 29.454790751139324, "correct_loss_per_char": 0.5106951168605259, "incorrect_loss_per_char": 0.5332892913481421, "correct_loss_per_token": 2.3832438786824546, "incorrect_loss_per_token": 2.945479075113932, "correct_loss_uncond": -15.243064880371094, "incorrect_loss_uncond": -16.773690541585285}, "model_output": [{"sum_logits": -28.874835968017578, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -46.10485076904297, "logits_per_token": -2.887483596801758, "logits_per_char": -0.5448082258116524, "num_chars": 53}, {"sum_logits": -31.23859405517578, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -51.291343688964844, "logits_per_token": -3.123859405517578, "logits_per_char": -0.5679744373668324, "num_chars": 55}, {"sum_logits": -28.598926544189453, "num_tokens": 12, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -43.84199142456055, "logits_per_token": -2.3832438786824546, "logits_per_char": -0.5106951168605259, "num_chars": 56}, {"sum_logits": -28.25094223022461, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -41.289249420166016, "logits_per_token": -2.825094223022461, "logits_per_char": -0.48708521086594153, "num_chars": 58}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 805, "native_id": "Mercury_7040775", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.3209517002105713, "incorrect_loss_raw": 13.873376528422037, "correct_loss_per_char": 0.19535010001238653, "incorrect_loss_per_char": 0.6985447405095685, "correct_loss_per_token": 1.1069839000701904, "incorrect_loss_per_token": 4.415769100189209, "correct_loss_uncond": -19.372081518173218, "incorrect_loss_uncond": -11.571457227071127}, "model_output": [{"sum_logits": -19.342018127441406, "num_tokens": 3, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -27.544456481933594, "logits_per_token": -6.447339375813802, "logits_per_char": -1.0180009540758634, "num_chars": 19}, {"sum_logits": -3.3209517002105713, "num_tokens": 3, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -22.69303321838379, "logits_per_token": -1.1069839000701904, "logits_per_char": -0.19535010001238653, "num_chars": 17}, {"sum_logits": -14.765280723571777, "num_tokens": 3, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -25.21633529663086, "logits_per_token": -4.921760241190593, "logits_per_char": -0.7771200380827251, "num_chars": 19}, {"sum_logits": -7.51283073425293, "num_tokens": 4, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -23.57370948791504, "logits_per_token": -1.8782076835632324, "logits_per_char": -0.3005132293701172, "num_chars": 25}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 806, "native_id": "Mercury_7222600", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.3544864654541, "incorrect_loss_raw": 16.6076602935791, "correct_loss_per_char": 0.530783783305775, "incorrect_loss_per_char": 0.5746122003394089, "correct_loss_per_token": 3.892414410909017, "incorrect_loss_per_token": 3.3170995500352647, "correct_loss_uncond": -11.69276237487793, "incorrect_loss_uncond": -12.261730829874674}, "model_output": [{"sum_logits": -8.273468017578125, "num_tokens": 4, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -20.796642303466797, "logits_per_token": -2.0683670043945312, "logits_per_char": -0.330938720703125, "num_chars": 25}, {"sum_logits": -28.740385055541992, "num_tokens": 5, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -35.87767028808594, "logits_per_token": -5.748077011108398, "logits_per_char": -1.0644587057608146, "num_chars": 27}, {"sum_logits": -23.3544864654541, "num_tokens": 6, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -35.04724884033203, "logits_per_token": -3.892414410909017, "logits_per_char": -0.530783783305775, "num_chars": 44}, {"sum_logits": -12.809127807617188, "num_tokens": 6, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -29.933860778808594, "logits_per_token": -2.1348546346028647, "logits_per_char": -0.32843917455428684, "num_chars": 39}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 807, "native_id": "MCAS_2001_5_3", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.168235778808594, "incorrect_loss_raw": 5.652180830637614, "correct_loss_per_char": 0.6168235778808594, "incorrect_loss_per_char": 0.6224328562744662, "correct_loss_per_token": 6.168235778808594, "incorrect_loss_per_token": 4.06864054997762, "correct_loss_uncond": -7.320783615112305, "incorrect_loss_uncond": -8.572125911712646}, "model_output": [{"sum_logits": -3.989546298980713, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -14.053781509399414, "logits_per_token": -3.989546298980713, "logits_per_char": -0.4432829221089681, "num_chars": 9}, {"sum_logits": -9.501241683959961, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -15.647985458374023, "logits_per_token": -4.7506208419799805, "logits_per_char": -0.730864744919997, "num_chars": 13}, {"sum_logits": -3.465754508972168, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -12.971153259277344, "logits_per_token": -3.465754508972168, "logits_per_char": -0.6931509017944336, "num_chars": 5}, {"sum_logits": -6.168235778808594, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -13.489019393920898, "logits_per_token": -6.168235778808594, "logits_per_char": -0.6168235778808594, "num_chars": 10}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 808, "native_id": "MCAS_2004_8_7", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 47.78968048095703, "incorrect_loss_raw": 38.62841288248698, "correct_loss_per_char": 0.673094091281085, "incorrect_loss_per_char": 0.6324006020007095, "correct_loss_per_token": 3.1859786987304686, "incorrect_loss_per_token": 3.341428019857814, "correct_loss_uncond": -20.979820251464844, "incorrect_loss_uncond": -21.49316914876302}, "model_output": [{"sum_logits": -37.50419998168945, "num_tokens": 10, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -61.592559814453125, "logits_per_token": -3.7504199981689452, "logits_per_char": -0.7212346150324895, "num_chars": 52}, {"sum_logits": -40.230709075927734, "num_tokens": 13, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -59.755496978759766, "logits_per_token": -3.094669928917518, "logits_per_char": -0.5293514352095755, "num_chars": 76}, {"sum_logits": -38.15032958984375, "num_tokens": 12, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -59.01668930053711, "logits_per_token": -3.179194132486979, "logits_per_char": -0.6466157557600636, "num_chars": 59}, {"sum_logits": -47.78968048095703, "num_tokens": 15, "num_tokens_all": 241, "is_greedy": false, "sum_logits_uncond": -68.76950073242188, "logits_per_token": -3.1859786987304686, "logits_per_char": -0.673094091281085, "num_chars": 71}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 809, "native_id": "Mercury_415268", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.66452407836914, "incorrect_loss_raw": 13.44412612915039, "correct_loss_per_char": 0.36931146157754435, "incorrect_loss_per_char": 0.44688665477139394, "correct_loss_per_token": 1.7080655097961426, "incorrect_loss_per_token": 2.2666320467752126, "correct_loss_uncond": -20.126529693603516, "incorrect_loss_uncond": -11.161073684692383}, "model_output": [{"sum_logits": -11.59019660949707, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -24.147991180419922, "logits_per_token": -1.9316994349161785, "logits_per_char": -0.41393559319632395, "num_chars": 28}, {"sum_logits": -13.337987899780273, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -21.94110870361328, "logits_per_token": -2.6675975799560545, "logits_per_char": -0.4599306172338025, "num_chars": 29}, {"sum_logits": -15.404193878173828, "num_tokens": 7, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -27.726499557495117, "logits_per_token": -2.200599125453404, "logits_per_char": -0.4667937538840554, "num_chars": 33}, {"sum_logits": -13.66452407836914, "num_tokens": 8, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -33.791053771972656, "logits_per_token": -1.7080655097961426, "logits_per_char": -0.36931146157754435, "num_chars": 37}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 810, "native_id": "Mercury_7017710", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 9.390851974487305, "incorrect_loss_raw": 11.831133524576822, "correct_loss_per_char": 0.2347712993621826, "incorrect_loss_per_char": 0.36522638124132917, "correct_loss_per_token": 1.173856496810913, "incorrect_loss_per_token": 1.8568408789458097, "correct_loss_uncond": -19.23397445678711, "incorrect_loss_uncond": -17.822532653808594}, "model_output": [{"sum_logits": -6.143949508666992, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -27.075931549072266, "logits_per_token": -1.535987377166748, "logits_per_char": -0.255997896194458, "num_chars": 24}, {"sum_logits": -8.70170783996582, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -24.90825653076172, "logits_per_token": -1.740341567993164, "logits_per_char": -0.34806831359863283, "num_chars": 25}, {"sum_logits": -9.390851974487305, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.624826431274414, "logits_per_token": -1.173856496810913, "logits_per_char": -0.2347712993621826, "num_chars": 40}, {"sum_logits": -20.647743225097656, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -36.976810455322266, "logits_per_token": -2.2941936916775174, "logits_per_char": -0.49161293393089656, "num_chars": 42}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 811, "native_id": "Mercury_7210123", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.370061874389648, "incorrect_loss_raw": 10.74121618270874, "correct_loss_per_char": 0.671257734298706, "incorrect_loss_per_char": 0.6797215586034661, "correct_loss_per_token": 2.685030937194824, "incorrect_loss_per_token": 4.211221112145318, "correct_loss_uncond": -7.618987083435059, "incorrect_loss_uncond": -6.9242485364278155}, "model_output": [{"sum_logits": -5.370061874389648, "num_tokens": 2, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -12.989048957824707, "logits_per_token": -2.685030937194824, "logits_per_char": -0.671257734298706, "num_chars": 8}, {"sum_logits": -3.66837739944458, "num_tokens": 3, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -16.541784286499023, "logits_per_token": -1.2227924664815266, "logits_per_char": -0.30569811662038165, "num_chars": 12}, {"sum_logits": -11.354682922363281, "num_tokens": 2, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -14.1556978225708, "logits_per_token": -5.677341461181641, "logits_per_char": -0.8734371478740985, "num_chars": 13}, {"sum_logits": -17.20058822631836, "num_tokens": 3, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -22.298912048339844, "logits_per_token": -5.733529408772786, "logits_per_char": -0.8600294113159179, "num_chars": 20}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 812, "native_id": "MCAS_2009_5_6519", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.15363073348999, "incorrect_loss_raw": 12.291255633036295, "correct_loss_per_char": 0.24541098730904715, "incorrect_loss_per_char": 0.708946629276982, "correct_loss_per_token": 1.030726146697998, "incorrect_loss_per_token": 3.8865890608893507, "correct_loss_uncond": -20.167831897735596, "incorrect_loss_uncond": -9.020042737325033}, "model_output": [{"sum_logits": -9.552458763122559, "num_tokens": 2, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -18.611854553222656, "logits_per_token": -4.776229381561279, "logits_per_char": -0.8684053421020508, "num_chars": 11}, {"sum_logits": -10.644571304321289, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -22.596105575561523, "logits_per_token": -3.548190434773763, "logits_per_char": -0.5913650724622939, "num_chars": 18}, {"sum_logits": -5.15363073348999, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -25.321462631225586, "logits_per_token": -1.030726146697998, "logits_per_char": -0.24541098730904715, "num_chars": 21}, {"sum_logits": -16.67673683166504, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -22.725934982299805, "logits_per_token": -3.335347366333008, "logits_per_char": -0.6670694732666016, "num_chars": 25}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 813, "native_id": "Mercury_401502", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.422788619995117, "incorrect_loss_raw": 26.394664128621418, "correct_loss_per_char": 3.5528485774993896, "incorrect_loss_per_char": 3.6034971418834867, "correct_loss_per_token": 5.684557723999023, "incorrect_loss_per_token": 5.278932825724284, "correct_loss_uncond": 3.1037349700927734, "incorrect_loss_uncond": 3.615137736002604}, "model_output": [{"sum_logits": -26.34065055847168, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -21.965076446533203, "logits_per_token": -5.268130111694336, "logits_per_char": -3.7629500797816684, "num_chars": 7}, {"sum_logits": -24.758922576904297, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -21.097412109375, "logits_per_token": -4.95178451538086, "logits_per_char": -3.5369889395577565, "num_chars": 7}, {"sum_logits": -28.08441925048828, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -25.276090621948242, "logits_per_token": -5.616883850097656, "logits_per_char": -3.510552406311035, "num_chars": 8}, {"sum_logits": -28.422788619995117, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -25.319053649902344, "logits_per_token": -5.684557723999023, "logits_per_char": -3.5528485774993896, "num_chars": 8}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 814, "native_id": "Mercury_7109498", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 23.17046356201172, "incorrect_loss_raw": 34.80007807413737, "correct_loss_per_char": 0.49298858642578125, "incorrect_loss_per_char": 0.7092322693262892, "correct_loss_per_token": 3.3100662231445312, "incorrect_loss_per_token": 3.4922058643438874, "correct_loss_uncond": -18.551952362060547, "incorrect_loss_uncond": -11.862740834554037}, "model_output": [{"sum_logits": -33.77623748779297, "num_tokens": 8, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -40.635955810546875, "logits_per_token": -4.222029685974121, "logits_per_char": -0.8041961306617373, "num_chars": 42}, {"sum_logits": -23.17046356201172, "num_tokens": 7, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -41.722415924072266, "logits_per_token": -3.3100662231445312, "logits_per_char": -0.49298858642578125, "num_chars": 47}, {"sum_logits": -35.61882019042969, "num_tokens": 10, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -50.95404815673828, "logits_per_token": -3.5618820190429688, "logits_per_char": -0.6984082390280331, "num_chars": 51}, {"sum_logits": -35.00517654418945, "num_tokens": 13, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -48.39845275878906, "logits_per_token": -2.692705888014573, "logits_per_char": -0.6250924382890973, "num_chars": 56}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 815, "native_id": "VASoL_2008_5_10", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.20860481262207, "incorrect_loss_raw": 12.240961074829102, "correct_loss_per_char": 0.8622003702016977, "incorrect_loss_per_char": 0.7212886010016595, "correct_loss_per_token": 3.736201604207357, "incorrect_loss_per_token": 3.848270575205485, "correct_loss_uncond": -4.12896728515625, "incorrect_loss_uncond": -3.831183115641276}, "model_output": [{"sum_logits": -11.20860481262207, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -15.33757209777832, "logits_per_token": -3.736201604207357, "logits_per_char": -0.8622003702016977, "num_chars": 13}, {"sum_logits": -11.561203002929688, "num_tokens": 4, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -17.408370971679688, "logits_per_token": -2.890300750732422, "logits_per_char": -0.7225751876831055, "num_chars": 16}, {"sum_logits": -9.456363677978516, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -15.272211074829102, "logits_per_token": -4.728181838989258, "logits_per_char": -0.727412590613732, "num_chars": 13}, {"sum_logits": -15.705316543579102, "num_tokens": 4, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.535850524902344, "logits_per_token": -3.9263291358947754, "logits_per_char": -0.713878024708141, "num_chars": 22}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 816, "native_id": "MCAS_2006_9_4", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 20.44879913330078, "incorrect_loss_raw": 21.018490473429363, "correct_loss_per_char": 0.5381262929815995, "incorrect_loss_per_char": 0.34952708188212095, "correct_loss_per_token": 2.5560998916625977, "incorrect_loss_per_token": 1.400018569676563, "correct_loss_uncond": -19.308486938476562, "incorrect_loss_uncond": -22.693427403767902}, "model_output": [{"sum_logits": -20.44879913330078, "num_tokens": 8, "num_tokens_all": 262, "is_greedy": false, "sum_logits_uncond": -39.757286071777344, "logits_per_token": -2.5560998916625977, "logits_per_char": -0.5381262929815995, "num_chars": 38}, {"sum_logits": -13.725601196289062, "num_tokens": 11, "num_tokens_all": 265, "is_greedy": false, "sum_logits_uncond": -33.82229995727539, "logits_per_token": -1.2477819269353694, "logits_per_char": -0.3050133599175347, "num_chars": 45}, {"sum_logits": -14.573408126831055, "num_tokens": 12, "num_tokens_all": 266, "is_greedy": false, "sum_logits_uncond": -36.85713195800781, "logits_per_token": -1.2144506772359211, "logits_per_char": -0.3036126693089803, "num_chars": 48}, {"sum_logits": -34.75646209716797, "num_tokens": 20, "num_tokens_all": 274, "is_greedy": false, "sum_logits_uncond": -60.456321716308594, "logits_per_token": -1.7378231048583985, "logits_per_char": -0.4399552164198477, "num_chars": 79}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 817, "native_id": "Mercury_402341", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.314811706542969, "incorrect_loss_raw": 9.393874486287435, "correct_loss_per_char": 0.6209874471028646, "incorrect_loss_per_char": 0.626258299085829, "correct_loss_per_token": 1.8629623413085938, "incorrect_loss_per_token": 1.6784163686964249, "correct_loss_uncond": -10.479303359985352, "incorrect_loss_uncond": -11.816960652669271}, "model_output": [{"sum_logits": -9.022601127624512, "num_tokens": 6, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -17.713075637817383, "logits_per_token": -1.5037668546040852, "logits_per_char": -0.6015067418416341, "num_chars": 15}, {"sum_logits": -9.009666442871094, "num_tokens": 6, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -19.319435119628906, "logits_per_token": -1.5016110738118489, "logits_per_char": -0.6006444295247396, "num_chars": 15}, {"sum_logits": -10.1493558883667, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -26.599994659423828, "logits_per_token": -2.02987117767334, "logits_per_char": -0.6766237258911133, "num_chars": 15}, {"sum_logits": -9.314811706542969, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -19.79411506652832, "logits_per_token": -1.8629623413085938, "logits_per_char": -0.6209874471028646, "num_chars": 15}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 818, "native_id": "MCAS_2006_9_34", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.662836074829102, "incorrect_loss_raw": 8.353341261545816, "correct_loss_per_char": 1.5325672149658203, "incorrect_loss_per_char": 1.6231795083908809, "correct_loss_per_token": 3.831418037414551, "incorrect_loss_per_token": 3.3840838273366294, "correct_loss_uncond": -9.247182846069336, "incorrect_loss_uncond": -7.454970200856526}, "model_output": [{"sum_logits": -9.511041641235352, "num_tokens": 4, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -18.1871337890625, "logits_per_token": -2.377760410308838, "logits_per_char": -1.3587202344621931, "num_chars": 7}, {"sum_logits": -8.020437240600586, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -13.946453094482422, "logits_per_token": -4.010218620300293, "logits_per_char": -2.0051093101501465, "num_chars": 4}, {"sum_logits": -7.528544902801514, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -15.29134750366211, "logits_per_token": -3.764272451400757, "logits_per_char": -1.5057089805603028, "num_chars": 5}, {"sum_logits": -7.662836074829102, "num_tokens": 2, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -16.910018920898438, "logits_per_token": -3.831418037414551, "logits_per_char": -1.5325672149658203, "num_chars": 5}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 819, "native_id": "Mercury_7267715", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 27.501544952392578, "incorrect_loss_raw": 15.76910654703776, "correct_loss_per_char": 0.6707693890827459, "incorrect_loss_per_char": 0.5006589892913956, "correct_loss_per_token": 3.9287921360560825, "incorrect_loss_per_token": 2.4449584516898666, "correct_loss_uncond": -17.624656677246094, "incorrect_loss_uncond": -16.12699826558431}, "model_output": [{"sum_logits": -18.506444931030273, "num_tokens": 5, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -31.397245407104492, "logits_per_token": -3.7012889862060545, "logits_per_char": -0.7711018721262614, "num_chars": 24}, {"sum_logits": -13.654909133911133, "num_tokens": 7, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -28.743892669677734, "logits_per_token": -1.9507013048444475, "logits_per_char": -0.401614974526798, "num_chars": 34}, {"sum_logits": -27.501544952392578, "num_tokens": 7, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -45.12620162963867, "logits_per_token": -3.9287921360560825, "logits_per_char": -0.6707693890827459, "num_chars": 41}, {"sum_logits": -15.145965576171875, "num_tokens": 9, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -35.547176361083984, "logits_per_token": -1.6828850640190973, "logits_per_char": -0.32926012122112774, "num_chars": 46}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 820, "native_id": "Mercury_SC_413089", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.869827270507812, "incorrect_loss_raw": 20.296554883321125, "correct_loss_per_char": 0.4202977960759943, "incorrect_loss_per_char": 0.5202586002739514, "correct_loss_per_token": 1.5410919189453125, "incorrect_loss_per_token": 2.180310348228172, "correct_loss_uncond": -10.162857055664062, "incorrect_loss_uncond": -10.136022885640463}, "model_output": [{"sum_logits": -25.804588317871094, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -35.31243896484375, "logits_per_token": -2.867176479763455, "logits_per_char": -0.6293802028749047, "num_chars": 41}, {"sum_logits": -14.872223854064941, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -25.731246948242188, "logits_per_token": -1.652469317118327, "logits_per_char": -0.4131173292795817, "num_chars": 36}, {"sum_logits": -20.212852478027344, "num_tokens": 10, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -30.254047393798828, "logits_per_token": -2.0212852478027346, "logits_per_char": -0.5182782686673678, "num_chars": 39}, {"sum_logits": -13.869827270507812, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -24.032684326171875, "logits_per_token": -1.5410919189453125, "logits_per_char": -0.4202977960759943, "num_chars": 33}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 821, "native_id": "Mercury_SC_401656", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.321609497070312, "incorrect_loss_raw": 22.779917399088543, "correct_loss_per_char": 0.6764124090021307, "incorrect_loss_per_char": 0.6900954986207944, "correct_loss_per_token": 3.7202682495117188, "incorrect_loss_per_token": 3.4261813844953264, "correct_loss_uncond": -9.374330520629883, "incorrect_loss_uncond": -9.595634460449219}, "model_output": [{"sum_logits": -22.321609497070312, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -31.695940017700195, "logits_per_token": -3.7202682495117188, "logits_per_char": -0.6764124090021307, "num_chars": 33}, {"sum_logits": -21.10573959350586, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -26.55809783935547, "logits_per_token": -3.015105656215123, "logits_per_char": -0.6030211312430246, "num_chars": 35}, {"sum_logits": -21.660341262817383, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -32.26982116699219, "logits_per_token": -3.6100568771362305, "logits_per_char": -0.5854146287247941, "num_chars": 37}, {"sum_logits": -25.573671340942383, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -38.298736572265625, "logits_per_token": -3.6533816201346263, "logits_per_char": -0.881850735894565, "num_chars": 29}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 822, "native_id": "Mercury_407019", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 10.843714714050293, "incorrect_loss_raw": 21.11848799387614, "correct_loss_per_char": 0.23573292856631073, "incorrect_loss_per_char": 0.5327742263365433, "correct_loss_per_token": 1.3554643392562866, "incorrect_loss_per_token": 2.9193443684350875, "correct_loss_uncond": -16.131279945373535, "incorrect_loss_uncond": -13.295419375101725}, "model_output": [{"sum_logits": -7.880793571472168, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -23.686927795410156, "logits_per_token": -1.3134655952453613, "logits_per_char": -0.2251655306134905, "num_chars": 35}, {"sum_logits": -28.573087692260742, "num_tokens": 7, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -42.52153778076172, "logits_per_token": -4.0818696703229636, "logits_per_char": -0.7326432741605319, "num_chars": 39}, {"sum_logits": -26.901582717895508, "num_tokens": 8, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -37.03325653076172, "logits_per_token": -3.3626978397369385, "logits_per_char": -0.6405138742356074, "num_chars": 42}, {"sum_logits": -10.843714714050293, "num_tokens": 8, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -26.974994659423828, "logits_per_token": -1.3554643392562866, "logits_per_char": -0.23573292856631073, "num_chars": 46}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 823, "native_id": "Mercury_417128", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.8968563079834, "incorrect_loss_raw": 25.68781026204427, "correct_loss_per_char": 0.65674673427235, "incorrect_loss_per_char": 0.5291188906722385, "correct_loss_per_token": 3.612107038497925, "incorrect_loss_per_token": 2.7029231996247263, "correct_loss_uncond": -10.133584976196289, "incorrect_loss_uncond": -12.875197092692057}, "model_output": [{"sum_logits": -27.156726837158203, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -36.09092712402344, "logits_per_token": -3.3945908546447754, "logits_per_char": -0.6789181709289551, "num_chars": 40}, {"sum_logits": -28.8968563079834, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -39.03044128417969, "logits_per_token": -3.612107038497925, "logits_per_char": -0.65674673427235, "num_chars": 44}, {"sum_logits": -19.49262237548828, "num_tokens": 10, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -32.89468002319336, "logits_per_token": -1.9492622375488282, "logits_per_char": -0.37485812260554385, "num_chars": 52}, {"sum_logits": -30.414081573486328, "num_tokens": 11, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -46.70341491699219, "logits_per_token": -2.764916506680575, "logits_per_char": -0.5335803784822163, "num_chars": 57}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 824, "native_id": "Mercury_7081305", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.220745086669922, "incorrect_loss_raw": 12.921184857686361, "correct_loss_per_char": 0.3389934637607672, "incorrect_loss_per_char": 0.36917671021961024, "correct_loss_per_token": 1.6525931358337402, "incorrect_loss_per_token": 1.9004543452035811, "correct_loss_uncond": -15.281387329101562, "incorrect_loss_uncond": -17.23719310760498}, "model_output": [{"sum_logits": -12.542707443237305, "num_tokens": 7, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -34.04566192626953, "logits_per_token": -1.7918153490339006, "logits_per_char": -0.3583630698067801, "num_chars": 35}, {"sum_logits": -15.166603088378906, "num_tokens": 6, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -28.18345832824707, "logits_per_token": -2.5277671813964844, "logits_per_char": -0.4333315168108259, "num_chars": 35}, {"sum_logits": -11.054244041442871, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -28.246013641357422, "logits_per_token": -1.3817805051803589, "logits_per_char": -0.3158355440412249, "num_chars": 35}, {"sum_logits": -13.220745086669922, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -28.502132415771484, "logits_per_token": -1.6525931358337402, "logits_per_char": -0.3389934637607672, "num_chars": 39}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 825, "native_id": "NYSEDREGENTS_2015_8_3", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 19.715435028076172, "incorrect_loss_raw": 22.507687250773113, "correct_loss_per_char": 0.4928858757019043, "incorrect_loss_per_char": 0.5628608643217988, "correct_loss_per_token": 2.816490718296596, "incorrect_loss_per_token": 3.2282401228707935, "correct_loss_uncond": -15.995864868164062, "incorrect_loss_uncond": -12.251843134562174}, "model_output": [{"sum_logits": -21.645111083984375, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -35.27947235107422, "logits_per_token": -3.092158726283482, "logits_per_char": -0.5696081864206415, "num_chars": 38}, {"sum_logits": -19.715435028076172, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -35.711299896240234, "logits_per_token": -2.816490718296596, "logits_per_char": -0.4928858757019043, "num_chars": 40}, {"sum_logits": -25.29032325744629, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -35.81958770751953, "logits_per_token": -3.161290407180786, "logits_per_char": -0.6168371526206412, "num_chars": 41}, {"sum_logits": -20.587627410888672, "num_tokens": 6, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -33.17953109741211, "logits_per_token": -3.431271235148112, "logits_per_char": -0.5021372539241139, "num_chars": 41}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 826, "native_id": "MEA_2016_8_15", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 21.989604949951172, "incorrect_loss_raw": 24.879692713419598, "correct_loss_per_char": 0.4311687245088465, "incorrect_loss_per_char": 0.5121848875879266, "correct_loss_per_token": 1.8324670791625977, "incorrect_loss_per_token": 2.0733077261182995, "correct_loss_uncond": -11.659137725830078, "incorrect_loss_uncond": -11.22334098815918}, "model_output": [{"sum_logits": -21.989604949951172, "num_tokens": 12, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -33.64874267578125, "logits_per_token": -1.8324670791625977, "logits_per_char": -0.4311687245088465, "num_chars": 51}, {"sum_logits": -25.914390563964844, "num_tokens": 12, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -36.53007125854492, "logits_per_token": -2.1595325469970703, "logits_per_char": -0.551370011999252, "num_chars": 47}, {"sum_logits": -23.546194076538086, "num_tokens": 12, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -34.12055206298828, "logits_per_token": -1.962182839711507, "logits_per_char": -0.5009828526922997, "num_chars": 47}, {"sum_logits": -25.17849349975586, "num_tokens": 12, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -37.658477783203125, "logits_per_token": -2.0982077916463218, "logits_per_char": -0.4842017980722281, "num_chars": 52}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 827, "native_id": "ACTAAP_2015_7_9", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.633984565734863, "incorrect_loss_raw": 17.596485137939453, "correct_loss_per_char": 0.4598230754627901, "incorrect_loss_per_char": 0.5492579382496806, "correct_loss_per_token": 1.954248070716858, "incorrect_loss_per_token": 2.395260458900815, "correct_loss_uncond": -8.232100486755371, "incorrect_loss_uncond": -5.989879608154297}, "model_output": [{"sum_logits": -14.954927444458008, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -20.714773178100586, "logits_per_token": -2.136418206351144, "logits_per_char": -0.48241701433735507, "num_chars": 31}, {"sum_logits": -17.92264175415039, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -24.900911331176758, "logits_per_token": -2.560377393450056, "logits_per_char": -0.5431103561863755, "num_chars": 33}, {"sum_logits": -19.91188621520996, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -25.143409729003906, "logits_per_token": -2.488985776901245, "logits_per_char": -0.6222464442253113, "num_chars": 32}, {"sum_logits": -15.633984565734863, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -23.866085052490234, "logits_per_token": -1.954248070716858, "logits_per_char": -0.4598230754627901, "num_chars": 34}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 828, "native_id": "Mercury_7216423", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.090214729309082, "incorrect_loss_raw": 12.296812693277994, "correct_loss_per_char": 0.6233435585385277, "incorrect_loss_per_char": 0.7895215871226028, "correct_loss_per_token": 6.545107364654541, "incorrect_loss_per_token": 6.00279426574707, "correct_loss_uncond": -6.934012413024902, "incorrect_loss_uncond": -9.633979797363281}, "model_output": [{"sum_logits": -21.69710922241211, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -31.411823272705078, "logits_per_token": -7.232369740804036, "logits_per_char": -0.943352574887483, "num_chars": 23}, {"sum_logits": -13.090214729309082, "num_tokens": 2, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -20.024227142333984, "logits_per_token": -6.545107364654541, "logits_per_char": -0.6233435585385277, "num_chars": 21}, {"sum_logits": -8.567355155944824, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -13.285404205322266, "logits_per_token": -8.567355155944824, "logits_per_char": -0.951928350660536, "num_chars": 9}, {"sum_logits": -6.625973701477051, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -21.095149993896484, "logits_per_token": -2.20865790049235, "logits_per_char": -0.47328383581978933, "num_chars": 14}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 829, "native_id": "Mercury_416633", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.427778244018555, "incorrect_loss_raw": 4.702413082122803, "correct_loss_per_char": 0.3689815203348796, "incorrect_loss_per_char": 0.4180242485470242, "correct_loss_per_token": 2.2138891220092773, "incorrect_loss_per_token": 3.2961747646331787, "correct_loss_uncond": -9.651744842529297, "incorrect_loss_uncond": -10.87522300084432}, "model_output": [{"sum_logits": -4.427778244018555, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -14.079523086547852, "logits_per_token": -2.2138891220092773, "logits_per_char": -0.3689815203348796, "num_chars": 12}, {"sum_logits": -4.321730136871338, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -15.459976196289062, "logits_per_token": -2.160865068435669, "logits_per_char": -0.2881153424580892, "num_chars": 15}, {"sum_logits": -5.669809341430664, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -14.022527694702148, "logits_per_token": -5.669809341430664, "logits_per_char": -0.708726167678833, "num_chars": 8}, {"sum_logits": -4.115699768066406, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.250404357910156, "logits_per_token": -2.057849884033203, "logits_per_char": -0.2572312355041504, "num_chars": 16}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 830, "native_id": "Mercury_7038518", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.59918975830078, "incorrect_loss_raw": 14.558698972066244, "correct_loss_per_char": 0.6149797439575195, "incorrect_loss_per_char": 0.6926560854266226, "correct_loss_per_token": 4.099864959716797, "incorrect_loss_per_token": 3.7503177854749894, "correct_loss_uncond": -3.5013275146484375, "incorrect_loss_uncond": -6.365849494934082}, "model_output": [{"sum_logits": -19.066402435302734, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -18.609149932861328, "logits_per_token": -6.355467478434245, "logits_per_char": -1.1215530844295727, "num_chars": 17}, {"sum_logits": -9.52644157409668, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -20.224218368530273, "logits_per_token": -2.38161039352417, "logits_per_char": -0.4536400749569848, "num_chars": 21}, {"sum_logits": -15.083252906799316, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -23.940277099609375, "logits_per_token": -2.5138754844665527, "logits_per_char": -0.5027750968933106, "num_chars": 30}, {"sum_logits": -24.59918975830078, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -28.10051727294922, "logits_per_token": -4.099864959716797, "logits_per_char": -0.6149797439575195, "num_chars": 40}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 831, "native_id": "Mercury_7085225", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.246770858764648, "incorrect_loss_raw": 6.218588034311931, "correct_loss_per_char": 0.905846357345581, "incorrect_loss_per_char": 0.718720120010954, "correct_loss_per_token": 3.623385429382324, "incorrect_loss_per_token": 4.21675968170166, "correct_loss_uncond": -9.176189422607422, "incorrect_loss_uncond": -9.246239821116129}, "model_output": [{"sum_logits": -6.64479398727417, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -12.918476104736328, "logits_per_token": -6.64479398727417, "logits_per_char": -0.8305992484092712, "num_chars": 8}, {"sum_logits": -7.246770858764648, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -16.42296028137207, "logits_per_token": -3.623385429382324, "logits_per_char": -0.905846357345581, "num_chars": 8}, {"sum_logits": -6.853872299194336, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -16.67954444885254, "logits_per_token": -3.426936149597168, "logits_per_char": -0.856734037399292, "num_chars": 8}, {"sum_logits": -5.157097816467285, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -16.796463012695312, "logits_per_token": -2.5785489082336426, "logits_per_char": -0.46882707422429865, "num_chars": 11}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 832, "native_id": "LEAP__4_10225", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 8.357096672058105, "incorrect_loss_raw": 12.9979248046875, "correct_loss_per_char": 0.309522098965115, "incorrect_loss_per_char": 0.4270185304209617, "correct_loss_per_token": 1.3928494453430176, "incorrect_loss_per_token": 1.856846400669643, "correct_loss_uncond": -11.227171897888184, "incorrect_loss_uncond": -7.559861501057942}, "model_output": [{"sum_logits": -8.351551055908203, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -18.457138061523438, "logits_per_token": -1.1930787222726005, "logits_per_char": -0.2694048727712324, "num_chars": 31}, {"sum_logits": -8.357096672058105, "num_tokens": 6, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -19.58426856994629, "logits_per_token": -1.3928494453430176, "logits_per_char": -0.309522098965115, "num_chars": 27}, {"sum_logits": -16.729129791259766, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -22.11202621459961, "logits_per_token": -2.389875684465681, "logits_per_char": -0.5768665445261988, "num_chars": 29}, {"sum_logits": -13.913093566894531, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -21.10419464111328, "logits_per_token": -1.9875847952706474, "logits_per_char": -0.4347841739654541, "num_chars": 32}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 833, "native_id": "Mercury_SC_401661", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.772716522216797, "incorrect_loss_raw": 13.717846552530924, "correct_loss_per_char": 0.6386358261108398, "incorrect_loss_per_char": 0.6435181229059834, "correct_loss_per_token": 4.257572174072266, "incorrect_loss_per_token": 3.646551900439792, "correct_loss_uncond": -8.44991683959961, "incorrect_loss_uncond": -10.168833414713541}, "model_output": [{"sum_logits": -13.86353588104248, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -23.912172317504883, "logits_per_token": -3.46588397026062, "logits_per_char": -0.7701964378356934, "num_chars": 18}, {"sum_logits": -7.815249443054199, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -20.516841888427734, "logits_per_token": -2.605083147684733, "logits_per_char": -0.41132891805548416, "num_chars": 19}, {"sum_logits": -19.474754333496094, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -27.23102569580078, "logits_per_token": -4.868688583374023, "logits_per_char": -0.7490290128267728, "num_chars": 26}, {"sum_logits": -12.772716522216797, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.222633361816406, "logits_per_token": -4.257572174072266, "logits_per_char": -0.6386358261108398, "num_chars": 20}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 834, "native_id": "TIMSS_1995_8_Q15", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.30521297454834, "incorrect_loss_raw": 9.098140557607016, "correct_loss_per_char": 0.6646580696105957, "incorrect_loss_per_char": 0.6998569659697704, "correct_loss_per_token": 3.10173765818278, "incorrect_loss_per_token": 3.032713519202338, "correct_loss_uncond": -4.316872596740723, "incorrect_loss_uncond": -7.882794539133708}, "model_output": [{"sum_logits": -9.30521297454834, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -13.622085571289062, "logits_per_token": -3.10173765818278, "logits_per_char": -0.6646580696105957, "num_chars": 14}, {"sum_logits": -10.960261344909668, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -19.866363525390625, "logits_per_token": -3.6534204483032227, "logits_per_char": -0.843097026531513, "num_chars": 13}, {"sum_logits": -7.765157222747803, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -14.905150413513184, "logits_per_token": -2.588385740915934, "logits_per_char": -0.5973197863652155, "num_chars": 13}, {"sum_logits": -8.569003105163574, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -16.17129135131836, "logits_per_token": -2.856334368387858, "logits_per_char": -0.6591540850125827, "num_chars": 13}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 835, "native_id": "MCAS_1999_4_23", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 16.876577377319336, "incorrect_loss_raw": 26.024651845296223, "correct_loss_per_char": 0.3309132819082223, "incorrect_loss_per_char": 0.566465394443139, "correct_loss_per_token": 1.5342343070290305, "incorrect_loss_per_token": 2.6675782983953304, "correct_loss_uncond": -12.129467010498047, "incorrect_loss_uncond": -11.182050069173178}, "model_output": [{"sum_logits": -16.876577377319336, "num_tokens": 11, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -29.006044387817383, "logits_per_token": -1.5342343070290305, "logits_per_char": -0.3309132819082223, "num_chars": 51}, {"sum_logits": -34.418540954589844, "num_tokens": 11, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -43.39226150512695, "logits_per_token": -3.1289582685990767, "logits_per_char": -0.6883708190917969, "num_chars": 50}, {"sum_logits": -23.326007843017578, "num_tokens": 10, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -36.73505401611328, "logits_per_token": -2.332600784301758, "logits_per_char": -0.4760409763881138, "num_chars": 49}, {"sum_logits": -20.32940673828125, "num_tokens": 8, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -31.49279022216797, "logits_per_token": -2.5411758422851562, "logits_per_char": -0.5349843878495065, "num_chars": 38}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 836, "native_id": "TIMSS_1995_8_J7", "metrics": {"predicted_index_raw": 4, "predicted_index_per_token": 4, "predicted_index_per_char": 4, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 40.427085876464844, "incorrect_loss_raw": 18.91752338409424, "correct_loss_per_char": 0.39249597938315384, "incorrect_loss_per_char": 0.325959323363006, "correct_loss_per_token": 2.378063875086167, "incorrect_loss_per_token": 1.522662688524295, "correct_loss_uncond": -12.829238891601562, "incorrect_loss_uncond": -13.980563163757324}, "model_output": [{"sum_logits": -18.194538116455078, "num_tokens": 13, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -32.761444091796875, "logits_per_token": -1.3995798551119292, "logits_per_char": -0.2888021923246838, "num_chars": 63}, {"sum_logits": -22.444669723510742, "num_tokens": 12, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -34.23464584350586, "logits_per_token": -1.8703891436258953, "logits_per_char": -0.4080849040638317, "num_chars": 55}, {"sum_logits": -19.655725479125977, "num_tokens": 12, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -35.13142013549805, "logits_per_token": -1.637977123260498, "logits_per_char": -0.3222250078545242, "num_chars": 61}, {"sum_logits": -40.427085876464844, "num_tokens": 17, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -53.256324768066406, "logits_per_token": -2.378063875086167, "logits_per_char": -0.39249597938315384, "num_chars": 103}, {"sum_logits": -15.375160217285156, "num_tokens": 13, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -29.46483612060547, "logits_per_token": -1.1827046320988581, "logits_per_char": -0.2847251892089844, "num_chars": 54}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 837, "native_id": "Mercury_SC_LBS10018", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 1.8820204734802246, "incorrect_loss_raw": 8.120880126953125, "correct_loss_per_char": 0.3136700789133708, "incorrect_loss_per_char": 0.6483681433647663, "correct_loss_per_token": 0.9410102367401123, "incorrect_loss_per_token": 2.623850425084432, "correct_loss_uncond": -14.344313144683838, "incorrect_loss_uncond": -12.980504035949707}, "model_output": [{"sum_logits": -1.8820204734802246, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.226333618164062, "logits_per_token": -0.9410102367401123, "logits_per_char": -0.3136700789133708, "num_chars": 6}, {"sum_logits": -7.123564720153809, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.213923454284668, "logits_per_token": -3.5617823600769043, "logits_per_char": -1.0176521028791154, "num_chars": 7}, {"sum_logits": -10.353741645812988, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -25.55978012084961, "logits_per_token": -2.588435411453247, "logits_per_char": -0.5449337708322626, "num_chars": 19}, {"sum_logits": -6.885334014892578, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -23.53044891357422, "logits_per_token": -1.7213335037231445, "logits_per_char": -0.382518556382921, "num_chars": 18}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 838, "native_id": "Mercury_SC_406855", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.801097869873047, "incorrect_loss_raw": 28.405197779337566, "correct_loss_per_char": 0.7200274467468262, "incorrect_loss_per_char": 0.7598515747192328, "correct_loss_per_token": 2.6182816245339136, "incorrect_loss_per_token": 3.377950253310027, "correct_loss_uncond": -11.687602996826172, "incorrect_loss_uncond": -10.628798166910807}, "model_output": [{"sum_logits": -37.30308532714844, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -50.11851501464844, "logits_per_token": -4.144787258572048, "logits_per_char": -0.9816601401881168, "num_chars": 38}, {"sum_logits": -18.848604202270508, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -29.473859786987305, "logits_per_token": -2.3560755252838135, "logits_per_char": -0.5890188813209534, "num_chars": 32}, {"sum_logits": -29.06390380859375, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -37.509613037109375, "logits_per_token": -3.6329879760742188, "logits_per_char": -0.7088757026486281, "num_chars": 41}, {"sum_logits": -28.801097869873047, "num_tokens": 11, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -40.48870086669922, "logits_per_token": -2.6182816245339136, "logits_per_char": -0.7200274467468262, "num_chars": 40}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 839, "native_id": "Mercury_SC_415457", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.942399978637695, "incorrect_loss_raw": 4.382819175720215, "correct_loss_per_char": 0.7961599985758464, "incorrect_loss_per_char": 0.3200260897678157, "correct_loss_per_token": 3.980799992879232, "incorrect_loss_per_token": 1.6652594672309027, "correct_loss_uncond": -11.66642951965332, "incorrect_loss_uncond": -11.779729525248209}, "model_output": [{"sum_logits": -4.131930351257324, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -17.119342803955078, "logits_per_token": -1.3773101170857747, "logits_per_char": -0.22955168618096244, "num_chars": 18}, {"sum_logits": -3.677755355834961, "num_tokens": 2, "num_tokens_all": 179, "is_greedy": true, "sum_logits_uncond": -15.346242904663086, "logits_per_token": -1.8388776779174805, "logits_per_char": -0.2451836903889974, "num_chars": 15}, {"sum_logits": -5.338771820068359, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -16.02206039428711, "logits_per_token": -1.7795906066894531, "logits_per_char": -0.48534289273348724, "num_chars": 11}, {"sum_logits": -11.942399978637695, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -23.608829498291016, "logits_per_token": -3.980799992879232, "logits_per_char": -0.7961599985758464, "num_chars": 15}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 840, "native_id": "NYSEDREGENTS_2015_4_25", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 18.63137435913086, "incorrect_loss_raw": 22.541765213012695, "correct_loss_per_char": 0.5175381766425239, "incorrect_loss_per_char": 0.6261601448059082, "correct_loss_per_token": 2.6616249084472656, "incorrect_loss_per_token": 3.220252173287528, "correct_loss_uncond": -15.013572692871094, "incorrect_loss_uncond": -15.661319096883139}, "model_output": [{"sum_logits": -21.362016677856445, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -37.78384017944336, "logits_per_token": -3.0517166682652066, "logits_per_char": -0.5933893521626791, "num_chars": 36}, {"sum_logits": -23.31836700439453, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -39.26555633544922, "logits_per_token": -3.331195286342076, "logits_per_char": -0.6477324167887369, "num_chars": 36}, {"sum_logits": -18.63137435913086, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -33.64494705200195, "logits_per_token": -2.6616249084472656, "logits_per_char": -0.5175381766425239, "num_chars": 36}, {"sum_logits": -22.94491195678711, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -37.55985641479492, "logits_per_token": -3.277844565255301, "logits_per_char": -0.6373586654663086, "num_chars": 36}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 841, "native_id": "Mercury_7058135", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.88631534576416, "incorrect_loss_raw": 8.745881716410318, "correct_loss_per_char": 0.24289470911026, "incorrect_loss_per_char": 0.4364530411977617, "correct_loss_per_token": 0.97157883644104, "incorrect_loss_per_token": 2.0219487031300862, "correct_loss_uncond": -19.33512020111084, "incorrect_loss_uncond": -14.844394365946451}, "model_output": [{"sum_logits": -4.591987609863281, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -19.95916748046875, "logits_per_token": -1.1479969024658203, "logits_per_char": -0.2551104227701823, "num_chars": 18}, {"sum_logits": -3.88631534576416, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -23.221435546875, "logits_per_token": -0.97157883644104, "logits_per_char": -0.24289470911026, "num_chars": 16}, {"sum_logits": -11.774353981018066, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -24.712303161621094, "logits_per_token": -2.9435884952545166, "logits_per_char": -0.5606835229056222, "num_chars": 21}, {"sum_logits": -9.87130355834961, "num_tokens": 5, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -26.09935760498047, "logits_per_token": -1.9742607116699218, "logits_per_char": -0.49356517791748045, "num_chars": 20}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 842, "native_id": "MDSA_2008_4_19", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 3.7969226837158203, "incorrect_loss_raw": 5.795483589172363, "correct_loss_per_char": 0.37969226837158204, "incorrect_loss_per_char": 0.4994896941714817, "correct_loss_per_token": 1.8984613418579102, "incorrect_loss_per_token": 2.5044679641723633, "correct_loss_uncond": -14.451574325561523, "incorrect_loss_uncond": -11.737554868062338}, "model_output": [{"sum_logits": -5.817811965942383, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -16.869792938232422, "logits_per_token": -2.9089059829711914, "logits_per_char": -0.7272264957427979, "num_chars": 8}, {"sum_logits": -3.7969226837158203, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": true, "sum_logits_uncond": -18.248497009277344, "logits_per_token": -1.8984613418579102, "logits_per_char": -0.37969226837158204, "num_chars": 10}, {"sum_logits": -4.489709854125977, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -16.905935287475586, "logits_per_token": -2.2448549270629883, "logits_per_char": -0.2993139902750651, "num_chars": 15}, {"sum_logits": -7.0789289474487305, "num_tokens": 3, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -18.823387145996094, "logits_per_token": -2.35964298248291, "logits_per_char": -0.47192859649658203, "num_chars": 15}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 843, "native_id": "AKDE&ED_2008_8_45", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.1698055267334, "incorrect_loss_raw": 24.245569229125977, "correct_loss_per_char": 0.48339611053466797, "incorrect_loss_per_char": 0.5006190781690636, "correct_loss_per_token": 2.685533947414822, "incorrect_loss_per_token": 2.608640374077691, "correct_loss_uncond": -15.854307174682617, "incorrect_loss_uncond": -16.126312891642254}, "model_output": [{"sum_logits": -23.034175872802734, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -38.40934371948242, "logits_per_token": -2.3034175872802733, "logits_per_char": -0.47987866401672363, "num_chars": 48}, {"sum_logits": -17.972070693969727, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -38.217315673828125, "logits_per_token": -1.996896743774414, "logits_per_char": -0.37441813945770264, "num_chars": 48}, {"sum_logits": -31.73046112060547, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -44.48898696899414, "logits_per_token": -3.5256067911783853, "logits_per_char": -0.6475604310327646, "num_chars": 49}, {"sum_logits": -24.1698055267334, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -40.024112701416016, "logits_per_token": -2.685533947414822, "logits_per_char": -0.48339611053466797, "num_chars": 50}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 844, "native_id": "Mercury_7131758", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 3.9969372749328613, "incorrect_loss_raw": 4.549931526184082, "correct_loss_per_char": 0.24980857968330383, "incorrect_loss_per_char": 0.2668919260762319, "correct_loss_per_token": 1.9984686374664307, "incorrect_loss_per_token": 2.274965763092041, "correct_loss_uncond": -14.229194164276123, "incorrect_loss_uncond": -12.054152488708496}, "model_output": [{"sum_logits": -3.9969372749328613, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -18.226131439208984, "logits_per_token": -1.9984686374664307, "logits_per_char": -0.24980857968330383, "num_chars": 16}, {"sum_logits": -3.844143867492676, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -15.8555908203125, "logits_per_token": -1.922071933746338, "logits_per_char": -0.27458170482090544, "num_chars": 14}, {"sum_logits": -4.058640956878662, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -16.914688110351562, "logits_per_token": -2.029320478439331, "logits_per_char": -0.23874358569874482, "num_chars": 17}, {"sum_logits": -5.747009754180908, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -17.041973114013672, "logits_per_token": -2.873504877090454, "logits_per_char": -0.2873504877090454, "num_chars": 20}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 845, "native_id": "NYSEDREGENTS_2013_8_10", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.108158588409424, "incorrect_loss_raw": 5.851883411407471, "correct_loss_per_char": 0.6108158588409424, "incorrect_loss_per_char": 0.528663184092595, "correct_loss_per_token": 2.036052862803141, "incorrect_loss_per_token": 5.851883411407471, "correct_loss_uncond": -9.183188915252686, "incorrect_loss_uncond": -8.581531047821045}, "model_output": [{"sum_logits": -7.348270416259766, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -14.626947402954102, "logits_per_token": -7.348270416259766, "logits_per_char": -0.5652515704815204, "num_chars": 13}, {"sum_logits": -6.108158588409424, "num_tokens": 3, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -15.29134750366211, "logits_per_token": -2.036052862803141, "logits_per_char": -0.6108158588409424, "num_chars": 10}, {"sum_logits": -3.988924503326416, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -13.774347305297852, "logits_per_token": -3.988924503326416, "logits_per_char": -0.3988924503326416, "num_chars": 10}, {"sum_logits": -6.2184553146362305, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -14.898948669433594, "logits_per_token": -6.2184553146362305, "logits_per_char": -0.6218455314636231, "num_chars": 10}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 846, "native_id": "Mercury_SC_401783", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.473746299743652, "incorrect_loss_raw": 17.429203669230144, "correct_loss_per_char": 0.2989498519897461, "incorrect_loss_per_char": 0.7150550353221404, "correct_loss_per_token": 1.868436574935913, "incorrect_loss_per_token": 3.67954527537028, "correct_loss_uncond": -13.44623851776123, "incorrect_loss_uncond": -8.9212277730306}, "model_output": [{"sum_logits": -11.622272491455078, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -24.7491455078125, "logits_per_token": -2.9055681228637695, "logits_per_char": -0.581113624572754, "num_chars": 20}, {"sum_logits": -25.207305908203125, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -28.205087661743164, "logits_per_token": -5.041461181640625, "logits_per_char": -0.9695117657001202, "num_chars": 26}, {"sum_logits": -15.458032608032227, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -26.097061157226562, "logits_per_token": -3.0916065216064452, "logits_per_char": -0.5945397156935471, "num_chars": 26}, {"sum_logits": -7.473746299743652, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -20.919984817504883, "logits_per_token": -1.868436574935913, "logits_per_char": -0.2989498519897461, "num_chars": 25}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 847, "native_id": "Mercury_7190120", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 21.558673858642578, "incorrect_loss_raw": 21.434865315755207, "correct_loss_per_char": 0.501364508340525, "incorrect_loss_per_char": 0.518451243817155, "correct_loss_per_token": 2.395408206515842, "incorrect_loss_per_token": 2.65811486420808, "correct_loss_uncond": -11.297725677490234, "incorrect_loss_uncond": -8.122685750325521}, "model_output": [{"sum_logits": -19.708595275878906, "num_tokens": 6, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -26.56418228149414, "logits_per_token": -3.284765879313151, "logits_per_char": -0.5972301598751184, "num_chars": 33}, {"sum_logits": -21.558673858642578, "num_tokens": 9, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -32.85639953613281, "logits_per_token": -2.395408206515842, "logits_per_char": -0.501364508340525, "num_chars": 43}, {"sum_logits": -20.698078155517578, "num_tokens": 9, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -29.65298843383789, "logits_per_token": -2.2997864617241754, "logits_per_char": -0.47041086717085406, "num_chars": 44}, {"sum_logits": -23.89792251586914, "num_tokens": 10, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -32.455482482910156, "logits_per_token": -2.389792251586914, "logits_per_char": -0.4877127044054927, "num_chars": 49}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 848, "native_id": "Mercury_409317", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.890649795532227, "incorrect_loss_raw": 9.087164878845215, "correct_loss_per_char": 0.9146653688870944, "incorrect_loss_per_char": 0.7570195135073599, "correct_loss_per_token": 1.9817749659220378, "incorrect_loss_per_token": 1.6138679610358342, "correct_loss_uncond": -19.319074630737305, "incorrect_loss_uncond": -17.260998725891113}, "model_output": [{"sum_logits": -8.344670295715332, "num_tokens": 6, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -21.909099578857422, "logits_per_token": -1.390778382619222, "logits_per_char": -0.7586063905195757, "num_chars": 11}, {"sum_logits": -11.890649795532227, "num_tokens": 6, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -31.20972442626953, "logits_per_token": -1.9817749659220378, "logits_per_char": -0.9146653688870944, "num_chars": 13}, {"sum_logits": -9.976181030273438, "num_tokens": 6, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -29.383209228515625, "logits_per_token": -1.6626968383789062, "logits_per_char": -0.7673985407902644, "num_chars": 13}, {"sum_logits": -8.940643310546875, "num_tokens": 5, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -27.752182006835938, "logits_per_token": -1.788128662109375, "logits_per_char": -0.7450536092122396, "num_chars": 12}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 849, "native_id": "Mercury_7268240", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 13.707928657531738, "incorrect_loss_raw": 21.43932565053304, "correct_loss_per_char": 0.2141863852739334, "incorrect_loss_per_char": 0.458354275335397, "correct_loss_per_token": 1.3707928657531738, "incorrect_loss_per_token": 2.7167286122286765, "correct_loss_uncond": -14.300023078918457, "incorrect_loss_uncond": -12.917679150899252}, "model_output": [{"sum_logits": -14.49112319946289, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -25.199142456054688, "logits_per_token": -2.415187199910482, "logits_per_char": -0.35344202925519247, "num_chars": 41}, {"sum_logits": -14.305071830749512, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.316261291503906, "logits_per_token": -1.788133978843689, "logits_per_char": -0.3251152688806707, "num_chars": 44}, {"sum_logits": -35.52178192138672, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -44.55561065673828, "logits_per_token": -3.9468646579318576, "logits_per_char": -0.6965055278703278, "num_chars": 51}, {"sum_logits": -13.707928657531738, "num_tokens": 10, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -28.007951736450195, "logits_per_token": -1.3707928657531738, "logits_per_char": -0.2141863852739334, "num_chars": 64}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 850, "native_id": "Mercury_7228358", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 14.986539840698242, "incorrect_loss_raw": 27.44016456604004, "correct_loss_per_char": 0.42818685259137834, "incorrect_loss_per_char": 0.5864873035815762, "correct_loss_per_token": 2.4977566401163735, "incorrect_loss_per_token": 2.851549846154672, "correct_loss_uncond": -16.34410858154297, "incorrect_loss_uncond": -18.080319086710613}, "model_output": [{"sum_logits": -29.913726806640625, "num_tokens": 9, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -53.86166000366211, "logits_per_token": -3.3237474229600696, "logits_per_char": -0.5539579038266782, "num_chars": 54}, {"sum_logits": -20.728116989135742, "num_tokens": 8, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -38.714210510253906, "logits_per_token": -2.5910146236419678, "logits_per_char": -0.5314901792086087, "num_chars": 39}, {"sum_logits": -31.67864990234375, "num_tokens": 12, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -43.98558044433594, "logits_per_token": -2.639887491861979, "logits_per_char": -0.6740138277094415, "num_chars": 47}, {"sum_logits": -14.986539840698242, "num_tokens": 6, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -31.33064842224121, "logits_per_token": -2.4977566401163735, "logits_per_char": -0.42818685259137834, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 851, "native_id": "MCAS_2004_5_33", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 27.615522384643555, "incorrect_loss_raw": 24.39638551076253, "correct_loss_per_char": 0.5753233830134074, "incorrect_loss_per_char": 0.509066086703528, "correct_loss_per_token": 3.068391376071506, "incorrect_loss_per_token": 2.547701701916084, "correct_loss_uncond": -15.077390670776367, "incorrect_loss_uncond": -10.646351178487143}, "model_output": [{"sum_logits": -13.601790428161621, "num_tokens": 6, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -27.794145584106445, "logits_per_token": -2.26696507136027, "logits_per_char": -0.42505595088005066, "num_chars": 32}, {"sum_logits": -28.22146987915039, "num_tokens": 9, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -33.293540954589844, "logits_per_token": -3.1357188754611545, "logits_per_char": -0.5879472891489664, "num_chars": 48}, {"sum_logits": -27.615522384643555, "num_tokens": 9, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -42.69291305541992, "logits_per_token": -3.068391376071506, "logits_per_char": -0.5753233830134074, "num_chars": 48}, {"sum_logits": -31.365896224975586, "num_tokens": 14, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -44.040523529052734, "logits_per_token": -2.2404211589268277, "logits_per_char": -0.514195020081567, "num_chars": 61}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 852, "native_id": "Mercury_7008855", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.518336772918701, "incorrect_loss_raw": 14.07329273223877, "correct_loss_per_char": 0.1970834561756679, "incorrect_loss_per_char": 0.5210357868776293, "correct_loss_per_token": 1.1036673545837403, "incorrect_loss_per_token": 2.186270073481969, "correct_loss_uncond": -15.430426120758057, "incorrect_loss_uncond": -12.640110969543457}, "model_output": [{"sum_logits": -12.48801326751709, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -25.136920928955078, "logits_per_token": -2.497602653503418, "logits_per_char": -0.6244006633758545, "num_chars": 20}, {"sum_logits": -5.518336772918701, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -20.948762893676758, "logits_per_token": -1.1036673545837403, "logits_per_char": -0.1970834561756679, "num_chars": 28}, {"sum_logits": -19.304569244384766, "num_tokens": 7, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -31.456663131713867, "logits_per_token": -2.757795606340681, "logits_per_char": -0.622728040141444, "num_chars": 31}, {"sum_logits": -10.427295684814453, "num_tokens": 8, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -23.546627044677734, "logits_per_token": -1.3034119606018066, "logits_per_char": -0.3159786571155895, "num_chars": 33}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 853, "native_id": "Mercury_7057085", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 14.593374252319336, "incorrect_loss_raw": 13.277091344197592, "correct_loss_per_char": 0.5612836250892053, "incorrect_loss_per_char": 0.5498183096261849, "correct_loss_per_token": 2.4322290420532227, "incorrect_loss_per_token": 3.6099991268581815, "correct_loss_uncond": -12.575111389160156, "incorrect_loss_uncond": -11.211905797322592}, "model_output": [{"sum_logits": -12.443452835083008, "num_tokens": 3, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -24.790306091308594, "logits_per_token": -4.147817611694336, "logits_per_char": -0.518477201461792, "num_chars": 24}, {"sum_logits": -14.593374252319336, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -27.168485641479492, "logits_per_token": -2.4322290420532227, "logits_per_char": -0.5612836250892053, "num_chars": 26}, {"sum_logits": -18.35320472717285, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -27.83706283569336, "logits_per_token": -3.6706409454345703, "logits_per_char": -0.6554715973990304, "num_chars": 28}, {"sum_logits": -9.034616470336914, "num_tokens": 3, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -20.839622497558594, "logits_per_token": -3.011538823445638, "logits_per_char": -0.47550613001773234, "num_chars": 19}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 854, "native_id": "Mercury_7171728", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 18.098575592041016, "incorrect_loss_raw": 23.15342140197754, "correct_loss_per_char": 3.619715118408203, "incorrect_loss_per_char": 4.375836881001791, "correct_loss_per_token": 9.049287796020508, "incorrect_loss_per_token": 10.255623605516222, "correct_loss_uncond": 2.1024551391601562, "incorrect_loss_uncond": 6.569403330485026}, "model_output": [{"sum_logits": -22.93626594543457, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -17.746299743652344, "logits_per_token": -11.468132972717285, "logits_per_char": -3.8227109909057617, "num_chars": 6}, {"sum_logits": -18.098575592041016, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -15.99612045288086, "logits_per_token": -9.049287796020508, "logits_per_char": -3.619715118408203, "num_chars": 5}, {"sum_logits": -22.744430541992188, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -15.411230087280273, "logits_per_token": -11.372215270996094, "logits_per_char": -4.548886108398437, "num_chars": 5}, {"sum_logits": -23.77956771850586, "num_tokens": 3, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -16.594524383544922, "logits_per_token": -7.926522572835286, "logits_per_char": -4.755913543701172, "num_chars": 5}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 855, "native_id": "NAEP_2005_4_S14+3", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 73.22401428222656, "incorrect_loss_raw": 46.33436075846354, "correct_loss_per_char": 0.3793990377317438, "incorrect_loss_per_char": 0.3442008887923447, "correct_loss_per_token": 1.6272003173828125, "incorrect_loss_per_token": 1.6507463696086877, "correct_loss_uncond": -50.22984313964844, "incorrect_loss_uncond": -42.85740407307943}, "model_output": [{"sum_logits": -48.595611572265625, "num_tokens": 35, "num_tokens_all": 244, "is_greedy": false, "sum_logits_uncond": -104.80728149414062, "logits_per_token": -1.388446044921875, "logits_per_char": -0.33058239164806547, "num_chars": 147}, {"sum_logits": -43.142181396484375, "num_tokens": 23, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -67.54086303710938, "logits_per_token": -1.8757470172384512, "logits_per_char": -0.3595181783040365, "num_chars": 120}, {"sum_logits": -73.22401428222656, "num_tokens": 45, "num_tokens_all": 254, "is_greedy": false, "sum_logits_uncond": -123.453857421875, "logits_per_token": -1.6272003173828125, "logits_per_char": -0.3793990377317438, "num_chars": 193}, {"sum_logits": -47.265289306640625, "num_tokens": 28, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -95.2271499633789, "logits_per_token": -1.6880460466657365, "logits_per_char": -0.34250209642493207, "num_chars": 138}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 856, "native_id": "Mercury_7024395", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 7.718104362487793, "incorrect_loss_raw": 11.881746292114258, "correct_loss_per_char": 0.24897110846734816, "incorrect_loss_per_char": 0.29585192820198747, "correct_loss_per_token": 1.5436208724975586, "incorrect_loss_per_token": 1.5549195951885648, "correct_loss_uncond": -15.7805814743042, "incorrect_loss_uncond": -27.954688390096027}, "model_output": [{"sum_logits": -7.718104362487793, "num_tokens": 5, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -23.498685836791992, "logits_per_token": -1.5436208724975586, "logits_per_char": -0.24897110846734816, "num_chars": 31}, {"sum_logits": -12.249528884887695, "num_tokens": 6, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -41.087242126464844, "logits_per_token": -2.0415881474812827, "logits_per_char": -0.3402646912468804, "num_chars": 36}, {"sum_logits": -11.343985557556152, "num_tokens": 8, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -40.64708709716797, "logits_per_token": -1.417998194694519, "logits_per_char": -0.29087142455272186, "num_chars": 39}, {"sum_logits": -12.051724433898926, "num_tokens": 10, "num_tokens_all": 241, "is_greedy": false, "sum_logits_uncond": -37.77497482299805, "logits_per_token": -1.2051724433898925, "logits_per_char": -0.25641966880636013, "num_chars": 47}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 857, "native_id": "NYSEDREGENTS_2012_8_28", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 34.63617706298828, "incorrect_loss_raw": 35.1952870686849, "correct_loss_per_char": 0.6185031618390765, "incorrect_loss_per_char": 0.6129017624869252, "correct_loss_per_token": 2.886348088582357, "incorrect_loss_per_token": 2.7750101497030664, "correct_loss_uncond": -11.601356506347656, "incorrect_loss_uncond": -14.1164182027181}, "model_output": [{"sum_logits": -31.674415588378906, "num_tokens": 12, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -45.72919464111328, "logits_per_token": -2.6395346323649087, "logits_per_char": -0.5865632516366465, "num_chars": 54}, {"sum_logits": -35.29521942138672, "num_tokens": 13, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -50.70185852050781, "logits_per_token": -2.715016878568209, "logits_per_char": -0.6085382658859779, "num_chars": 58}, {"sum_logits": -34.63617706298828, "num_tokens": 12, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -46.23753356933594, "logits_per_token": -2.886348088582357, "logits_per_char": -0.6185031618390765, "num_chars": 56}, {"sum_logits": -38.61622619628906, "num_tokens": 13, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -51.50406265258789, "logits_per_token": -2.9704789381760817, "logits_per_char": -0.643603769938151, "num_chars": 60}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 858, "native_id": "Mercury_7090790", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.874041557312012, "incorrect_loss_raw": 12.105579694112143, "correct_loss_per_char": 0.270780086517334, "incorrect_loss_per_char": 1.0245590134272498, "correct_loss_per_token": 2.437020778656006, "incorrect_loss_per_token": 5.0561891661749945, "correct_loss_uncond": -12.782439231872559, "incorrect_loss_uncond": -9.733091672261557}, "model_output": [{"sum_logits": -4.874041557312012, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -17.65648078918457, "logits_per_token": -2.437020778656006, "logits_per_char": -0.270780086517334, "num_chars": 18}, {"sum_logits": -4.719822883605957, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -17.06918716430664, "logits_per_token": -2.3599114418029785, "logits_per_char": -0.33713020597185406, "num_chars": 14}, {"sum_logits": -13.658103942871094, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -22.288681030273438, "logits_per_token": -6.829051971435547, "logits_per_char": -1.2416458129882812, "num_chars": 11}, {"sum_logits": -17.938812255859375, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -26.158145904541016, "logits_per_token": -5.979604085286458, "logits_per_char": -1.4949010213216145, "num_chars": 12}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 859, "native_id": "TIMSS_2003_8_pg87", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 4.958202838897705, "incorrect_loss_raw": 5.527944087982178, "correct_loss_per_char": 0.30988767743110657, "incorrect_loss_per_char": 0.3471266939749125, "correct_loss_per_token": 1.2395507097244263, "incorrect_loss_per_token": 1.3819860219955444, "correct_loss_uncond": -15.3852858543396, "incorrect_loss_uncond": -14.098718484242758}, "model_output": [{"sum_logits": -5.640619277954102, "num_tokens": 4, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -20.82700538635254, "logits_per_token": -1.4101548194885254, "logits_per_char": -0.3760412851969401, "num_chars": 15}, {"sum_logits": -4.958202838897705, "num_tokens": 4, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -20.343488693237305, "logits_per_token": -1.2395507097244263, "logits_per_char": -0.30988767743110657, "num_chars": 16}, {"sum_logits": -5.880744934082031, "num_tokens": 4, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -18.939891815185547, "logits_per_token": -1.4701862335205078, "logits_per_char": -0.36754655838012695, "num_chars": 16}, {"sum_logits": -5.0624680519104, "num_tokens": 4, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -19.11309051513672, "logits_per_token": -1.2656170129776, "logits_per_char": -0.2977922383476706, "num_chars": 17}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 860, "native_id": "Mercury_SC_407382", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 9.279451370239258, "incorrect_loss_raw": 11.588235219319662, "correct_loss_per_char": 0.2577625380622016, "incorrect_loss_per_char": 0.40336967680189345, "correct_loss_per_token": 1.3256359100341797, "incorrect_loss_per_token": 1.8793609301249188, "correct_loss_uncond": -18.53544807434082, "incorrect_loss_uncond": -14.773489634195963}, "model_output": [{"sum_logits": -10.827136993408203, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -24.172115325927734, "logits_per_token": -2.165427398681641, "logits_per_char": -0.5155779520670573, "num_chars": 21}, {"sum_logits": -6.406124114990234, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -23.55557632446289, "logits_per_token": -1.281224822998047, "logits_per_char": -0.2562449645996094, "num_chars": 25}, {"sum_logits": -9.279451370239258, "num_tokens": 7, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -27.814899444580078, "logits_per_token": -1.3256359100341797, "logits_per_char": -0.2577625380622016, "num_chars": 36}, {"sum_logits": -17.531444549560547, "num_tokens": 8, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -31.35748291015625, "logits_per_token": -2.1914305686950684, "logits_per_char": -0.43828611373901366, "num_chars": 40}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 861, "native_id": "MDSA_2010_4_20", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.844017505645752, "incorrect_loss_raw": 8.965753873189291, "correct_loss_per_char": 0.3602114476655659, "incorrect_loss_per_char": 0.4151675557333326, "correct_loss_per_token": 2.281339168548584, "incorrect_loss_per_token": 2.1899588902791343, "correct_loss_uncond": -13.995604991912842, "incorrect_loss_uncond": -13.844874382019043}, "model_output": [{"sum_logits": -8.928182601928711, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -23.06952667236328, "logits_per_token": -2.9760608673095703, "logits_per_char": -0.6377273287091937, "num_chars": 14}, {"sum_logits": -6.844017505645752, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -20.839622497558594, "logits_per_token": -2.281339168548584, "logits_per_char": -0.3602114476655659, "num_chars": 19}, {"sum_logits": -7.661252975463867, "num_tokens": 5, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -21.434616088867188, "logits_per_token": -1.5322505950927734, "logits_per_char": -0.26418113708496094, "num_chars": 29}, {"sum_logits": -10.307826042175293, "num_tokens": 5, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -23.92774200439453, "logits_per_token": -2.0615652084350584, "logits_per_char": -0.3435942014058431, "num_chars": 30}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 862, "native_id": "Mercury_SC_405019", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 12.907222747802734, "incorrect_loss_raw": 15.827837626139322, "correct_loss_per_char": 0.234676777232777, "incorrect_loss_per_char": 0.3257483545663897, "correct_loss_per_token": 1.2907222747802733, "incorrect_loss_per_token": 1.756771676640146, "correct_loss_uncond": -18.048263549804688, "incorrect_loss_uncond": -20.52255376180013}, "model_output": [{"sum_logits": -16.155372619628906, "num_tokens": 9, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -36.46900939941406, "logits_per_token": -1.7950414021809895, "logits_per_char": -0.3590082804361979, "num_chars": 45}, {"sum_logits": -12.074771881103516, "num_tokens": 7, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -28.983470916748047, "logits_per_token": -1.7249674115862166, "logits_per_char": -0.27442663366144354, "num_chars": 44}, {"sum_logits": -19.253368377685547, "num_tokens": 11, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -43.59869384765625, "logits_per_token": -1.7503062161532315, "logits_per_char": -0.3438101496015276, "num_chars": 56}, {"sum_logits": -12.907222747802734, "num_tokens": 10, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -30.955486297607422, "logits_per_token": -1.2907222747802733, "logits_per_char": -0.234676777232777, "num_chars": 55}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 863, "native_id": "Mercury_7123078", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.8444242477417, "incorrect_loss_raw": 16.983167966206867, "correct_loss_per_char": 0.20461177825927734, "incorrect_loss_per_char": 0.428988655921702, "correct_loss_per_token": 1.08444242477417, "incorrect_loss_per_token": 2.0886021896644875, "correct_loss_uncond": -21.730908393859863, "incorrect_loss_uncond": -15.47144858042399}, "model_output": [{"sum_logits": -10.885510444641113, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -27.825557708740234, "logits_per_token": -1.8142517407735188, "logits_per_char": -0.3753624291255556, "num_chars": 29}, {"sum_logits": -20.431415557861328, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -32.991127014160156, "logits_per_token": -2.2701572842068143, "logits_per_char": -0.4441612077795941, "num_chars": 46}, {"sum_logits": -19.632577896118164, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -36.54716491699219, "logits_per_token": -2.1813975440131292, "logits_per_char": -0.4674423308599563, "num_chars": 42}, {"sum_logits": -10.8444242477417, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -32.57533264160156, "logits_per_token": -1.08444242477417, "logits_per_char": -0.20461177825927734, "num_chars": 53}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 864, "native_id": "Mercury_400084", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.163706064224243, "incorrect_loss_raw": 2.878461996714274, "correct_loss_per_char": 1.0818530321121216, "incorrect_loss_per_char": 1.439230998357137, "correct_loss_per_token": 2.163706064224243, "incorrect_loss_per_token": 2.878461996714274, "correct_loss_uncond": -3.6141021251678467, "incorrect_loss_uncond": -2.2580865224202475}, "model_output": [{"sum_logits": -3.1846389770507812, "num_tokens": 1, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -4.448035717010498, "logits_per_token": -3.1846389770507812, "logits_per_char": -1.5923194885253906, "num_chars": 2}, {"sum_logits": -2.3285529613494873, "num_tokens": 1, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -5.410497665405273, "logits_per_token": -2.3285529613494873, "logits_per_char": -1.1642764806747437, "num_chars": 2}, {"sum_logits": -3.1221940517425537, "num_tokens": 1, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -5.551112174987793, "logits_per_token": -3.1221940517425537, "logits_per_char": -1.5610970258712769, "num_chars": 2}, {"sum_logits": -2.163706064224243, "num_tokens": 1, "num_tokens_all": 232, "is_greedy": true, "sum_logits_uncond": -5.77780818939209, "logits_per_token": -2.163706064224243, "logits_per_char": -1.0818530321121216, "num_chars": 2}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 865, "native_id": "Mercury_7139650", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.5767571926116943, "incorrect_loss_raw": 3.0260624090830484, "correct_loss_per_char": 0.3576757192611694, "incorrect_loss_per_char": 0.2934013080029261, "correct_loss_per_token": 3.5767571926116943, "incorrect_loss_per_token": 3.0260624090830484, "correct_loss_uncond": -8.964395761489868, "incorrect_loss_uncond": -10.725183725357056}, "model_output": [{"sum_logits": -4.7519636154174805, "num_tokens": 1, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -13.370038986206055, "logits_per_token": -4.7519636154174805, "logits_per_char": -0.33942597252982004, "num_chars": 14}, {"sum_logits": -0.7007806301116943, "num_tokens": 1, "num_tokens_all": 215, "is_greedy": true, "sum_logits_uncond": -14.619657516479492, "logits_per_token": -0.7007806301116943, "logits_per_char": -0.08759757876396179, "num_chars": 8}, {"sum_logits": -3.5767571926116943, "num_tokens": 1, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -12.541152954101562, "logits_per_token": -3.5767571926116943, "logits_per_char": -0.3576757192611694, "num_chars": 10}, {"sum_logits": -3.6254429817199707, "num_tokens": 1, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -13.264041900634766, "logits_per_token": -3.6254429817199707, "logits_per_char": -0.45318037271499634, "num_chars": 8}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 866, "native_id": "Mercury_417150", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.378020286560059, "incorrect_loss_raw": 6.96543820699056, "correct_loss_per_char": 0.14593400955200195, "incorrect_loss_per_char": 0.22371418669703802, "correct_loss_per_token": 1.4593400955200195, "incorrect_loss_per_token": 1.7985772821638317, "correct_loss_uncond": -15.66223430633545, "incorrect_loss_uncond": -15.291325251261393}, "model_output": [{"sum_logits": -7.454498291015625, "num_tokens": 4, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -23.31460189819336, "logits_per_token": -1.8636245727539062, "logits_per_char": -0.23295307159423828, "num_chars": 32}, {"sum_logits": -4.378020286560059, "num_tokens": 3, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -20.040254592895508, "logits_per_token": -1.4593400955200195, "logits_per_char": -0.14593400955200195, "num_chars": 30}, {"sum_logits": -2.05983829498291, "num_tokens": 3, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -18.088218688964844, "logits_per_token": -0.6866127649943033, "logits_per_char": -0.07102890672354863, "num_chars": 29}, {"sum_logits": -11.381978034973145, "num_tokens": 4, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -25.367469787597656, "logits_per_token": -2.845494508743286, "logits_per_char": -0.3671605817733272, "num_chars": 31}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 867, "native_id": "Mercury_SC_402256", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.216564416885376, "incorrect_loss_raw": 7.4241682688395185, "correct_loss_per_char": 0.45950920241219656, "incorrect_loss_per_char": 1.3169259707132974, "correct_loss_per_token": 3.216564416885376, "incorrect_loss_per_token": 6.230699380238851, "correct_loss_uncond": -11.42209792137146, "incorrect_loss_uncond": -7.761074066162109}, "model_output": [{"sum_logits": -7.160813331604004, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -17.619918823242188, "logits_per_token": -3.580406665802002, "logits_per_char": -1.4321626663208007, "num_chars": 5}, {"sum_logits": -7.951506614685059, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -14.791807174682617, "logits_per_token": -7.951506614685059, "logits_per_char": -1.3252511024475098, "num_chars": 6}, {"sum_logits": -3.216564416885376, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -14.638662338256836, "logits_per_token": -3.216564416885376, "logits_per_char": -0.45950920241219656, "num_chars": 7}, {"sum_logits": -7.160184860229492, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -13.144001007080078, "logits_per_token": -7.160184860229492, "logits_per_char": -1.193364143371582, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 868, "native_id": "TIMSS_2007_8_pg53", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 5.318803787231445, "incorrect_loss_raw": 6.8973764181137085, "correct_loss_per_char": 0.354586919148763, "incorrect_loss_per_char": 0.6129842609421821, "correct_loss_per_token": 2.6594018936157227, "incorrect_loss_per_token": 3.4486882090568542, "correct_loss_uncond": -9.173079490661621, "incorrect_loss_uncond": -8.195218920707703}, "model_output": [{"sum_logits": -5.318803787231445, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -14.491883277893066, "logits_per_token": -2.6594018936157227, "logits_per_char": -0.354586919148763, "num_chars": 15}, {"sum_logits": -6.1945600509643555, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -16.111740112304688, "logits_per_token": -3.0972800254821777, "logits_per_char": -0.4765046193049504, "num_chars": 13}, {"sum_logits": -6.18088960647583, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -15.121659278869629, "logits_per_token": -3.090444803237915, "logits_per_char": -0.47545304665198695, "num_chars": 13}, {"sum_logits": -9.642770767211914, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -14.291642189025879, "logits_per_token": -4.821385383605957, "logits_per_char": -1.0714189741346571, "num_chars": 9}, {"sum_logits": -5.571285247802734, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -14.84533977508545, "logits_per_token": -2.785642623901367, "logits_per_char": -0.42856040367713344, "num_chars": 13}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 869, "native_id": "MCAS_2006_9_17-v1", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 21.033809661865234, "incorrect_loss_raw": 20.289051691691082, "correct_loss_per_char": 0.457256731779679, "incorrect_loss_per_char": 0.4849200822304988, "correct_loss_per_token": 2.1033809661865233, "incorrect_loss_per_token": 2.1783327314588754, "correct_loss_uncond": -29.77237319946289, "incorrect_loss_uncond": -26.3665345509847}, "model_output": [{"sum_logits": -20.205974578857422, "num_tokens": 9, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -46.727943420410156, "logits_per_token": -2.2451082865397134, "logits_per_char": -0.5051493644714355, "num_chars": 40}, {"sum_logits": -20.139467239379883, "num_tokens": 9, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -45.33361053466797, "logits_per_token": -2.2377185821533203, "logits_per_char": -0.5034866809844971, "num_chars": 40}, {"sum_logits": -20.521713256835938, "num_tokens": 10, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -47.90520477294922, "logits_per_token": -2.052171325683594, "logits_per_char": -0.44612420123556384, "num_chars": 46}, {"sum_logits": -21.033809661865234, "num_tokens": 10, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -50.806182861328125, "logits_per_token": -2.1033809661865233, "logits_per_char": -0.457256731779679, "num_chars": 46}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 870, "native_id": "Mercury_401728", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 21.983911514282227, "incorrect_loss_raw": 21.47049903869629, "correct_loss_per_char": 0.7091584359445879, "incorrect_loss_per_char": 0.8759810997307373, "correct_loss_per_token": 3.1405587877546037, "incorrect_loss_per_token": 4.1892607476976185, "correct_loss_uncond": -12.662614822387695, "incorrect_loss_uncond": -9.800858815511068}, "model_output": [{"sum_logits": -23.93019676208496, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -34.42792510986328, "logits_per_token": -3.98836612701416, "logits_per_char": -0.9203921831571139, "num_chars": 26}, {"sum_logits": -21.990392684936523, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -31.602006912231445, "logits_per_token": -5.497598171234131, "logits_per_char": -1.0471615564255488, "num_chars": 21}, {"sum_logits": -18.490907669067383, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -27.784141540527344, "logits_per_token": -3.081817944844564, "logits_per_char": -0.6603895596095494, "num_chars": 28}, {"sum_logits": -21.983911514282227, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -34.64652633666992, "logits_per_token": -3.1405587877546037, "logits_per_char": -0.7091584359445879, "num_chars": 31}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 871, "native_id": "Mercury_7192798", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 8.100167274475098, "incorrect_loss_raw": 8.856117884318033, "correct_loss_per_char": 0.5062604546546936, "incorrect_loss_per_char": 0.5762615336312189, "correct_loss_per_token": 4.050083637237549, "incorrect_loss_per_token": 4.428058942159017, "correct_loss_uncond": -7.69626522064209, "incorrect_loss_uncond": -9.39764404296875}, "model_output": [{"sum_logits": -6.869256973266602, "num_tokens": 2, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -16.36313247680664, "logits_per_token": -3.434628486633301, "logits_per_char": -0.5724380811055502, "num_chars": 12}, {"sum_logits": -8.921125411987305, "num_tokens": 2, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -19.02826690673828, "logits_per_token": -4.460562705993652, "logits_per_char": -0.5575703382492065, "num_chars": 16}, {"sum_logits": -8.100167274475098, "num_tokens": 2, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -15.796432495117188, "logits_per_token": -4.050083637237549, "logits_per_char": -0.5062604546546936, "num_chars": 16}, {"sum_logits": -10.777971267700195, "num_tokens": 2, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -19.36988639831543, "logits_per_token": -5.388985633850098, "logits_per_char": -0.5987761815388998, "num_chars": 18}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 872, "native_id": "Mercury_7221078", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 19.097063064575195, "incorrect_loss_raw": 25.149893442789715, "correct_loss_per_char": 0.47742657661437987, "incorrect_loss_per_char": 0.6309246692641536, "correct_loss_per_token": 2.728151866367885, "incorrect_loss_per_token": 3.431332361130487, "correct_loss_uncond": -21.157663345336914, "incorrect_loss_uncond": -13.780010223388672}, "model_output": [{"sum_logits": -27.13360595703125, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -41.903709411621094, "logits_per_token": -3.3917007446289062, "logits_per_char": -0.7140422620271382, "num_chars": 38}, {"sum_logits": -23.81317901611328, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -37.404022216796875, "logits_per_token": -3.4018827165876115, "logits_per_char": -0.595329475402832, "num_chars": 40}, {"sum_logits": -19.097063064575195, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -40.25472640991211, "logits_per_token": -2.728151866367885, "logits_per_char": -0.47742657661437987, "num_chars": 40}, {"sum_logits": -24.50289535522461, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -37.48197937011719, "logits_per_token": -3.500413622174944, "logits_per_char": -0.5834022703624907, "num_chars": 42}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 873, "native_id": "Mercury_7004953", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 31.484098434448242, "incorrect_loss_raw": 36.17173385620117, "correct_loss_per_char": 0.7321883356848429, "incorrect_loss_per_char": 0.7832478169108263, "correct_loss_per_token": 3.9355123043060303, "incorrect_loss_per_token": 4.5214667320251465, "correct_loss_uncond": -11.89137077331543, "incorrect_loss_uncond": -10.512893676757812}, "model_output": [{"sum_logits": -36.24367904663086, "num_tokens": 8, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -45.18961715698242, "logits_per_token": -4.530459880828857, "logits_per_char": -0.8428762568983921, "num_chars": 43}, {"sum_logits": -31.484098434448242, "num_tokens": 8, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -43.37546920776367, "logits_per_token": -3.9355123043060303, "logits_per_char": -0.7321883356848429, "num_chars": 43}, {"sum_logits": -36.77679443359375, "num_tokens": 8, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -47.78376770019531, "logits_per_token": -4.597099304199219, "logits_per_char": -0.7824849879488032, "num_chars": 47}, {"sum_logits": -35.494728088378906, "num_tokens": 8, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -47.08049774169922, "logits_per_token": -4.436841011047363, "logits_per_char": -0.7243822058852838, "num_chars": 49}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 874, "native_id": "TIMSS_2003_8_pg94", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.67337703704834, "incorrect_loss_raw": 9.49394957224528, "correct_loss_per_char": 1.1346754074096679, "incorrect_loss_per_char": 1.4464390164329892, "correct_loss_per_token": 5.67337703704834, "incorrect_loss_per_token": 9.49394957224528, "correct_loss_uncond": -4.8531389236450195, "incorrect_loss_uncond": -3.7075344721476235}, "model_output": [{"sum_logits": -11.360223770141602, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -13.411972045898438, "logits_per_token": -11.360223770141602, "logits_per_char": -1.8933706283569336, "num_chars": 6}, {"sum_logits": -7.602103233337402, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -12.78154182434082, "logits_per_token": -7.602103233337402, "logits_per_char": -1.086014747619629, "num_chars": 7}, {"sum_logits": -9.519521713256836, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -13.410938262939453, "logits_per_token": -9.519521713256836, "logits_per_char": -1.3599316733224052, "num_chars": 7}, {"sum_logits": -5.67337703704834, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -10.52651596069336, "logits_per_token": -5.67337703704834, "logits_per_char": -1.1346754074096679, "num_chars": 5}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 875, "native_id": "Mercury_7095060", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.085336208343506, "incorrect_loss_raw": 11.724031130472818, "correct_loss_per_char": 0.35796095343197093, "incorrect_loss_per_char": 0.488043275756551, "correct_loss_per_token": 2.0284454027811685, "incorrect_loss_per_token": 2.5757110383775497, "correct_loss_uncond": -14.919561862945557, "incorrect_loss_uncond": -15.499283154805502}, "model_output": [{"sum_logits": -6.085336208343506, "num_tokens": 3, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -21.004898071289062, "logits_per_token": -2.0284454027811685, "logits_per_char": -0.35796095343197093, "num_chars": 17}, {"sum_logits": -5.1953582763671875, "num_tokens": 3, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -19.831417083740234, "logits_per_token": -1.7317860921223958, "logits_per_char": -0.3056093103745404, "num_chars": 17}, {"sum_logits": -14.771036148071289, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -27.84556007385254, "logits_per_token": -2.9542072296142576, "logits_per_char": -0.6154598395029703, "num_chars": 24}, {"sum_logits": -15.20569896697998, "num_tokens": 5, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -33.99296569824219, "logits_per_token": -3.0411397933959963, "logits_per_char": -0.5430606773921421, "num_chars": 28}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 876, "native_id": "Mercury_7123358", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.456857681274414, "incorrect_loss_raw": 15.798906008402506, "correct_loss_per_char": 0.6151092753690832, "incorrect_loss_per_char": 0.6195075677500831, "correct_loss_per_token": 2.091371536254883, "incorrect_loss_per_token": 2.9921643469068737, "correct_loss_uncond": -15.44792366027832, "incorrect_loss_uncond": -14.903661410013834}, "model_output": [{"sum_logits": -10.456857681274414, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -25.904781341552734, "logits_per_token": -2.091371536254883, "logits_per_char": -0.6151092753690832, "num_chars": 17}, {"sum_logits": -15.526296615600586, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -26.933286666870117, "logits_per_token": -3.105259323120117, "logits_per_char": -0.8625720342000326, "num_chars": 18}, {"sum_logits": -15.085516929626465, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -37.238712310791016, "logits_per_token": -2.5142528216044107, "logits_per_char": -0.471422404050827, "num_chars": 32}, {"sum_logits": -16.78490447998047, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -27.93570327758789, "logits_per_token": -3.356980895996094, "logits_per_char": -0.5245282649993896, "num_chars": 32}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 877, "native_id": "Mercury_7069020", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.4826781749725342, "incorrect_loss_raw": 7.546861489613851, "correct_loss_per_char": 0.1647420194413927, "incorrect_loss_per_char": 0.6335795378243482, "correct_loss_per_token": 0.7413390874862671, "incorrect_loss_per_token": 4.868717140621609, "correct_loss_uncond": -12.295562028884888, "incorrect_loss_uncond": -7.811243216196696}, "model_output": [{"sum_logits": -1.4826781749725342, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": true, "sum_logits_uncond": -13.778240203857422, "logits_per_token": -0.7413390874862671, "logits_per_char": -0.1647420194413927, "num_chars": 9}, {"sum_logits": -10.588934898376465, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -13.479413986206055, "logits_per_token": -10.588934898376465, "logits_per_char": -1.1765483220418294, "num_chars": 9}, {"sum_logits": -7.870205402374268, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.543603897094727, "logits_per_token": -2.6234018007914224, "logits_per_char": -0.4918878376483917, "num_chars": 16}, {"sum_logits": -4.18144416809082, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.05129623413086, "logits_per_token": -1.3938147226969402, "logits_per_char": -0.23230245378282335, "num_chars": 18}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 878, "native_id": "TIMSS_2003_8_pg117", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.902910232543945, "incorrect_loss_raw": 9.08020273844401, "correct_loss_per_char": 0.3961164093017578, "incorrect_loss_per_char": 0.6978821698262635, "correct_loss_per_token": 2.4757275581359863, "incorrect_loss_per_token": 3.9783160527547197, "correct_loss_uncond": -13.29224967956543, "incorrect_loss_uncond": -10.727528889973959}, "model_output": [{"sum_logits": -5.617853164672852, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -22.705020904541016, "logits_per_token": -1.1235706329345703, "logits_per_char": -0.20806863572862414, "num_chars": 27}, {"sum_logits": -9.902910232543945, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -23.195159912109375, "logits_per_token": -2.4757275581359863, "logits_per_char": -0.3961164093017578, "num_chars": 25}, {"sum_logits": -10.576781272888184, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -20.325668334960938, "logits_per_token": -5.288390636444092, "logits_per_char": -0.8813984394073486, "num_chars": 12}, {"sum_logits": -11.045973777770996, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.392505645751953, "logits_per_token": -5.522986888885498, "logits_per_char": -1.0041794343428179, "num_chars": 11}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 879, "native_id": "VASoL_2008_3_32", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.666560649871826, "incorrect_loss_raw": 12.219638188680014, "correct_loss_per_char": 0.30554672082265216, "incorrect_loss_per_char": 0.7684059670180884, "correct_loss_per_token": 1.2221868832906086, "incorrect_loss_per_token": 3.452461030748155, "correct_loss_uncond": -14.914873600006104, "incorrect_loss_uncond": -9.363794326782227}, "model_output": [{"sum_logits": -17.426626205444336, "num_tokens": 3, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -18.720905303955078, "logits_per_token": -5.808875401814778, "logits_per_char": -1.3405097081111028, "num_chars": 13}, {"sum_logits": -3.666560649871826, "num_tokens": 3, "num_tokens_all": 199, "is_greedy": true, "sum_logits_uncond": -18.58143424987793, "logits_per_token": -1.2221868832906086, "logits_per_char": -0.30554672082265216, "num_chars": 12}, {"sum_logits": -14.041000366210938, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -25.618053436279297, "logits_per_token": -3.5102500915527344, "logits_per_char": -0.7390000192742598, "num_chars": 19}, {"sum_logits": -5.191287994384766, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -20.411338806152344, "logits_per_token": -1.0382575988769531, "logits_per_char": -0.22570817366890286, "num_chars": 23}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 880, "native_id": "Mercury_SC_400142", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.723502159118652, "incorrect_loss_raw": 11.207701683044434, "correct_loss_per_char": 0.5117632715325606, "incorrect_loss_per_char": 0.6528090713850033, "correct_loss_per_token": 3.2411673863728843, "incorrect_loss_per_token": 2.8019254207611084, "correct_loss_uncond": -14.148070335388184, "incorrect_loss_uncond": -12.685531934102377}, "model_output": [{"sum_logits": -9.723502159118652, "num_tokens": 3, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -23.871572494506836, "logits_per_token": -3.2411673863728843, "logits_per_char": -0.5117632715325606, "num_chars": 19}, {"sum_logits": -5.937163352966309, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -24.611101150512695, "logits_per_token": -1.4842908382415771, "logits_per_char": -0.32984240849812824, "num_chars": 18}, {"sum_logits": -14.339540481567383, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -23.819679260253906, "logits_per_token": -3.5848851203918457, "logits_per_char": -0.8435023812686696, "num_chars": 17}, {"sum_logits": -13.34640121459961, "num_tokens": 4, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -23.248920440673828, "logits_per_token": -3.3366003036499023, "logits_per_char": -0.7850824243882123, "num_chars": 17}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 881, "native_id": "Mercury_7163818", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.797358751296997, "incorrect_loss_raw": 9.164674758911133, "correct_loss_per_char": 0.31644656260808307, "incorrect_loss_per_char": 0.7965166752155012, "correct_loss_per_token": 1.8986793756484985, "incorrect_loss_per_token": 4.582337379455566, "correct_loss_uncond": -13.897255659103394, "incorrect_loss_uncond": -5.308825810750325}, "model_output": [{"sum_logits": -8.08102798461914, "num_tokens": 2, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -13.711383819580078, "logits_per_token": -4.04051399230957, "logits_per_char": -0.6734189987182617, "num_chars": 12}, {"sum_logits": -9.65569019317627, "num_tokens": 2, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -14.548185348510742, "logits_per_token": -4.827845096588135, "logits_per_char": -0.965569019317627, "num_chars": 10}, {"sum_logits": -9.757306098937988, "num_tokens": 2, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -15.160932540893555, "logits_per_token": -4.878653049468994, "logits_per_char": -0.7505620076106145, "num_chars": 13}, {"sum_logits": -3.797358751296997, "num_tokens": 2, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -17.69461441040039, "logits_per_token": -1.8986793756484985, "logits_per_char": -0.31644656260808307, "num_chars": 12}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 882, "native_id": "Mercury_402502", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.734100341796875, "incorrect_loss_raw": 21.91115125020345, "correct_loss_per_char": 3.2167625427246094, "incorrect_loss_per_char": 2.9839900107610795, "correct_loss_per_token": 5.146820068359375, "incorrect_loss_per_token": 4.382230250040689, "correct_loss_uncond": 0.41504669189453125, "incorrect_loss_uncond": -0.4745006561279297}, "model_output": [{"sum_logits": -21.178844451904297, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -21.965076446533203, "logits_per_token": -4.235768890380859, "logits_per_char": -3.0255492074148997, "num_chars": 7}, {"sum_logits": -19.99730110168457, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -21.097412109375, "logits_per_token": -3.999460220336914, "logits_per_char": -2.8567573002406528, "num_chars": 7}, {"sum_logits": -24.557308197021484, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -24.094467163085938, "logits_per_token": -4.911461639404297, "logits_per_char": -3.0696635246276855, "num_chars": 8}, {"sum_logits": -25.734100341796875, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -25.319053649902344, "logits_per_token": -5.146820068359375, "logits_per_char": -3.2167625427246094, "num_chars": 8}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 883, "native_id": "Mercury_7130778", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 9.138911247253418, "incorrect_loss_raw": 10.098378976186117, "correct_loss_per_char": 0.3655564498901367, "incorrect_loss_per_char": 0.455904026566182, "correct_loss_per_token": 1.8277822494506837, "incorrect_loss_per_token": 2.3415869909619533, "correct_loss_uncond": -16.368739128112793, "incorrect_loss_uncond": -12.306296189626059}, "model_output": [{"sum_logits": -6.429296016693115, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -16.34406852722168, "logits_per_token": -2.1430986722310386, "logits_per_char": -0.338384000878585, "num_chars": 19}, {"sum_logits": -13.741060256958008, "num_tokens": 4, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -26.665203094482422, "logits_per_token": -3.435265064239502, "logits_per_char": -0.6543362027122861, "num_chars": 21}, {"sum_logits": -9.138911247253418, "num_tokens": 5, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -25.50765037536621, "logits_per_token": -1.8277822494506837, "logits_per_char": -0.3655564498901367, "num_chars": 25}, {"sum_logits": -10.124780654907227, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -24.204753875732422, "logits_per_token": -1.4463972364153181, "logits_per_char": -0.37499187610767504, "num_chars": 27}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 884, "native_id": "MEA_2010_8_18", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 23.34848403930664, "incorrect_loss_raw": 19.835778554280598, "correct_loss_per_char": 0.5694752204708937, "incorrect_loss_per_char": 0.4453168053021006, "correct_loss_per_token": 2.91856050491333, "incorrect_loss_per_token": 2.0316189231294572, "correct_loss_uncond": -21.55916976928711, "incorrect_loss_uncond": -14.592713673909506}, "model_output": [{"sum_logits": -14.365537643432617, "num_tokens": 8, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -30.45807647705078, "logits_per_token": -1.7956922054290771, "logits_per_char": -0.4225158130421358, "num_chars": 34}, {"sum_logits": -21.490121841430664, "num_tokens": 10, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -35.541603088378906, "logits_per_token": -2.1490121841430665, "logits_per_char": -0.4671765617702318, "num_chars": 46}, {"sum_logits": -23.34848403930664, "num_tokens": 8, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -44.90765380859375, "logits_per_token": -2.91856050491333, "logits_per_char": -0.5694752204708937, "num_chars": 41}, {"sum_logits": -23.651676177978516, "num_tokens": 11, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -37.285797119140625, "logits_per_token": -2.1501523798162285, "logits_per_char": -0.44625804109393424, "num_chars": 53}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 885, "native_id": "Mercury_7211033", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 3.108058214187622, "incorrect_loss_raw": 4.135530153910319, "correct_loss_per_char": 0.38850727677345276, "incorrect_loss_per_char": 0.47189345977924485, "correct_loss_per_token": 3.108058214187622, "incorrect_loss_per_token": 3.6856641372044883, "correct_loss_uncond": -9.78259539604187, "incorrect_loss_uncond": -7.920655250549316}, "model_output": [{"sum_logits": -2.6991961002349854, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -12.253585815429688, "logits_per_token": -1.3495980501174927, "logits_per_char": -0.17994640668233236, "num_chars": 15}, {"sum_logits": -3.108058214187622, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -12.890653610229492, "logits_per_token": -3.108058214187622, "logits_per_char": -0.38850727677345276, "num_chars": 8}, {"sum_logits": -2.828422784805298, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -2.828422784805298, "logits_per_char": -0.4714037974675496, "num_chars": 6}, {"sum_logits": -6.878971576690674, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -14.025976181030273, "logits_per_token": -6.878971576690674, "logits_per_char": -0.7643301751878526, "num_chars": 9}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 886, "native_id": "NYSEDREGENTS_2008_8_17", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.658863067626953, "incorrect_loss_raw": 18.06889247894287, "correct_loss_per_char": 0.38872631390889484, "incorrect_loss_per_char": 0.41170857951915657, "correct_loss_per_token": 2.332357883453369, "incorrect_loss_per_token": 2.258611559867859, "correct_loss_uncond": -15.561698913574219, "incorrect_loss_uncond": -10.41495164235433}, "model_output": [{"sum_logits": -18.658863067626953, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -34.22056198120117, "logits_per_token": -2.332357883453369, "logits_per_char": -0.38872631390889484, "num_chars": 48}, {"sum_logits": -11.871638298034668, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -23.78802490234375, "logits_per_token": -1.4839547872543335, "logits_per_char": -0.2698099613189697, "num_chars": 44}, {"sum_logits": -16.84269905090332, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -29.427927017211914, "logits_per_token": -2.105337381362915, "logits_per_char": -0.3583552989553898, "num_chars": 47}, {"sum_logits": -25.492340087890625, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -32.23558044433594, "logits_per_token": -3.186542510986328, "logits_per_char": -0.6069604782831102, "num_chars": 42}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 887, "native_id": "NAEP_2005_8_S11+1", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 27.831933975219727, "incorrect_loss_raw": 18.772562980651855, "correct_loss_per_char": 0.5154061847262912, "incorrect_loss_per_char": 0.5785788407441909, "correct_loss_per_token": 2.7831933975219725, "incorrect_loss_per_token": 3.2597389221191406, "correct_loss_uncond": -11.714109420776367, "incorrect_loss_uncond": -10.999074618021647}, "model_output": [{"sum_logits": -13.569069862365723, "num_tokens": 3, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -23.4213809967041, "logits_per_token": -4.523023287455241, "logits_per_char": -0.7981805801391602, "num_chars": 17}, {"sum_logits": -20.325702667236328, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -32.19060516357422, "logits_per_token": -3.3876171112060547, "logits_per_char": -0.5978147843304802, "num_chars": 34}, {"sum_logits": -27.831933975219727, "num_tokens": 10, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -39.546043395996094, "logits_per_token": -2.7831933975219725, "logits_per_char": -0.5154061847262912, "num_chars": 54}, {"sum_logits": -22.422916412353516, "num_tokens": 12, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -33.70292663574219, "logits_per_token": -1.8685763676961262, "logits_per_char": -0.33974115776293207, "num_chars": 66}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 888, "native_id": "Mercury_412774", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 22.097461700439453, "incorrect_loss_raw": 24.66402816772461, "correct_loss_per_char": 0.6905456781387329, "incorrect_loss_per_char": 0.7246528114591326, "correct_loss_per_token": 2.7621827125549316, "incorrect_loss_per_token": 2.8439764976501465, "correct_loss_uncond": -15.380451202392578, "incorrect_loss_uncond": -15.800127665201822}, "model_output": [{"sum_logits": -22.362247467041016, "num_tokens": 8, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -37.1235237121582, "logits_per_token": -2.795280933380127, "logits_per_char": -0.6988202333450317, "num_chars": 32}, {"sum_logits": -25.781761169433594, "num_tokens": 9, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -42.224037170410156, "logits_per_token": -2.864640129937066, "logits_per_char": -0.7366217476981026, "num_chars": 35}, {"sum_logits": -22.097461700439453, "num_tokens": 8, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -37.47791290283203, "logits_per_token": -2.7621827125549316, "logits_per_char": -0.6905456781387329, "num_chars": 32}, {"sum_logits": -25.84807586669922, "num_tokens": 9, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -42.04490661621094, "logits_per_token": -2.8720084296332464, "logits_per_char": -0.7385164533342634, "num_chars": 35}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 889, "native_id": "MEA_2013_5_12", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 21.005680084228516, "incorrect_loss_raw": 25.076359430948894, "correct_loss_per_char": 0.2763905274240594, "incorrect_loss_per_char": 0.3586820977805087, "correct_loss_per_token": 1.400378672281901, "incorrect_loss_per_token": 1.9289507254576073, "correct_loss_uncond": -23.244220733642578, "incorrect_loss_uncond": -29.576985041300457}, "model_output": [{"sum_logits": -28.834150314331055, "num_tokens": 13, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -55.978145599365234, "logits_per_token": -2.2180115626408505, "logits_per_char": -0.40047430992126465, "num_chars": 72}, {"sum_logits": -22.413528442382812, "num_tokens": 13, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -54.41875457763672, "logits_per_token": -1.7241175724909856, "logits_per_char": -0.29491484792608963, "num_chars": 76}, {"sum_logits": -21.005680084228516, "num_tokens": 15, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -44.249900817871094, "logits_per_token": -1.400378672281901, "logits_per_char": -0.2763905274240594, "num_chars": 76}, {"sum_logits": -23.981399536132812, "num_tokens": 13, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -53.563133239746094, "logits_per_token": -1.8447230412409856, "logits_per_char": -0.38065713549417163, "num_chars": 63}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 890, "native_id": "Mercury_7098473", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 8.418109893798828, "incorrect_loss_raw": 15.560261726379395, "correct_loss_per_char": 0.49518293492934284, "incorrect_loss_per_char": 0.6559226594286374, "correct_loss_per_token": 2.806036631266276, "incorrect_loss_per_token": 3.0594716147770953, "correct_loss_uncond": -13.515829086303711, "incorrect_loss_uncond": -10.832216580708822}, "model_output": [{"sum_logits": -11.006707191467285, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -22.713504791259766, "logits_per_token": -3.6689023971557617, "logits_per_char": -0.7861933708190918, "num_chars": 14}, {"sum_logits": -8.418109893798828, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -21.93393898010254, "logits_per_token": -2.806036631266276, "logits_per_char": -0.49518293492934284, "num_chars": 17}, {"sum_logits": -17.35505485534668, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -28.888410568237305, "logits_per_token": -2.89250914255778, "logits_per_char": -0.6427798094572844, "num_chars": 27}, {"sum_logits": -18.31902313232422, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -27.575519561767578, "logits_per_token": -2.6170033046177457, "logits_per_char": -0.5387947980095359, "num_chars": 34}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 891, "native_id": "Mercury_417593", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.34986114501953, "incorrect_loss_raw": 27.427257537841797, "correct_loss_per_char": 0.4970561008827359, "incorrect_loss_per_char": 0.5396616715863889, "correct_loss_per_token": 2.534986114501953, "incorrect_loss_per_token": 2.569388337568803, "correct_loss_uncond": -14.909843444824219, "incorrect_loss_uncond": -21.535301208496094}, "model_output": [{"sum_logits": -25.34986114501953, "num_tokens": 10, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -40.25970458984375, "logits_per_token": -2.534986114501953, "logits_per_char": -0.4970561008827359, "num_chars": 51}, {"sum_logits": -25.080425262451172, "num_tokens": 10, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -40.29426574707031, "logits_per_token": -2.5080425262451174, "logits_per_char": -0.5225088596343994, "num_chars": 48}, {"sum_logits": -23.25316619873047, "num_tokens": 11, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -49.35206604003906, "logits_per_token": -2.113924199884588, "logits_per_char": -0.4559444352692249, "num_chars": 51}, {"sum_logits": -33.94818115234375, "num_tokens": 11, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -57.2413444519043, "logits_per_token": -3.0861982865767046, "logits_per_char": -0.6405317198555425, "num_chars": 53}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 892, "native_id": "Mercury_7081743", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.142101287841797, "incorrect_loss_raw": 22.59027099609375, "correct_loss_per_char": 0.29690394682042737, "incorrect_loss_per_char": 0.4388875901324194, "correct_loss_per_token": 2.1631573268345425, "incorrect_loss_per_token": 2.6294399222942317, "correct_loss_uncond": -16.976276397705078, "incorrect_loss_uncond": -13.727106730143229}, "model_output": [{"sum_logits": -22.115283966064453, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -35.68785095214844, "logits_per_token": -2.4572537740071616, "logits_per_char": -0.4172695087936689, "num_chars": 53}, {"sum_logits": -15.142101287841797, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -32.118377685546875, "logits_per_token": -2.1631573268345425, "logits_per_char": -0.29690394682042737, "num_chars": 51}, {"sum_logits": -24.65084457397461, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -40.67024230957031, "logits_per_token": -3.5215492248535156, "logits_per_char": -0.5030784606933594, "num_chars": 49}, {"sum_logits": -21.004684448242188, "num_tokens": 11, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -32.59403991699219, "logits_per_token": -1.909516768022017, "logits_per_char": -0.39631480091022997, "num_chars": 53}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 893, "native_id": "Mercury_7018410", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.23746395111084, "incorrect_loss_raw": 13.659410158793131, "correct_loss_per_char": 0.2843739986419678, "incorrect_loss_per_char": 0.3756559057267828, "correct_loss_per_token": 1.279682993888855, "incorrect_loss_per_token": 1.9322652271815708, "correct_loss_uncond": -11.349722862243652, "incorrect_loss_uncond": -16.8316011428833}, "model_output": [{"sum_logits": -6.77933406829834, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -26.863874435424805, "logits_per_token": -1.3558668136596679, "logits_per_char": -0.2510864469740126, "num_chars": 27}, {"sum_logits": -9.299741744995117, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -30.046781539916992, "logits_per_token": -1.3285345349993025, "logits_per_char": -0.30999139149983723, "num_chars": 30}, {"sum_logits": -10.23746395111084, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -21.587186813354492, "logits_per_token": -1.279682993888855, "logits_per_char": -0.2843739986419678, "num_chars": 36}, {"sum_logits": -24.899154663085938, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -34.5623779296875, "logits_per_token": -3.112394332885742, "logits_per_char": -0.5658898787064985, "num_chars": 44}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 894, "native_id": "Mercury_402563", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.2350919246673584, "incorrect_loss_raw": 3.6263303756713867, "correct_loss_per_char": 0.4116973082224528, "incorrect_loss_per_char": 1.2087767918904622, "correct_loss_per_token": 0.6175459623336792, "incorrect_loss_per_token": 1.8131651878356934, "correct_loss_uncond": -8.772499322891235, "incorrect_loss_uncond": -6.37281068166097}, "model_output": [{"sum_logits": -1.661031723022461, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -8.535137176513672, "logits_per_token": -0.8305158615112305, "logits_per_char": -0.5536772410074869, "num_chars": 3}, {"sum_logits": -1.2350919246673584, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": true, "sum_logits_uncond": -10.007591247558594, "logits_per_token": -0.6175459623336792, "logits_per_char": -0.4116973082224528, "num_chars": 3}, {"sum_logits": -5.063210487365723, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -9.855981826782227, "logits_per_token": -2.5316052436828613, "logits_per_char": -1.6877368291219075, "num_chars": 3}, {"sum_logits": -4.154748916625977, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -11.606304168701172, "logits_per_token": -2.0773744583129883, "logits_per_char": -1.3849163055419922, "num_chars": 3}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 895, "native_id": "Mercury_416407", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 20.695323944091797, "incorrect_loss_raw": 18.40430482228597, "correct_loss_per_char": 0.4498983466106912, "incorrect_loss_per_char": 0.44962283291124655, "correct_loss_per_token": 3.449220657348633, "incorrect_loss_per_token": 3.4370793024698894, "correct_loss_uncond": -20.657745361328125, "incorrect_loss_uncond": -17.290330568949383}, "model_output": [{"sum_logits": -20.288175582885742, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -36.7067985534668, "logits_per_token": -4.057635116577148, "logits_per_char": -0.5338993574443617, "num_chars": 38}, {"sum_logits": -22.126201629638672, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -38.591434478759766, "logits_per_token": -4.425240325927734, "logits_per_char": -0.5822684639378598, "num_chars": 38}, {"sum_logits": -20.695323944091797, "num_tokens": 6, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -41.35306930541992, "logits_per_token": -3.449220657348633, "logits_per_char": -0.4498983466106912, "num_chars": 46}, {"sum_logits": -12.798537254333496, "num_tokens": 7, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -31.785673141479492, "logits_per_token": -1.8283624649047852, "logits_per_char": -0.23270067735151811, "num_chars": 55}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 896, "native_id": "Mercury_SC_400400", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 4.221759796142578, "incorrect_loss_raw": 5.45822016398112, "correct_loss_per_char": 0.5277199745178223, "incorrect_loss_per_char": 0.5677081117726335, "correct_loss_per_token": 4.221759796142578, "incorrect_loss_per_token": 5.45822016398112, "correct_loss_uncond": -9.042282104492188, "incorrect_loss_uncond": -7.232460021972656}, "model_output": [{"sum_logits": -4.221759796142578, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -13.264041900634766, "logits_per_token": -4.221759796142578, "logits_per_char": -0.5277199745178223, "num_chars": 8}, {"sum_logits": -5.44045877456665, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -12.414356231689453, "logits_per_token": -5.44045877456665, "logits_per_char": -0.6044954193962945, "num_chars": 9}, {"sum_logits": -5.178223609924316, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -12.603271484375, "logits_per_token": -5.178223609924316, "logits_per_char": -0.5753581788804796, "num_chars": 9}, {"sum_logits": -5.755978107452393, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -13.054412841796875, "logits_per_token": -5.755978107452393, "logits_per_char": -0.5232707370411266, "num_chars": 11}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 897, "native_id": "MCAS_2000_8_22", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.708353996276855, "incorrect_loss_raw": 15.237365086873373, "correct_loss_per_char": 0.1988399240035045, "incorrect_loss_per_char": 0.1904518609904741, "correct_loss_per_token": 0.9817721247673035, "incorrect_loss_per_token": 1.063795739128476, "correct_loss_uncond": -24.231602668762207, "incorrect_loss_uncond": -23.578311284383137}, "model_output": [{"sum_logits": -9.947319984436035, "num_tokens": 14, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -33.936702728271484, "logits_per_token": -0.7105228560311454, "logits_per_char": -0.1344232430329194, "num_chars": 74}, {"sum_logits": -20.274662017822266, "num_tokens": 14, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -42.019676208496094, "logits_per_token": -1.448190144130162, "logits_per_char": -0.25030446935583045, "num_chars": 81}, {"sum_logits": -15.490113258361816, "num_tokens": 15, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -40.49065017700195, "logits_per_token": -1.032674217224121, "logits_per_char": -0.1866278705826725, "num_chars": 83}, {"sum_logits": -15.708353996276855, "num_tokens": 16, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -39.93995666503906, "logits_per_token": -0.9817721247673035, "logits_per_char": -0.1988399240035045, "num_chars": 79}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 898, "native_id": "MCAS_8_2014_8", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.574874877929688, "incorrect_loss_raw": 22.84819984436035, "correct_loss_per_char": 0.4361809178402549, "incorrect_loss_per_char": 0.4929703966440137, "correct_loss_per_token": 2.071859359741211, "incorrect_loss_per_token": 2.8325596461220393, "correct_loss_uncond": -18.27487564086914, "incorrect_loss_uncond": -12.05827522277832}, "model_output": [{"sum_logits": -16.574874877929688, "num_tokens": 8, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -34.84975051879883, "logits_per_token": -2.071859359741211, "logits_per_char": -0.4361809178402549, "num_chars": 38}, {"sum_logits": -17.224945068359375, "num_tokens": 6, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -30.620723724365234, "logits_per_token": -2.870824178059896, "logits_per_char": -0.43062362670898435, "num_chars": 40}, {"sum_logits": -18.507558822631836, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -30.967906951904297, "logits_per_token": -2.643936974661691, "logits_per_char": -0.44065616244361516, "num_chars": 42}, {"sum_logits": -32.812095642089844, "num_tokens": 11, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -43.130794525146484, "logits_per_token": -2.9829177856445312, "logits_per_char": -0.6076314007794416, "num_chars": 54}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 899, "native_id": "Mercury_7206430", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 18.222867965698242, "incorrect_loss_raw": 25.926172892252605, "correct_loss_per_char": 0.3718952646060866, "incorrect_loss_per_char": 0.5154542166052217, "correct_loss_per_token": 2.603266852242606, "incorrect_loss_per_token": 3.0363691915279976, "correct_loss_uncond": -23.487653732299805, "incorrect_loss_uncond": -19.048861185709637}, "model_output": [{"sum_logits": -18.222867965698242, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -41.71052169799805, "logits_per_token": -2.603266852242606, "logits_per_char": -0.3718952646060866, "num_chars": 49}, {"sum_logits": -29.10032844543457, "num_tokens": 10, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -51.836883544921875, "logits_per_token": -2.910032844543457, "logits_per_char": -0.5290968808260831, "num_chars": 55}, {"sum_logits": -23.781002044677734, "num_tokens": 9, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -39.53270721435547, "logits_per_token": -2.642333560519748, "logits_per_char": -0.41001727663237475, "num_chars": 58}, {"sum_logits": -24.897188186645508, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -43.555511474609375, "logits_per_token": -3.556741169520787, "logits_per_char": -0.6072484923572075, "num_chars": 41}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 900, "native_id": "Mercury_7185343", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 29.573352813720703, "incorrect_loss_raw": 20.341760953267414, "correct_loss_per_char": 0.46941829863048734, "incorrect_loss_per_char": 0.48193810658080144, "correct_loss_per_token": 2.112382343837193, "incorrect_loss_per_token": 2.3149207327100965, "correct_loss_uncond": -24.236377716064453, "incorrect_loss_uncond": -17.20112133026123}, "model_output": [{"sum_logits": -14.199202537536621, "num_tokens": 10, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -38.94233322143555, "logits_per_token": -1.4199202537536622, "logits_per_char": -0.35498006343841554, "num_chars": 40}, {"sum_logits": -23.179977416992188, "num_tokens": 8, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -36.44056701660156, "logits_per_token": -2.8974971771240234, "logits_per_char": -0.565365302853468, "num_chars": 41}, {"sum_logits": -23.646102905273438, "num_tokens": 9, "num_tokens_all": 234, "is_greedy": false, "sum_logits_uncond": -37.24574661254883, "logits_per_token": -2.627344767252604, "logits_per_char": -0.5254689534505208, "num_chars": 45}, {"sum_logits": -29.573352813720703, "num_tokens": 14, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -53.809730529785156, "logits_per_token": -2.112382343837193, "logits_per_char": -0.46941829863048734, "num_chars": 63}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 901, "native_id": "OHAT_2010_8_8", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.1687605381011963, "incorrect_loss_raw": 2.717559059460958, "correct_loss_per_char": 0.28806913982738147, "incorrect_loss_per_char": 0.2594579574389335, "correct_loss_per_token": 1.5843802690505981, "incorrect_loss_per_token": 1.269961012734307, "correct_loss_uncond": -13.320890188217163, "incorrect_loss_uncond": -13.657803336779276}, "model_output": [{"sum_logits": -3.734968900680542, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -15.928037643432617, "logits_per_token": -1.867484450340271, "logits_per_char": -0.3734968900680542, "num_chars": 10}, {"sum_logits": -3.1687605381011963, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -16.48965072631836, "logits_per_token": -1.5843802690505981, "logits_per_char": -0.28806913982738147, "num_chars": 11}, {"sum_logits": -2.8189749717712402, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -16.53949737548828, "logits_per_token": -1.4094874858856201, "logits_per_char": -0.28189749717712403, "num_chars": 10}, {"sum_logits": -1.5987333059310913, "num_tokens": 3, "num_tokens_all": 205, "is_greedy": true, "sum_logits_uncond": -16.658552169799805, "logits_per_token": -0.5329111019770304, "logits_per_char": -0.1229794850716224, "num_chars": 13}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 902, "native_id": "Mercury_405462", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.65477180480957, "incorrect_loss_raw": 25.838254928588867, "correct_loss_per_char": 0.5474066393715995, "incorrect_loss_per_char": 0.4741170126109304, "correct_loss_per_token": 3.4060857560899525, "incorrect_loss_per_token": 2.867034832636515, "correct_loss_uncond": -13.467920303344727, "incorrect_loss_uncond": -18.628868738810223}, "model_output": [{"sum_logits": -23.54446792602539, "num_tokens": 8, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -40.80538558959961, "logits_per_token": -2.943058490753174, "logits_per_char": -0.4804993454290896, "num_chars": 49}, {"sum_logits": -30.65477180480957, "num_tokens": 9, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -44.1226921081543, "logits_per_token": -3.4060857560899525, "logits_per_char": -0.5474066393715995, "num_chars": 56}, {"sum_logits": -27.852510452270508, "num_tokens": 8, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -42.157222747802734, "logits_per_token": -3.4815638065338135, "logits_per_char": -0.5461276559268727, "num_chars": 51}, {"sum_logits": -26.117786407470703, "num_tokens": 12, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -50.43876266479492, "logits_per_token": -2.1764822006225586, "logits_per_char": -0.39572403647682886, "num_chars": 66}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 903, "native_id": "Mercury_SC_LBS10337", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 20.486600875854492, "incorrect_loss_raw": 25.569308598836262, "correct_loss_per_char": 0.4097320175170898, "incorrect_loss_per_char": 0.5600480645419363, "correct_loss_per_token": 2.048660087585449, "incorrect_loss_per_token": 2.9072071882782793, "correct_loss_uncond": -17.962358474731445, "incorrect_loss_uncond": -13.608367284138998}, "model_output": [{"sum_logits": -20.486600875854492, "num_tokens": 10, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -38.44895935058594, "logits_per_token": -2.048660087585449, "logits_per_char": -0.4097320175170898, "num_chars": 50}, {"sum_logits": -32.0789794921875, "num_tokens": 10, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -44.141212463378906, "logits_per_token": -3.20789794921875, "logits_per_char": -0.534649658203125, "num_chars": 60}, {"sum_logits": -17.480981826782227, "num_tokens": 7, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -31.800506591796875, "logits_per_token": -2.4972831181117465, "logits_per_char": -0.5141465243171243, "num_chars": 34}, {"sum_logits": -27.147964477539062, "num_tokens": 9, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -41.59130859375, "logits_per_token": -3.01644049750434, "logits_per_char": -0.6313480111055596, "num_chars": 43}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 904, "native_id": "Mercury_7142520", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.8575921058654785, "incorrect_loss_raw": 8.67890214920044, "correct_loss_per_char": 0.285740712109734, "incorrect_loss_per_char": 0.4930656292859246, "correct_loss_per_token": 2.4287960529327393, "incorrect_loss_per_token": 4.33945107460022, "correct_loss_uncond": -12.057096004486084, "incorrect_loss_uncond": -8.159201463063559}, "model_output": [{"sum_logits": -9.175987243652344, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -18.226131439208984, "logits_per_token": -4.587993621826172, "logits_per_char": -0.5734992027282715, "num_chars": 16}, {"sum_logits": -7.1016621589660645, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -15.246206283569336, "logits_per_token": -3.5508310794830322, "logits_per_char": -0.41774483288035674, "num_chars": 17}, {"sum_logits": -4.8575921058654785, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -16.914688110351562, "logits_per_token": -2.4287960529327393, "logits_per_char": -0.285740712109734, "num_chars": 17}, {"sum_logits": -9.75905704498291, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -17.041973114013672, "logits_per_token": -4.879528522491455, "logits_per_char": -0.48795285224914553, "num_chars": 20}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 905, "native_id": "Mercury_SC_405501", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.531423568725586, "incorrect_loss_raw": 20.108197530110676, "correct_loss_per_char": 0.2925395435757107, "incorrect_loss_per_char": 0.5808963401644838, "correct_loss_per_token": 1.5044890812465124, "incorrect_loss_per_token": 2.630919615427653, "correct_loss_uncond": -15.802005767822266, "incorrect_loss_uncond": -11.774541219075521}, "model_output": [{"sum_logits": -22.76378631591797, "num_tokens": 8, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -33.598995208740234, "logits_per_token": -2.845473289489746, "logits_per_char": -0.6695231269387638, "num_chars": 34}, {"sum_logits": -17.838459014892578, "num_tokens": 8, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -32.939453125, "logits_per_token": -2.2298073768615723, "logits_per_char": -0.5096702575683594, "num_chars": 35}, {"sum_logits": -19.722347259521484, "num_tokens": 7, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -29.10976791381836, "logits_per_token": -2.8174781799316406, "logits_per_char": -0.5634956359863281, "num_chars": 35}, {"sum_logits": -10.531423568725586, "num_tokens": 7, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -26.33342933654785, "logits_per_token": -1.5044890812465124, "logits_per_char": -0.2925395435757107, "num_chars": 36}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 906, "native_id": "Mercury_7009555", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.775867462158203, "incorrect_loss_raw": 15.015278498331705, "correct_loss_per_char": 0.5095126711089035, "incorrect_loss_per_char": 0.48894880703670274, "correct_loss_per_token": 2.9551734924316406, "incorrect_loss_per_token": 2.265043228391617, "correct_loss_uncond": -14.318977355957031, "incorrect_loss_uncond": -17.962858200073242}, "model_output": [{"sum_logits": -18.34882164001465, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -38.82748794555664, "logits_per_token": -2.621260234287807, "logits_per_char": -0.5560248981822621, "num_chars": 33}, {"sum_logits": -15.120433807373047, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -30.474987030029297, "logits_per_token": -2.5200723012288413, "logits_per_char": -0.5600160669397425, "num_chars": 27}, {"sum_logits": -14.775867462158203, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -29.094844818115234, "logits_per_token": -2.9551734924316406, "logits_per_char": -0.5095126711089035, "num_chars": 29}, {"sum_logits": -11.576580047607422, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -29.631935119628906, "logits_per_token": -1.6537971496582031, "logits_per_char": -0.3508054559881037, "num_chars": 33}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 907, "native_id": "Mercury_409085", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 11.021668434143066, "incorrect_loss_raw": 8.61473290125529, "correct_loss_per_char": 1.2246298260158963, "incorrect_loss_per_char": 1.0222089842513755, "correct_loss_per_token": 1.5745240620204382, "incorrect_loss_per_token": 1.3421328824663918, "correct_loss_uncond": -12.06904125213623, "incorrect_loss_uncond": -13.779076099395752}, "model_output": [{"sum_logits": -11.021668434143066, "num_tokens": 7, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -23.090709686279297, "logits_per_token": -1.5745240620204382, "logits_per_char": -1.2246298260158963, "num_chars": 9}, {"sum_logits": -11.800647735595703, "num_tokens": 7, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -24.430252075195312, "logits_per_token": -1.6858068193708147, "logits_per_char": -1.3111830817328558, "num_chars": 9}, {"sum_logits": -6.32395076751709, "num_tokens": 6, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -21.2509708404541, "logits_per_token": -1.0539917945861816, "logits_per_char": -0.7904938459396362, "num_chars": 8}, {"sum_logits": -7.719600200653076, "num_tokens": 6, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -21.50020408630371, "logits_per_token": -1.2866000334421794, "logits_per_char": -0.9649500250816345, "num_chars": 8}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 908, "native_id": "NYSEDREGENTS_2012_4_2", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.37401580810547, "incorrect_loss_raw": 11.160808881123861, "correct_loss_per_char": 1.3645013173421223, "incorrect_loss_per_char": 0.7747967248708534, "correct_loss_per_token": 5.458005269368489, "incorrect_loss_per_token": 3.720269627041287, "correct_loss_uncond": -6.408840179443359, "incorrect_loss_uncond": -9.706374168395996}, "model_output": [{"sum_logits": -16.37401580810547, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -22.782855987548828, "logits_per_token": -5.458005269368489, "logits_per_char": -1.3645013173421223, "num_chars": 12}, {"sum_logits": -15.572244644165039, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -25.19935417175293, "logits_per_token": -5.19074821472168, "logits_per_char": -1.19786497262808, "num_chars": 13}, {"sum_logits": -8.604583740234375, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -17.28524398803711, "logits_per_token": -2.868194580078125, "logits_per_char": -0.5061519847196692, "num_chars": 17}, {"sum_logits": -9.305598258972168, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -20.11695098876953, "logits_per_token": -3.101866086324056, "logits_per_char": -0.6203732172648112, "num_chars": 15}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 909, "native_id": "Mercury_407539", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.83357048034668, "incorrect_loss_raw": 25.33608563741048, "correct_loss_per_char": 0.6075345316240864, "incorrect_loss_per_char": 0.6703258352373337, "correct_loss_per_token": 2.6905100686209544, "incorrect_loss_per_token": 3.32261628196353, "correct_loss_uncond": -13.11465835571289, "incorrect_loss_uncond": -11.552156448364258}, "model_output": [{"sum_logits": -28.01244354248047, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -36.9959602355957, "logits_per_token": -3.5015554428100586, "logits_per_char": -0.6514521754065226, "num_chars": 43}, {"sum_logits": -21.854076385498047, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -37.64577102661133, "logits_per_token": -2.731759548187256, "logits_per_char": -0.5906507131215688, "num_chars": 37}, {"sum_logits": -26.14173698425293, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -36.02299499511719, "logits_per_token": -3.7345338548932756, "logits_per_char": -0.7688746171839097, "num_chars": 34}, {"sum_logits": -18.83357048034668, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -31.94822883605957, "logits_per_token": -2.6905100686209544, "logits_per_char": -0.6075345316240864, "num_chars": 31}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 910, "native_id": "ACTAAP_2013_7_16", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 6.888792991638184, "incorrect_loss_raw": 6.769918600718181, "correct_loss_per_char": 0.861099123954773, "incorrect_loss_per_char": 0.8397811178177123, "correct_loss_per_token": 6.888792991638184, "incorrect_loss_per_token": 6.769918600718181, "correct_loss_uncond": -7.133734703063965, "incorrect_loss_uncond": -5.718954563140869}, "model_output": [{"sum_logits": -8.27517032623291, "num_tokens": 1, "num_tokens_all": 174, "is_greedy": false, "sum_logits_uncond": -12.559499740600586, "logits_per_token": -8.27517032623291, "logits_per_char": -1.1821671894618444, "num_chars": 7}, {"sum_logits": -6.8910112380981445, "num_tokens": 1, "num_tokens_all": 174, "is_greedy": false, "sum_logits_uncond": -13.184402465820312, "logits_per_token": -6.8910112380981445, "logits_per_char": -0.7656679153442383, "num_chars": 9}, {"sum_logits": -5.143574237823486, "num_tokens": 1, "num_tokens_all": 174, "is_greedy": false, "sum_logits_uncond": -11.72271728515625, "logits_per_token": -5.143574237823486, "logits_per_char": -0.5715082486470541, "num_chars": 9}, {"sum_logits": -6.888792991638184, "num_tokens": 1, "num_tokens_all": 174, "is_greedy": false, "sum_logits_uncond": -14.022527694702148, "logits_per_token": -6.888792991638184, "logits_per_char": -0.861099123954773, "num_chars": 8}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 911, "native_id": "AKDE&ED_2008_8_34", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 7.874091148376465, "incorrect_loss_raw": 10.940217971801758, "correct_loss_per_char": 0.21872475412156847, "incorrect_loss_per_char": 0.34751032071854526, "correct_loss_per_token": 1.574818229675293, "incorrect_loss_per_token": 2.228988483217028, "correct_loss_uncond": -15.366671562194824, "incorrect_loss_uncond": -16.83374532063802}, "model_output": [{"sum_logits": -12.376395225524902, "num_tokens": 6, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -28.691204071044922, "logits_per_token": -2.062732537587484, "logits_per_char": -0.3992385556620936, "num_chars": 31}, {"sum_logits": -10.707623481750488, "num_tokens": 4, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -29.357755661010742, "logits_per_token": -2.676905870437622, "logits_per_char": -0.35692078272501626, "num_chars": 30}, {"sum_logits": -9.736635208129883, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -25.272930145263672, "logits_per_token": -1.9473270416259765, "logits_per_char": -0.286371623768526, "num_chars": 34}, {"sum_logits": -7.874091148376465, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -23.24076271057129, "logits_per_token": -1.574818229675293, "logits_per_char": -0.21872475412156847, "num_chars": 36}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 912, "native_id": "MCAS_2004_8_3", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.7930870056152344, "incorrect_loss_raw": 7.084125916163127, "correct_loss_per_char": 0.21485284658578727, "incorrect_loss_per_char": 0.635307146443261, "correct_loss_per_token": 2.7930870056152344, "incorrect_loss_per_token": 5.664657513300578, "correct_loss_uncond": -10.67225456237793, "incorrect_loss_uncond": -6.650934457778931}, "model_output": [{"sum_logits": -2.7930870056152344, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -13.465341567993164, "logits_per_token": -2.7930870056152344, "logits_per_char": -0.21485284658578727, "num_chars": 13}, {"sum_logits": -3.729802370071411, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -13.066205978393555, "logits_per_token": -3.729802370071411, "logits_per_char": -0.3729802370071411, "num_chars": 10}, {"sum_logits": -8.516810417175293, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -16.036880493164062, "logits_per_token": -4.2584052085876465, "logits_per_char": -0.5323006510734558, "num_chars": 16}, {"sum_logits": -9.005764961242676, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -12.102094650268555, "logits_per_token": -9.005764961242676, "logits_per_char": -1.0006405512491863, "num_chars": 9}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 913, "native_id": "Mercury_415272", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.725519180297852, "incorrect_loss_raw": 4.532389402389526, "correct_loss_per_char": 0.36788186572846915, "incorrect_loss_per_char": 0.1951791226406068, "correct_loss_per_token": 1.2875865300496419, "incorrect_loss_per_token": 0.6775195492638483, "correct_loss_uncond": -13.099817276000977, "incorrect_loss_uncond": -15.043208360671997}, "model_output": [{"sum_logits": -5.807531356811523, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -20.146652221679688, "logits_per_token": -0.9679218928019205, "logits_per_char": -0.2903765678405762, "num_chars": 20}, {"sum_logits": -7.725519180297852, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -20.825336456298828, "logits_per_token": -1.2875865300496419, "logits_per_char": -0.36788186572846915, "num_chars": 21}, {"sum_logits": -2.8567306995391846, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": true, "sum_logits_uncond": -17.764469146728516, "logits_per_token": -0.5713461399078369, "logits_per_char": -0.13603479521615164, "num_chars": 21}, {"sum_logits": -4.932906150817871, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -20.815671920776367, "logits_per_token": -0.4932906150817871, "logits_per_char": -0.15912600486509262, "num_chars": 31}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 914, "native_id": "Mercury_405387", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 8.402337074279785, "incorrect_loss_raw": 9.178144137064615, "correct_loss_per_char": 1.680467414855957, "incorrect_loss_per_char": 1.835628827412923, "correct_loss_per_token": 2.8007790247599282, "incorrect_loss_per_token": 3.0593813790215383, "correct_loss_uncond": -9.08322811126709, "incorrect_loss_uncond": -7.794961929321289}, "model_output": [{"sum_logits": -8.330696105957031, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -15.97514820098877, "logits_per_token": -2.7768987019856772, "logits_per_char": -1.6661392211914063, "num_chars": 5}, {"sum_logits": -8.402337074279785, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -17.485565185546875, "logits_per_token": -2.8007790247599282, "logits_per_char": -1.680467414855957, "num_chars": 5}, {"sum_logits": -9.109573364257812, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -17.698347091674805, "logits_per_token": -3.036524454752604, "logits_per_char": -1.8219146728515625, "num_chars": 5}, {"sum_logits": -10.094162940979004, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -17.24582290649414, "logits_per_token": -3.3647209803263345, "logits_per_char": -2.0188325881958007, "num_chars": 5}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 915, "native_id": "Mercury_7116323", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 14.143726348876953, "incorrect_loss_raw": 23.379976908365887, "correct_loss_per_char": 0.23186436637503202, "incorrect_loss_per_char": 0.5010419176871568, "correct_loss_per_token": 1.571525149875217, "incorrect_loss_per_token": 3.240848795572917, "correct_loss_uncond": -26.711021423339844, "incorrect_loss_uncond": -17.683115641276043}, "model_output": [{"sum_logits": -18.013952255249023, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -35.08029556274414, "logits_per_token": -3.0023253758748374, "logits_per_char": -0.5003875626458062, "num_chars": 36}, {"sum_logits": -22.614347457885742, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -37.45510482788086, "logits_per_token": -3.7690579096476235, "logits_per_char": -0.5025410546196831, "num_chars": 45}, {"sum_logits": -29.51163101196289, "num_tokens": 10, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -50.65387725830078, "logits_per_token": -2.951163101196289, "logits_per_char": -0.5001971357959812, "num_chars": 59}, {"sum_logits": -14.143726348876953, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -40.8547477722168, "logits_per_token": -1.571525149875217, "logits_per_char": -0.23186436637503202, "num_chars": 61}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 916, "native_id": "Mercury_7213430", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.216422080993652, "incorrect_loss_raw": 20.631675720214844, "correct_loss_per_char": 0.18880602972848073, "incorrect_loss_per_char": 0.3775558332769142, "correct_loss_per_token": 1.101368506749471, "incorrect_loss_per_token": 1.9691489961412216, "correct_loss_uncond": -20.30397129058838, "incorrect_loss_uncond": -16.47748311360677}, "model_output": [{"sum_logits": -16.923343658447266, "num_tokens": 12, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -36.15370178222656, "logits_per_token": -1.4102786382039387, "logits_per_char": -0.24176205226353237, "num_chars": 70}, {"sum_logits": -13.216422080993652, "num_tokens": 12, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -33.52039337158203, "logits_per_token": -1.101368506749471, "logits_per_char": -0.18880602972848073, "num_chars": 70}, {"sum_logits": -23.224716186523438, "num_tokens": 10, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -38.5419921875, "logits_per_token": -2.3224716186523438, "logits_per_char": -0.46449432373046873, "num_chars": 50}, {"sum_logits": -21.746967315673828, "num_tokens": 10, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -36.63178253173828, "logits_per_token": -2.1746967315673826, "logits_per_char": -0.42641112383674173, "num_chars": 51}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 917, "native_id": "Mercury_7234360", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.85806655883789, "incorrect_loss_raw": 27.620202382405598, "correct_loss_per_char": 0.43225811689327925, "incorrect_loss_per_char": 0.5379193007235098, "correct_loss_per_token": 2.1072583198547363, "incorrect_loss_per_token": 2.661606737011858, "correct_loss_uncond": -25.013450622558594, "incorrect_loss_uncond": -24.55194600423177}, "model_output": [{"sum_logits": -16.85806655883789, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -41.871517181396484, "logits_per_token": -2.1072583198547363, "logits_per_char": -0.43225811689327925, "num_chars": 39}, {"sum_logits": -18.950176239013672, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -43.23223876953125, "logits_per_token": -2.1055751376681857, "logits_per_char": -0.42111502753363717, "num_chars": 45}, {"sum_logits": -28.169448852539062, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -50.43518829345703, "logits_per_token": -3.129938761393229, "logits_per_char": -0.5868635177612305, "num_chars": 48}, {"sum_logits": -35.74098205566406, "num_tokens": 13, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -62.84901809692383, "logits_per_token": -2.7493063119741588, "logits_per_char": -0.6057793568756621, "num_chars": 59}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 918, "native_id": "Mercury_405685", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 27.0655460357666, "incorrect_loss_raw": 16.913710594177246, "correct_loss_per_char": 0.6294313031573628, "incorrect_loss_per_char": 0.6300900345052627, "correct_loss_per_token": 3.383193254470825, "incorrect_loss_per_token": 2.8169996988205686, "correct_loss_uncond": -13.870115280151367, "incorrect_loss_uncond": -6.161118507385254}, "model_output": [{"sum_logits": -14.598261833190918, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -20.30739974975586, "logits_per_token": -2.9196523666381835, "logits_per_char": -0.6635573560541327, "num_chars": 22}, {"sum_logits": -15.459342956542969, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -22.45428466796875, "logits_per_token": -2.576557159423828, "logits_per_char": -0.6183737182617187, "num_chars": 25}, {"sum_logits": -20.68352699279785, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -26.46280288696289, "logits_per_token": -2.954789570399693, "logits_per_char": -0.6083390291999368, "num_chars": 34}, {"sum_logits": -27.0655460357666, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -40.93566131591797, "logits_per_token": -3.383193254470825, "logits_per_char": -0.6294313031573628, "num_chars": 43}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 919, "native_id": "Mercury_7236740", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.779869079589844, "incorrect_loss_raw": 4.407823403676351, "correct_loss_per_char": 0.7224836349487305, "incorrect_loss_per_char": 0.5318463804860594, "correct_loss_per_token": 2.889934539794922, "incorrect_loss_per_token": 2.2039117018381753, "correct_loss_uncond": -5.6746978759765625, "incorrect_loss_uncond": -7.487233638763428}, "model_output": [{"sum_logits": -5.093136787414551, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -11.178924560546875, "logits_per_token": -2.5465683937072754, "logits_per_char": -0.7275909696306501, "num_chars": 7}, {"sum_logits": -5.779869079589844, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -11.454566955566406, "logits_per_token": -2.889934539794922, "logits_per_char": -0.7224836349487305, "num_chars": 8}, {"sum_logits": -4.9423346519470215, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -11.62992000579834, "logits_per_token": -2.4711673259735107, "logits_per_char": -0.5491482946607802, "num_chars": 9}, {"sum_logits": -3.1879987716674805, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": true, "sum_logits_uncond": -12.876326560974121, "logits_per_token": -1.5939993858337402, "logits_per_char": -0.31879987716674807, "num_chars": 10}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 920, "native_id": "Mercury_7116235", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.36137866973877, "incorrect_loss_raw": 19.254741668701172, "correct_loss_per_char": 0.38403446674346925, "incorrect_loss_per_char": 0.4944096097024368, "correct_loss_per_token": 2.1944826671055386, "incorrect_loss_per_token": 2.881643068222773, "correct_loss_uncond": -20.99152660369873, "incorrect_loss_uncond": -17.073449452718098}, "model_output": [{"sum_logits": -15.36137866973877, "num_tokens": 7, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -36.3529052734375, "logits_per_token": -2.1944826671055386, "logits_per_char": -0.38403446674346925, "num_chars": 40}, {"sum_logits": -16.501676559448242, "num_tokens": 6, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -35.036895751953125, "logits_per_token": -2.750279426574707, "logits_per_char": -0.4024799160841035, "num_chars": 41}, {"sum_logits": -23.555875778198242, "num_tokens": 7, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -39.040924072265625, "logits_per_token": -3.3651251111711775, "logits_per_char": -0.5888968944549561, "num_chars": 40}, {"sum_logits": -17.70667266845703, "num_tokens": 7, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -34.90675354003906, "logits_per_token": -2.529524666922433, "logits_per_char": -0.4918520185682509, "num_chars": 36}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 921, "native_id": "Mercury_SC_405357", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.987712860107422, "incorrect_loss_raw": 18.306074778238933, "correct_loss_per_char": 0.5425060817173549, "incorrect_loss_per_char": 0.5338329567055187, "correct_loss_per_token": 2.7125304085867747, "incorrect_loss_per_token": 3.2402726067437064, "correct_loss_uncond": -19.132862091064453, "incorrect_loss_uncond": -16.69113286336263}, "model_output": [{"sum_logits": -17.03341293334961, "num_tokens": 5, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -29.10226058959961, "logits_per_token": -3.406682586669922, "logits_per_char": -0.5677804311116537, "num_chars": 30}, {"sum_logits": -18.552783966064453, "num_tokens": 6, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -39.06172561645508, "logits_per_token": -3.092130661010742, "logits_per_char": -0.5622055747292258, "num_chars": 33}, {"sum_logits": -18.987712860107422, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -38.120574951171875, "logits_per_token": -2.7125304085867747, "logits_per_char": -0.5425060817173549, "num_chars": 35}, {"sum_logits": -19.332027435302734, "num_tokens": 6, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -36.82763671875, "logits_per_token": -3.2220045725504556, "logits_per_char": -0.47151286427567646, "num_chars": 41}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 922, "native_id": "Mercury_7042945", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 6.8088226318359375, "incorrect_loss_raw": 8.92199738820394, "correct_loss_per_char": 0.2960357666015625, "incorrect_loss_per_char": 0.39346838527255584, "correct_loss_per_token": 1.7022056579589844, "incorrect_loss_per_token": 2.0916268189748126, "correct_loss_uncond": -12.912124633789062, "incorrect_loss_uncond": -14.21389102935791}, "model_output": [{"sum_logits": -7.818657875061035, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -27.538400650024414, "logits_per_token": -1.9546644687652588, "logits_per_char": -0.39093289375305174, "num_chars": 20}, {"sum_logits": -6.8088226318359375, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -19.720947265625, "logits_per_token": -1.7022056579589844, "logits_per_char": -0.2960357666015625, "num_chars": 23}, {"sum_logits": -8.332351684570312, "num_tokens": 5, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -22.689199447631836, "logits_per_token": -1.6664703369140625, "logits_per_char": -0.3471813201904297, "num_chars": 24}, {"sum_logits": -10.614982604980469, "num_tokens": 4, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -19.180065155029297, "logits_per_token": -2.653745651245117, "logits_per_char": -0.4422909418741862, "num_chars": 24}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 923, "native_id": "Mercury_7106750", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.297039985656738, "incorrect_loss_raw": 17.111868222554524, "correct_loss_per_char": 0.44991294075460997, "incorrect_loss_per_char": 0.37999823534930194, "correct_loss_per_token": 1.6996711095174153, "incorrect_loss_per_token": 1.5997497165013874, "correct_loss_uncond": -20.371508598327637, "incorrect_loss_uncond": -20.913565635681152}, "model_output": [{"sum_logits": -22.798614501953125, "num_tokens": 14, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -39.65258026123047, "logits_per_token": -1.6284724644252233, "logits_per_char": -0.414520263671875, "num_chars": 55}, {"sum_logits": -9.531201362609863, "num_tokens": 9, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -37.445152282714844, "logits_per_token": -1.0590223736233182, "logits_per_char": -0.21180447472466363, "num_chars": 45}, {"sum_logits": -19.005788803100586, "num_tokens": 9, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -36.97856903076172, "logits_per_token": -2.1117543114556208, "logits_per_char": -0.5136699676513672, "num_chars": 37}, {"sum_logits": -15.297039985656738, "num_tokens": 9, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -35.668548583984375, "logits_per_token": -1.6996711095174153, "logits_per_char": -0.44991294075460997, "num_chars": 34}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 924, "native_id": "MDSA_2009_4_34", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.35116195678711, "incorrect_loss_raw": 22.794867833455402, "correct_loss_per_char": 0.541985034942627, "incorrect_loss_per_char": 0.39122810366384786, "correct_loss_per_token": 3.372351328531901, "incorrect_loss_per_token": 2.1347581521429198, "correct_loss_uncond": -7.781219482421875, "incorrect_loss_uncond": -19.28006871541341}, "model_output": [{"sum_logits": -11.089654922485352, "num_tokens": 8, "num_tokens_all": 290, "is_greedy": false, "sum_logits_uncond": -26.913496017456055, "logits_per_token": -1.386206865310669, "logits_per_char": -0.27047938835330126, "num_chars": 41}, {"sum_logits": -30.35116195678711, "num_tokens": 9, "num_tokens_all": 291, "is_greedy": false, "sum_logits_uncond": -38.132381439208984, "logits_per_token": -3.372351328531901, "logits_per_char": -0.541985034942627, "num_chars": 56}, {"sum_logits": -32.14048767089844, "num_tokens": 11, "num_tokens_all": 293, "is_greedy": false, "sum_logits_uncond": -53.079307556152344, "logits_per_token": -2.9218625155362217, "logits_per_char": -0.5101664709666419, "num_chars": 63}, {"sum_logits": -25.154460906982422, "num_tokens": 12, "num_tokens_all": 294, "is_greedy": false, "sum_logits_uncond": -46.23200607299805, "logits_per_token": -2.0962050755818686, "logits_per_char": -0.39303845167160034, "num_chars": 64}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 925, "native_id": "Mercury_7016310", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 5.890256881713867, "incorrect_loss_raw": 6.147777875264485, "correct_loss_per_char": 0.7362821102142334, "incorrect_loss_per_char": 0.7727265796333395, "correct_loss_per_token": 5.890256881713867, "incorrect_loss_per_token": 6.147777875264485, "correct_loss_uncond": -9.718515396118164, "incorrect_loss_uncond": -7.368040402730306}, "model_output": [{"sum_logits": -6.689910888671875, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -12.664569854736328, "logits_per_token": -6.689910888671875, "logits_per_char": -0.7433234320746528, "num_chars": 9}, {"sum_logits": -5.835428714752197, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -13.369495391845703, "logits_per_token": -5.835428714752197, "logits_per_char": -0.7294285893440247, "num_chars": 8}, {"sum_logits": -5.890256881713867, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -15.608772277832031, "logits_per_token": -5.890256881713867, "logits_per_char": -0.7362821102142334, "num_chars": 8}, {"sum_logits": -5.917994022369385, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -14.513389587402344, "logits_per_token": -5.917994022369385, "logits_per_char": -0.8454277174813407, "num_chars": 7}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 926, "native_id": "VASoL_2007_3_1", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.5945940017700195, "incorrect_loss_raw": 10.784717241923014, "correct_loss_per_char": 1.8986485004425049, "incorrect_loss_per_char": 1.173503466326781, "correct_loss_per_token": 7.5945940017700195, "incorrect_loss_per_token": 5.9087982177734375, "correct_loss_uncond": -2.923985481262207, "incorrect_loss_uncond": -4.788065274556478}, "model_output": [{"sum_logits": -7.5945940017700195, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -10.518579483032227, "logits_per_token": -7.5945940017700195, "logits_per_char": -1.8986485004425049, "num_chars": 4}, {"sum_logits": -12.20382308959961, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -17.551513671875, "logits_per_token": -6.101911544799805, "logits_per_char": -1.1094384626908735, "num_chars": 11}, {"sum_logits": -11.367794036865234, "num_tokens": 4, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -19.724021911621094, "logits_per_token": -2.8419485092163086, "logits_per_char": -0.9473161697387695, "num_chars": 12}, {"sum_logits": -8.7825345993042, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -9.442811965942383, "logits_per_token": -8.7825345993042, "logits_per_char": -1.4637557665507, "num_chars": 6}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 927, "native_id": "Mercury_7030468", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.806880950927734, "incorrect_loss_raw": 32.23810577392578, "correct_loss_per_char": 0.4501563852483576, "incorrect_loss_per_char": 0.4449452387366964, "correct_loss_per_token": 1.9806880950927734, "incorrect_loss_per_token": 2.1329255149478006, "correct_loss_uncond": -16.82382583618164, "incorrect_loss_uncond": -17.06832504272461}, "model_output": [{"sum_logits": -19.806880950927734, "num_tokens": 10, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -36.630706787109375, "logits_per_token": -1.9806880950927734, "logits_per_char": -0.4501563852483576, "num_chars": 44}, {"sum_logits": -27.26272964477539, "num_tokens": 14, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -38.951385498046875, "logits_per_token": -1.9473378317696708, "logits_per_char": -0.3839821076728928, "num_chars": 71}, {"sum_logits": -40.08580780029297, "num_tokens": 20, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -62.32461929321289, "logits_per_token": -2.0042903900146483, "logits_per_char": -0.47721199762253536, "num_chars": 84}, {"sum_logits": -29.365779876708984, "num_tokens": 12, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -46.643287658691406, "logits_per_token": -2.447148323059082, "logits_per_char": -0.47364161091466106, "num_chars": 62}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 928, "native_id": "Mercury_SC_402616", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 28.15302276611328, "incorrect_loss_raw": 17.105353037516277, "correct_loss_per_char": 0.7820284101698134, "incorrect_loss_per_char": 0.604735860427778, "correct_loss_per_token": 4.69217046101888, "incorrect_loss_per_token": 3.421070607503255, "correct_loss_uncond": -13.047073364257812, "incorrect_loss_uncond": -9.247116088867188}, "model_output": [{"sum_logits": -12.133121490478516, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -22.999082565307617, "logits_per_token": -2.426624298095703, "logits_per_char": -0.5275270213251528, "num_chars": 23}, {"sum_logits": -18.05814552307129, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -26.635478973388672, "logits_per_token": -3.611629104614258, "logits_per_char": -0.5825208233248803, "num_chars": 31}, {"sum_logits": -21.124792098999023, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -29.4228458404541, "logits_per_token": -4.224958419799805, "logits_per_char": -0.7041597366333008, "num_chars": 30}, {"sum_logits": -28.15302276611328, "num_tokens": 6, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -41.200096130371094, "logits_per_token": -4.69217046101888, "logits_per_char": -0.7820284101698134, "num_chars": 36}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 929, "native_id": "Mercury_405464", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 21.893150329589844, "incorrect_loss_raw": 31.72608820597331, "correct_loss_per_char": 0.40542870980721935, "incorrect_loss_per_char": 0.5745054270353979, "correct_loss_per_token": 2.432572258843316, "incorrect_loss_per_token": 2.8239653954139126, "correct_loss_uncond": -18.428810119628906, "incorrect_loss_uncond": -10.452590942382812}, "model_output": [{"sum_logits": -21.893150329589844, "num_tokens": 9, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -40.32196044921875, "logits_per_token": -2.432572258843316, "logits_per_char": -0.40542870980721935, "num_chars": 54}, {"sum_logits": -30.985321044921875, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -38.959136962890625, "logits_per_token": -3.0985321044921874, "logits_per_char": -0.5738022415726273, "num_chars": 54}, {"sum_logits": -31.134342193603516, "num_tokens": 11, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -39.5634651184082, "logits_per_token": -2.830394744873047, "logits_per_char": -0.5987373498769907, "num_chars": 52}, {"sum_logits": -33.05860137939453, "num_tokens": 13, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -48.01343536376953, "logits_per_token": -2.5429693368765025, "logits_per_char": -0.5509766896565755, "num_chars": 60}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 930, "native_id": "Mercury_7205608", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.081418991088867, "incorrect_loss_raw": 16.79061730702718, "correct_loss_per_char": 0.6579799652099609, "incorrect_loss_per_char": 0.6075737669774884, "correct_loss_per_token": 4.770354747772217, "incorrect_loss_per_token": 3.4049998018476693, "correct_loss_uncond": -7.702020645141602, "incorrect_loss_uncond": -9.973750750223795}, "model_output": [{"sum_logits": -14.770781517028809, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -22.779630661010742, "logits_per_token": -3.692695379257202, "logits_per_char": -0.5275279113224575, "num_chars": 28}, {"sum_logits": -19.081418991088867, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -26.78343963623047, "logits_per_token": -4.770354747772217, "logits_per_char": -0.6579799652099609, "num_chars": 29}, {"sum_logits": -17.937301635742188, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -28.403976440429688, "logits_per_token": -2.9895502726236978, "logits_per_char": -0.6643445050274884, "num_chars": 27}, {"sum_logits": -17.663768768310547, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -29.1094970703125, "logits_per_token": -3.532753753662109, "logits_per_char": -0.6308488845825195, "num_chars": 28}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 931, "native_id": "Mercury_7015208", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.088134765625, "incorrect_loss_raw": 18.54381497701009, "correct_loss_per_char": 0.796142578125, "incorrect_loss_per_char": 0.6098601898927798, "correct_loss_per_token": 4.617626953125, "incorrect_loss_per_token": 3.7087629954020187, "correct_loss_uncond": -14.580368041992188, "incorrect_loss_uncond": -15.245417912801107}, "model_output": [{"sum_logits": -18.924997329711914, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -33.37471389770508, "logits_per_token": -3.784999465942383, "logits_per_char": -0.6525861148176522, "num_chars": 29}, {"sum_logits": -23.088134765625, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -37.66850280761719, "logits_per_token": -4.617626953125, "logits_per_char": -0.796142578125, "num_chars": 29}, {"sum_logits": -14.360624313354492, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -31.766124725341797, "logits_per_token": -2.8721248626708986, "logits_per_char": -0.4786874771118164, "num_chars": 30}, {"sum_logits": -22.345823287963867, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -36.22686004638672, "logits_per_token": -4.469164657592773, "logits_per_char": -0.6983069777488708, "num_chars": 32}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 932, "native_id": "Mercury_SC_409666", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.771986961364746, "incorrect_loss_raw": 16.80927340189616, "correct_loss_per_char": 0.358454249121926, "incorrect_loss_per_char": 0.43884417705046824, "correct_loss_per_token": 2.253140994480678, "incorrect_loss_per_token": 2.10115917523702, "correct_loss_uncond": -13.57524585723877, "incorrect_loss_uncond": -15.857673009236654}, "model_output": [{"sum_logits": -15.049844741821289, "num_tokens": 8, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -33.791114807128906, "logits_per_token": -1.8812305927276611, "logits_per_char": -0.41805124282836914, "num_chars": 36}, {"sum_logits": -13.488208770751953, "num_tokens": 8, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -27.279556274414062, "logits_per_token": -1.6860260963439941, "logits_per_char": -0.3372052192687988, "num_chars": 40}, {"sum_logits": -21.889766693115234, "num_tokens": 8, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -36.93016815185547, "logits_per_token": -2.7362208366394043, "logits_per_char": -0.5612760690542368, "num_chars": 39}, {"sum_logits": -15.771986961364746, "num_tokens": 7, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -29.347232818603516, "logits_per_token": -2.253140994480678, "logits_per_char": -0.358454249121926, "num_chars": 44}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 933, "native_id": "Mercury_7230353", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.552265644073486, "incorrect_loss_raw": 4.8190938631693525, "correct_loss_per_char": 0.3793554703394572, "incorrect_loss_per_char": 0.4731314816000142, "correct_loss_per_token": 4.552265644073486, "incorrect_loss_per_token": 4.173294107119243, "correct_loss_uncond": -9.11193037033081, "incorrect_loss_uncond": -9.888617992401123}, "model_output": [{"sum_logits": -3.8313395977020264, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -13.217485427856445, "logits_per_token": -3.8313395977020264, "logits_per_char": -0.5473342282431466, "num_chars": 7}, {"sum_logits": -4.552265644073486, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -13.664196014404297, "logits_per_token": -4.552265644073486, "logits_per_char": -0.3793554703394572, "num_chars": 12}, {"sum_logits": -3.874798536300659, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -15.993424415588379, "logits_per_token": -1.9373992681503296, "logits_per_char": -0.25831990242004393, "num_chars": 15}, {"sum_logits": -6.751143455505371, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -14.912225723266602, "logits_per_token": -6.751143455505371, "logits_per_char": -0.6137403141368519, "num_chars": 11}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 934, "native_id": "Mercury_7150343", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.261993408203125, "incorrect_loss_raw": 16.223244349161785, "correct_loss_per_char": 0.295672607421875, "incorrect_loss_per_char": 0.3105277137960342, "correct_loss_per_token": 1.8068881564670138, "incorrect_loss_per_token": 1.8149898758640994, "correct_loss_uncond": -28.932960510253906, "incorrect_loss_uncond": -22.888011296590168}, "model_output": [{"sum_logits": -15.936151504516602, "num_tokens": 9, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -42.36463165283203, "logits_per_token": -1.7706835005018446, "logits_per_char": -0.2897482091730291, "num_chars": 55}, {"sum_logits": -16.261993408203125, "num_tokens": 9, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -45.19495391845703, "logits_per_token": -1.8068881564670138, "logits_per_char": -0.295672607421875, "num_chars": 55}, {"sum_logits": -16.037118911743164, "num_tokens": 8, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -37.51319885253906, "logits_per_token": -2.0046398639678955, "logits_per_char": -0.314453311994964, "num_chars": 51}, {"sum_logits": -16.696462631225586, "num_tokens": 10, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -37.455936431884766, "logits_per_token": -1.6696462631225586, "logits_per_char": -0.3273816202201095, "num_chars": 51}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 935, "native_id": "Mercury_7026723", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.092280864715576, "incorrect_loss_raw": 5.883577903111775, "correct_loss_per_char": 0.2995459332185633, "incorrect_loss_per_char": 0.49787622292836503, "correct_loss_per_token": 1.6974269549051921, "incorrect_loss_per_token": 2.411270684666104, "correct_loss_uncond": -14.19016695022583, "incorrect_loss_uncond": -9.433987061182657}, "model_output": [{"sum_logits": -4.217963218688965, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -13.728349685668945, "logits_per_token": -2.1089816093444824, "logits_per_char": -0.4686625798543294, "num_chars": 9}, {"sum_logits": -3.883441686630249, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -14.464568138122559, "logits_per_token": -1.9417208433151245, "logits_per_char": -0.3883441686630249, "num_chars": 10}, {"sum_logits": -9.549328804016113, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -17.759777069091797, "logits_per_token": -3.1831096013387046, "logits_per_char": -0.6366219202677409, "num_chars": 15}, {"sum_logits": -5.092280864715576, "num_tokens": 3, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -19.282447814941406, "logits_per_token": -1.6974269549051921, "logits_per_char": -0.2995459332185633, "num_chars": 17}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 936, "native_id": "Mercury_7024273", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 24.373802185058594, "incorrect_loss_raw": 28.99784978230794, "correct_loss_per_char": 0.4779176899031097, "incorrect_loss_per_char": 0.666423008704541, "correct_loss_per_token": 2.4373802185058593, "incorrect_loss_per_token": 3.6921476460007763, "correct_loss_uncond": -13.06875991821289, "incorrect_loss_uncond": -11.657435099283854}, "model_output": [{"sum_logits": -29.35955810546875, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -41.438209533691406, "logits_per_token": -3.262173122829861, "logits_per_char": -0.6524346245659722, "num_chars": 45}, {"sum_logits": -23.472820281982422, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -36.84173583984375, "logits_per_token": -2.9341025352478027, "logits_per_char": -0.5334731882268732, "num_chars": 44}, {"sum_logits": -24.373802185058594, "num_tokens": 10, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -37.442562103271484, "logits_per_token": -2.4373802185058593, "logits_per_char": -0.4779176899031097, "num_chars": 51}, {"sum_logits": -34.161170959472656, "num_tokens": 7, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -43.685909271240234, "logits_per_token": -4.880167279924665, "logits_per_char": -0.8133612133207775, "num_chars": 42}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 937, "native_id": "AKDE&ED_2008_8_40", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 46.731842041015625, "incorrect_loss_raw": 41.426127115885414, "correct_loss_per_char": 0.6772730730581975, "incorrect_loss_per_char": 0.6797472604036132, "correct_loss_per_token": 5.19242689344618, "incorrect_loss_per_token": 4.960442790278682, "correct_loss_uncond": -12.270877838134766, "incorrect_loss_uncond": -12.842965443929037}, "model_output": [{"sum_logits": -38.98822021484375, "num_tokens": 8, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -52.532737731933594, "logits_per_token": -4.873527526855469, "logits_per_char": -0.6962182181222099, "num_chars": 56}, {"sum_logits": -38.24037170410156, "num_tokens": 8, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -52.1150016784668, "logits_per_token": -4.780046463012695, "logits_per_char": -0.670883714107045, "num_chars": 57}, {"sum_logits": -46.731842041015625, "num_tokens": 9, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -59.00271987915039, "logits_per_token": -5.19242689344618, "logits_per_char": -0.6772730730581975, "num_chars": 69}, {"sum_logits": -47.04978942871094, "num_tokens": 9, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -58.15953826904297, "logits_per_token": -5.227754380967882, "logits_per_char": -0.6721398489815849, "num_chars": 70}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 938, "native_id": "Mercury_183033", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.870242595672607, "incorrect_loss_raw": 9.880013465881348, "correct_loss_per_char": 0.2795353616986956, "incorrect_loss_per_char": 0.4336639732808943, "correct_loss_per_token": 1.1740485191345216, "incorrect_loss_per_token": 1.9760026931762695, "correct_loss_uncond": -29.70155382156372, "incorrect_loss_uncond": -26.148175875345867}, "model_output": [{"sum_logits": -6.112042427062988, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -31.592910766601562, "logits_per_token": -1.2224084854125976, "logits_per_char": -0.32168644352963094, "num_chars": 19}, {"sum_logits": -5.870242595672607, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -35.57179641723633, "logits_per_token": -1.1740485191345216, "logits_per_char": -0.2795353616986956, "num_chars": 21}, {"sum_logits": -12.549650192260742, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -37.76692581176758, "logits_per_token": -2.5099300384521483, "logits_per_char": -0.5019860076904297, "num_chars": 25}, {"sum_logits": -10.978347778320312, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -38.7247314453125, "logits_per_token": -2.1956695556640624, "logits_per_char": -0.47731946862262226, "num_chars": 23}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 939, "native_id": "Mercury_402364", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.78704833984375, "incorrect_loss_raw": 9.049244244893393, "correct_loss_per_char": 0.5136689685639881, "incorrect_loss_per_char": 0.6869386023429221, "correct_loss_per_token": 1.5410069056919642, "incorrect_loss_per_token": 2.419246900649298, "correct_loss_uncond": -14.275812149047852, "incorrect_loss_uncond": -11.257389386494955}, "model_output": [{"sum_logits": -9.005317687988281, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -18.683679580688477, "logits_per_token": -3.0017725626627603, "logits_per_char": -0.818665244362571, "num_chars": 11}, {"sum_logits": -8.737021446228027, "num_tokens": 3, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -18.421438217163086, "logits_per_token": -2.9123404820760093, "logits_per_char": -0.7942746769298207, "num_chars": 11}, {"sum_logits": -9.405393600463867, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -23.814783096313477, "logits_per_token": -1.343627657209124, "logits_per_char": -0.4478758857363746, "num_chars": 21}, {"sum_logits": -10.78704833984375, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -25.0628604888916, "logits_per_token": -1.5410069056919642, "logits_per_char": -0.5136689685639881, "num_chars": 21}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 940, "native_id": "Mercury_7263183", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.023221492767334, "incorrect_loss_raw": 6.067032972971599, "correct_loss_per_char": 0.26187919533771015, "incorrect_loss_per_char": 0.3276639541886504, "correct_loss_per_token": 2.0077404975891113, "incorrect_loss_per_token": 2.038113021850586, "correct_loss_uncond": -10.8504319190979, "incorrect_loss_uncond": -11.339202404022217}, "model_output": [{"sum_logits": -4.028979778289795, "num_tokens": 3, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -16.369611740112305, "logits_per_token": -1.3429932594299316, "logits_per_char": -0.21205156727841026, "num_chars": 19}, {"sum_logits": -6.023221492767334, "num_tokens": 3, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -16.873653411865234, "logits_per_token": -2.0077404975891113, "logits_per_char": -0.26187919533771015, "num_chars": 23}, {"sum_logits": -6.456406593322754, "num_tokens": 2, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -18.226131439208984, "logits_per_token": -3.228203296661377, "logits_per_char": -0.4035254120826721, "num_chars": 16}, {"sum_logits": -7.715712547302246, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -17.622962951660156, "logits_per_token": -1.5431425094604492, "logits_per_char": -0.36741488320486887, "num_chars": 21}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 941, "native_id": "Mercury_7222530", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.851848602294922, "incorrect_loss_raw": 27.117953618367512, "correct_loss_per_char": 0.31481474096124823, "incorrect_loss_per_char": 0.5051441464697306, "correct_loss_per_token": 1.5390942891438801, "incorrect_loss_per_token": 2.625712186639959, "correct_loss_uncond": -17.688800811767578, "incorrect_loss_uncond": -14.04081662495931}, "model_output": [{"sum_logits": -28.407447814941406, "num_tokens": 11, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -43.85747146606445, "logits_per_token": -2.582495255903764, "logits_per_char": -0.4734574635823568, "num_chars": 60}, {"sum_logits": -13.851848602294922, "num_tokens": 9, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -31.5406494140625, "logits_per_token": -1.5390942891438801, "logits_per_char": -0.31481474096124823, "num_chars": 44}, {"sum_logits": -25.0377197265625, "num_tokens": 10, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -41.1780891418457, "logits_per_token": -2.50377197265625, "logits_per_char": -0.4724098061615566, "num_chars": 53}, {"sum_logits": -27.908693313598633, "num_tokens": 10, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -38.44075012207031, "logits_per_token": -2.7908693313598634, "logits_per_char": -0.5695651696652783, "num_chars": 49}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 942, "native_id": "OHAT_2009_8_36", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 14.802727699279785, "incorrect_loss_raw": 22.54292360941569, "correct_loss_per_char": 0.3795571204943535, "incorrect_loss_per_char": 0.6782081595852844, "correct_loss_per_token": 1.6447475221421983, "incorrect_loss_per_token": 3.645786354276869, "correct_loss_uncond": -10.583361625671387, "incorrect_loss_uncond": -6.301238377888997}, "model_output": [{"sum_logits": -21.933216094970703, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -27.694801330566406, "logits_per_token": -4.3866432189941404, "logits_per_char": -0.8435852344219501, "num_chars": 26}, {"sum_logits": -20.130516052246094, "num_tokens": 6, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -25.886959075927734, "logits_per_token": -3.355086008707682, "logits_per_char": -0.6100156379468513, "num_chars": 33}, {"sum_logits": -14.802727699279785, "num_tokens": 9, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -25.386089324951172, "logits_per_token": -1.6447475221421983, "logits_per_char": -0.3795571204943535, "num_chars": 39}, {"sum_logits": -25.565038681030273, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -32.95072555541992, "logits_per_token": -3.195629835128784, "logits_per_char": -0.5810236063870516, "num_chars": 44}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 943, "native_id": "Mercury_7141750", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 14.68521499633789, "incorrect_loss_raw": 25.084774653116863, "correct_loss_per_char": 0.24475358327229818, "incorrect_loss_per_char": 0.5133435075357418, "correct_loss_per_token": 1.3350195451216265, "incorrect_loss_per_token": 2.4352004166805385, "correct_loss_uncond": -25.833332061767578, "incorrect_loss_uncond": -18.541594823201496}, "model_output": [{"sum_logits": -19.290855407714844, "num_tokens": 8, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -39.0894775390625, "logits_per_token": -2.4113569259643555, "logits_per_char": -0.4705086684808498, "num_chars": 41}, {"sum_logits": -14.68521499633789, "num_tokens": 11, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -40.51854705810547, "logits_per_token": -1.3350195451216265, "logits_per_char": -0.24475358327229818, "num_chars": 60}, {"sum_logits": -25.521371841430664, "num_tokens": 12, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -46.28591537475586, "logits_per_token": -2.1267809867858887, "logits_per_char": -0.4726179970635308, "num_chars": 54}, {"sum_logits": -30.442096710205078, "num_tokens": 11, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -45.50371551513672, "logits_per_token": -2.7674633372913706, "logits_per_char": -0.5969038570628447, "num_chars": 51}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 944, "native_id": "TIMSS_2011_4_pg45", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 7.5940632820129395, "incorrect_loss_raw": 15.98362668355306, "correct_loss_per_char": 0.44670960482429056, "incorrect_loss_per_char": 0.5244560877482096, "correct_loss_per_token": 2.5313544273376465, "incorrect_loss_per_token": 2.804892942640516, "correct_loss_uncond": -15.736854076385498, "incorrect_loss_uncond": -15.7710812886556}, "model_output": [{"sum_logits": -7.5940632820129395, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -23.330917358398438, "logits_per_token": -2.5313544273376465, "logits_per_char": -0.44670960482429056, "num_chars": 17}, {"sum_logits": -12.685964584350586, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -31.980873107910156, "logits_per_token": -2.5371929168701173, "logits_per_char": -0.48792171478271484, "num_chars": 26}, {"sum_logits": -19.055801391601562, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -31.607608795166016, "logits_per_token": -3.1759668986002603, "logits_per_char": -0.6351933797200521, "num_chars": 30}, {"sum_logits": -16.20911407470703, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -31.675642013549805, "logits_per_token": -2.701519012451172, "logits_per_char": -0.450253168741862, "num_chars": 36}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 945, "native_id": "MCAS_2014_5_5", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.850523948669434, "incorrect_loss_raw": 10.306381861368815, "correct_loss_per_char": 0.48754366238911945, "incorrect_loss_per_char": 0.9418165109096429, "correct_loss_per_token": 2.925261974334717, "incorrect_loss_per_token": 5.153190930684407, "correct_loss_uncond": -10.392830848693848, "incorrect_loss_uncond": -7.824384053548177}, "model_output": [{"sum_logits": -5.850523948669434, "num_tokens": 2, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -16.24335479736328, "logits_per_token": -2.925261974334717, "logits_per_char": -0.48754366238911945, "num_chars": 12}, {"sum_logits": -9.242785453796387, "num_tokens": 2, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -18.62438201904297, "logits_per_token": -4.621392726898193, "logits_per_char": -0.9242785453796387, "num_chars": 10}, {"sum_logits": -10.129542350769043, "num_tokens": 2, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -16.15218162536621, "logits_per_token": -5.0647711753845215, "logits_per_char": -1.0129542350769043, "num_chars": 10}, {"sum_logits": -11.546817779541016, "num_tokens": 2, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -19.615734100341797, "logits_per_token": -5.773408889770508, "logits_per_char": -0.8882167522723858, "num_chars": 13}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 946, "native_id": "Mercury_SC_409241", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.527606964111328, "incorrect_loss_raw": 17.550797780354817, "correct_loss_per_char": 0.516487717628479, "incorrect_loss_per_char": 0.6342228431978822, "correct_loss_per_token": 2.754601160685221, "incorrect_loss_per_token": 3.212712908548022, "correct_loss_uncond": -10.702747344970703, "incorrect_loss_uncond": -14.808411916097006}, "model_output": [{"sum_logits": -15.984886169433594, "num_tokens": 4, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -24.66149139404297, "logits_per_token": -3.9962215423583984, "logits_per_char": -0.8413097983912418, "num_chars": 19}, {"sum_logits": -16.95547866821289, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -35.7215576171875, "logits_per_token": -2.825913111368815, "logits_per_char": -0.5138023838852391, "num_chars": 33}, {"sum_logits": -16.527606964111328, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -27.23035430908203, "logits_per_token": -2.754601160685221, "logits_per_char": -0.516487717628479, "num_chars": 32}, {"sum_logits": -19.71202850341797, "num_tokens": 7, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -36.694580078125, "logits_per_token": -2.816004071916853, "logits_per_char": -0.5475563473171658, "num_chars": 36}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 947, "native_id": "Mercury_SC_401147", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.818626403808594, "incorrect_loss_raw": 16.164867719014484, "correct_loss_per_char": 0.4233893258231027, "incorrect_loss_per_char": 0.4695192601477319, "correct_loss_per_token": 2.469771067301432, "incorrect_loss_per_token": 2.3497521725911943, "correct_loss_uncond": -20.372650146484375, "incorrect_loss_uncond": -17.153326352437336}, "model_output": [{"sum_logits": -13.48592472076416, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -27.171707153320312, "logits_per_token": -2.24765412012736, "logits_per_char": -0.43502982970206966, "num_chars": 31}, {"sum_logits": -23.8289852142334, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -34.63508987426758, "logits_per_token": -3.4041407448904857, "logits_per_char": -0.7008525063009823, "num_chars": 34}, {"sum_logits": -14.818626403808594, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -35.19127655029297, "logits_per_token": -2.469771067301432, "logits_per_char": -0.4233893258231027, "num_chars": 35}, {"sum_logits": -11.179693222045898, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -38.14778518676758, "logits_per_token": -1.3974616527557373, "logits_per_char": -0.27267544444014385, "num_chars": 41}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 948, "native_id": "Mercury_SC_LBS10273", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.8940582275390625, "incorrect_loss_raw": 6.333109537760417, "correct_loss_per_char": 0.6991511753627232, "incorrect_loss_per_char": 0.6979257810683478, "correct_loss_per_token": 2.4470291137695312, "incorrect_loss_per_token": 3.1665547688802085, "correct_loss_uncond": -8.971131324768066, "incorrect_loss_uncond": -7.720053990681966}, "model_output": [{"sum_logits": -4.523037910461426, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -14.1472749710083, "logits_per_token": -2.261518955230713, "logits_per_char": -0.6461482729230609, "num_chars": 7}, {"sum_logits": -8.136015892028809, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -14.322368621826172, "logits_per_token": -4.068007946014404, "logits_per_char": -0.8136015892028808, "num_chars": 10}, {"sum_logits": -6.340274810791016, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -13.689846992492676, "logits_per_token": -3.170137405395508, "logits_per_char": -0.6340274810791016, "num_chars": 10}, {"sum_logits": -4.8940582275390625, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -13.865189552307129, "logits_per_token": -2.4470291137695312, "logits_per_char": -0.6991511753627232, "num_chars": 7}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 949, "native_id": "Mercury_401523", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 15.322467803955078, "incorrect_loss_raw": 25.888308842976887, "correct_loss_per_char": 0.4643172061804569, "incorrect_loss_per_char": 0.6093622949556756, "correct_loss_per_token": 2.553744633992513, "incorrect_loss_per_token": 3.251962056235662, "correct_loss_uncond": -11.222423553466797, "incorrect_loss_uncond": -10.593269983927408}, "model_output": [{"sum_logits": -15.322467803955078, "num_tokens": 6, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -26.544891357421875, "logits_per_token": -2.553744633992513, "logits_per_char": -0.4643172061804569, "num_chars": 33}, {"sum_logits": -17.607393264770508, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -26.33123016357422, "logits_per_token": -2.5153418949672153, "logits_per_char": -0.5178645077873679, "num_chars": 34}, {"sum_logits": -17.875778198242188, "num_tokens": 7, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -29.042755126953125, "logits_per_token": -2.553682599748884, "logits_per_char": -0.48312914049303207, "num_chars": 37}, {"sum_logits": -42.18175506591797, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -54.07075119018555, "logits_per_token": -4.686861673990886, "logits_per_char": -0.8270932365866268, "num_chars": 51}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 950, "native_id": "Mercury_401865", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 35.11870574951172, "incorrect_loss_raw": 25.749989827473957, "correct_loss_per_char": 0.71670828060228, "incorrect_loss_per_char": 0.6879047378291929, "correct_loss_per_token": 2.7014389038085938, "incorrect_loss_per_token": 2.1502050801405357, "correct_loss_uncond": -16.047653198242188, "incorrect_loss_uncond": -16.817092895507812}, "model_output": [{"sum_logits": -26.406204223632812, "num_tokens": 12, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -40.275489807128906, "logits_per_token": -2.200517018636068, "logits_per_char": -0.7335056728786893, "num_chars": 36}, {"sum_logits": -26.60245132446289, "num_tokens": 13, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -42.56385803222656, "logits_per_token": -2.0463424095740685, "logits_per_char": -0.7389569812350802, "num_chars": 36}, {"sum_logits": -24.241313934326172, "num_tokens": 11, "num_tokens_all": 229, "is_greedy": false, "sum_logits_uncond": -44.861900329589844, "logits_per_token": -2.20375581221147, "logits_per_char": -0.591251559373809, "num_chars": 41}, {"sum_logits": -35.11870574951172, "num_tokens": 13, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -51.166358947753906, "logits_per_token": -2.7014389038085938, "logits_per_char": -0.71670828060228, "num_chars": 49}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 951, "native_id": "MCAS_2013_8_29435", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 4.7555365562438965, "incorrect_loss_raw": 7.4894154866536455, "correct_loss_per_char": 0.5283929506937662, "incorrect_loss_per_char": 0.7710012087115535, "correct_loss_per_token": 2.3777682781219482, "incorrect_loss_per_token": 3.7447077433268228, "correct_loss_uncond": -12.802237033843994, "incorrect_loss_uncond": -11.318989435831705}, "model_output": [{"sum_logits": -6.955667972564697, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -18.516647338867188, "logits_per_token": -3.4778339862823486, "logits_per_char": -0.8694584965705872, "num_chars": 8}, {"sum_logits": -4.7555365562438965, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -17.55777359008789, "logits_per_token": -2.3777682781219482, "logits_per_char": -0.5283929506937662, "num_chars": 9}, {"sum_logits": -5.42988920211792, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -18.889406204223633, "logits_per_token": -2.71494460105896, "logits_per_char": -0.6033210224575467, "num_chars": 9}, {"sum_logits": -10.08268928527832, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -19.019161224365234, "logits_per_token": -5.04134464263916, "logits_per_char": -0.8402241071065267, "num_chars": 12}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 952, "native_id": "Mercury_SC_406720", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.27234935760498, "incorrect_loss_raw": 13.980376879374186, "correct_loss_per_char": 0.4214704253456809, "incorrect_loss_per_char": 0.7506280447549866, "correct_loss_per_token": 2.318087339401245, "incorrect_loss_per_token": 3.212546523412069, "correct_loss_uncond": -21.451611518859863, "incorrect_loss_uncond": -11.070036888122559}, "model_output": [{"sum_logits": -13.597326278686523, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -25.231719970703125, "logits_per_token": -3.399331569671631, "logits_per_char": -0.7554070154825846, "num_chars": 18}, {"sum_logits": -16.952861785888672, "num_tokens": 5, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -26.95052719116211, "logits_per_token": -3.390572357177734, "logits_per_char": -0.7370809472125509, "num_chars": 23}, {"sum_logits": -9.27234935760498, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -30.723960876464844, "logits_per_token": -2.318087339401245, "logits_per_char": -0.4214704253456809, "num_chars": 22}, {"sum_logits": -11.390942573547363, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -22.968994140625, "logits_per_token": -2.847735643386841, "logits_per_char": -0.7593961715698242, "num_chars": 15}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 953, "native_id": "NYSEDREGENTS_2013_8_34", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 9.527602195739746, "incorrect_loss_raw": 25.512893676757812, "correct_loss_per_char": 0.232380541359506, "incorrect_loss_per_char": 0.4418724001133662, "correct_loss_per_token": 1.5879336992899578, "incorrect_loss_per_token": 2.3626507905072356, "correct_loss_uncond": -14.414870262145996, "incorrect_loss_uncond": -15.286951700846354}, "model_output": [{"sum_logits": -9.527602195739746, "num_tokens": 6, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -23.942472457885742, "logits_per_token": -1.5879336992899578, "logits_per_char": -0.232380541359506, "num_chars": 41}, {"sum_logits": -26.875595092773438, "num_tokens": 13, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -45.543243408203125, "logits_per_token": -2.06735346867488, "logits_per_char": -0.41347069373497597, "num_chars": 65}, {"sum_logits": -24.62732696533203, "num_tokens": 11, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -38.23704528808594, "logits_per_token": -2.2388479059392754, "logits_per_char": -0.43977369580950054, "num_chars": 56}, {"sum_logits": -25.03575897216797, "num_tokens": 9, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -38.61924743652344, "logits_per_token": -2.7817509969075522, "logits_per_char": -0.47237281079562204, "num_chars": 53}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 954, "native_id": "Mercury_7038833", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.763638496398926, "incorrect_loss_raw": 12.192850112915039, "correct_loss_per_char": 0.33667718953099746, "incorrect_loss_per_char": 0.9392429338962542, "correct_loss_per_token": 1.952727699279785, "incorrect_loss_per_token": 3.2888133790757923, "correct_loss_uncond": -29.145041465759277, "incorrect_loss_uncond": -9.902566274007162}, "model_output": [{"sum_logits": -11.946250915527344, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -18.746139526367188, "logits_per_token": -3.9820836385091147, "logits_per_char": -1.3273612128363714, "num_chars": 9}, {"sum_logits": -10.673839569091797, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -18.979148864746094, "logits_per_token": -3.557946523030599, "logits_per_char": -1.0673839569091796, "num_chars": 10}, {"sum_logits": -9.763638496398926, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -38.9086799621582, "logits_per_token": -1.952727699279785, "logits_per_char": -0.33667718953099746, "num_chars": 29}, {"sum_logits": -13.958459854125977, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.56096076965332, "logits_per_token": -2.3264099756876626, "logits_per_char": -0.4229836319432114, "num_chars": 33}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 955, "native_id": "Mercury_175560", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.050218343734741, "incorrect_loss_raw": 3.5170370737711587, "correct_loss_per_char": 0.29288833481924875, "incorrect_loss_per_char": 0.5861728456285265, "correct_loss_per_token": 2.050218343734741, "incorrect_loss_per_token": 3.5170370737711587, "correct_loss_uncond": -10.395063638687134, "incorrect_loss_uncond": -7.8136240641276045}, "model_output": [{"sum_logits": -1.8685202598571777, "num_tokens": 1, "num_tokens_all": 175, "is_greedy": true, "sum_logits_uncond": -12.206640243530273, "logits_per_token": -1.8685202598571777, "logits_per_char": -0.3114200433095296, "num_chars": 6}, {"sum_logits": -5.571109771728516, "num_tokens": 1, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -11.89634895324707, "logits_per_token": -5.571109771728516, "logits_per_char": -0.9285182952880859, "num_chars": 6}, {"sum_logits": -2.050218343734741, "num_tokens": 1, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -12.445281982421875, "logits_per_token": -2.050218343734741, "logits_per_char": -0.29288833481924875, "num_chars": 7}, {"sum_logits": -3.111481189727783, "num_tokens": 1, "num_tokens_all": 175, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -3.111481189727783, "logits_per_char": -0.5185801982879639, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 956, "native_id": "Mercury_7005005", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.270509719848633, "incorrect_loss_raw": 19.594382603963215, "correct_loss_per_char": 0.4147034287452698, "incorrect_loss_per_char": 0.6392548598495185, "correct_loss_per_token": 2.6541019439697267, "incorrect_loss_per_token": 2.910155268068667, "correct_loss_uncond": -14.70100212097168, "incorrect_loss_uncond": -9.953319549560547}, "model_output": [{"sum_logits": -16.03022003173828, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -26.373510360717773, "logits_per_token": -3.206044006347656, "logits_per_char": -0.5725078582763672, "num_chars": 28}, {"sum_logits": -13.933736801147461, "num_tokens": 6, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -25.690746307373047, "logits_per_token": -2.32228946685791, "logits_per_char": -0.4976334571838379, "num_chars": 28}, {"sum_logits": -13.270509719848633, "num_tokens": 5, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -27.971511840820312, "logits_per_token": -2.6541019439697267, "logits_per_char": -0.4147034287452698, "num_chars": 32}, {"sum_logits": -28.819190979003906, "num_tokens": 9, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -36.57884979248047, "logits_per_token": -3.202132331000434, "logits_per_char": -0.8476232640883502, "num_chars": 34}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 957, "native_id": "Mercury_183890", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.469695568084717, "incorrect_loss_raw": 15.109336217244467, "correct_loss_per_char": 0.5745919667757474, "incorrect_loss_per_char": 1.2155273800804502, "correct_loss_per_token": 1.8674238920211792, "incorrect_loss_per_token": 5.759427176581489, "correct_loss_uncond": -9.836023807525635, "incorrect_loss_uncond": -6.148176829020183}, "model_output": [{"sum_logits": -9.693660736083984, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -22.302499771118164, "logits_per_token": -1.615610122680664, "logits_per_char": -0.4616028921944754, "num_chars": 21}, {"sum_logits": -12.92701530456543, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.00395965576172, "logits_per_token": -4.30900510152181, "logits_per_char": -1.292701530456543, "num_chars": 10}, {"sum_logits": -22.707332611083984, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -25.466079711914062, "logits_per_token": -11.353666305541992, "logits_per_char": -1.892277717590332, "num_chars": 12}, {"sum_logits": -7.469695568084717, "num_tokens": 4, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -17.30571937561035, "logits_per_token": -1.8674238920211792, "logits_per_char": -0.5745919667757474, "num_chars": 13}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 958, "native_id": "Mercury_7270358", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 9.740849494934082, "incorrect_loss_raw": 12.578847885131836, "correct_loss_per_char": 0.512676289207057, "incorrect_loss_per_char": 0.5532101443708605, "correct_loss_per_token": 1.9481698989868164, "incorrect_loss_per_token": 2.2616534974839952, "correct_loss_uncond": -18.776206016540527, "incorrect_loss_uncond": -16.645305633544922}, "model_output": [{"sum_logits": -12.328692436218262, "num_tokens": 6, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -27.99183464050293, "logits_per_token": -2.0547820727030435, "logits_per_char": -0.5136955181757609, "num_chars": 24}, {"sum_logits": -14.866096496582031, "num_tokens": 5, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -30.678672790527344, "logits_per_token": -2.973219299316406, "logits_per_char": -0.7824261313990543, "num_chars": 19}, {"sum_logits": -10.541754722595215, "num_tokens": 6, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -29.001953125, "logits_per_token": -1.7569591204325359, "logits_per_char": -0.36350878353776606, "num_chars": 29}, {"sum_logits": -9.740849494934082, "num_tokens": 5, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -28.51705551147461, "logits_per_token": -1.9481698989868164, "logits_per_char": -0.512676289207057, "num_chars": 19}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 959, "native_id": "MCAS_2013_5_29411", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 2.8729121685028076, "incorrect_loss_raw": 3.615630547205607, "correct_loss_per_char": 0.47881869475046795, "incorrect_loss_per_char": 0.7005170702934266, "correct_loss_per_token": 1.4364560842514038, "incorrect_loss_per_token": 3.0965816974639893, "correct_loss_uncond": -13.383374452590942, "incorrect_loss_uncond": -10.658480882644653}, "model_output": [{"sum_logits": -5.453484535217285, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -13.144001007080078, "logits_per_token": -5.453484535217285, "logits_per_char": -0.9089140892028809, "num_chars": 6}, {"sum_logits": -2.279114007949829, "num_tokens": 1, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -12.058414459228516, "logits_per_token": -2.279114007949829, "logits_per_char": -0.5697785019874573, "num_chars": 4}, {"sum_logits": -2.8729121685028076, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -16.25628662109375, "logits_per_token": -1.4364560842514038, "logits_per_char": -0.47881869475046795, "num_chars": 6}, {"sum_logits": -3.114293098449707, "num_tokens": 2, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -17.619918823242188, "logits_per_token": -1.5571465492248535, "logits_per_char": -0.6228586196899414, "num_chars": 5}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 960, "native_id": "ACTAAP_2007_7_31", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 42.50861358642578, "incorrect_loss_raw": 33.990194956461586, "correct_loss_per_char": 0.488604753866963, "incorrect_loss_per_char": 0.4822670998394943, "correct_loss_per_token": 2.2372954519171464, "incorrect_loss_per_token": 2.267719369946104, "correct_loss_uncond": -16.525413513183594, "incorrect_loss_uncond": -16.43438975016276}, "model_output": [{"sum_logits": -25.47308921813965, "num_tokens": 11, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -39.44807815551758, "logits_per_token": -2.3157353834672407, "logits_per_char": -0.4806243248705594, "num_chars": 53}, {"sum_logits": -26.67096519470215, "num_tokens": 12, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -39.101036071777344, "logits_per_token": -2.2225804328918457, "logits_per_char": -0.46791167008249385, "num_chars": 57}, {"sum_logits": -42.50861358642578, "num_tokens": 19, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -59.034027099609375, "logits_per_token": -2.2372954519171464, "logits_per_char": -0.488604753866963, "num_chars": 87}, {"sum_logits": -49.82653045654297, "num_tokens": 22, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -72.72463989257812, "logits_per_token": -2.264842293479226, "logits_per_char": -0.49826530456542967, "num_chars": 100}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 961, "native_id": "Mercury_7082023", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 2.8193371295928955, "incorrect_loss_raw": 7.190762758255005, "correct_loss_per_char": 0.25630337541753595, "incorrect_loss_per_char": 0.6311290735726828, "correct_loss_per_token": 2.8193371295928955, "incorrect_loss_per_token": 5.466338396072388, "correct_loss_uncond": -12.725411176681519, "incorrect_loss_uncond": -9.183873573939005}, "model_output": [{"sum_logits": -2.7189767360687256, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -16.18455696105957, "logits_per_token": -2.7189767360687256, "logits_per_char": -0.2091520566206712, "num_chars": 13}, {"sum_logits": -2.8193371295928955, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -15.544748306274414, "logits_per_token": -2.8193371295928955, "logits_per_char": -0.25630337541753595, "num_chars": 11}, {"sum_logits": -10.346546173095703, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -20.33608055114746, "logits_per_token": -5.173273086547852, "logits_per_char": -0.7390390123639788, "num_chars": 14}, {"sum_logits": -8.506765365600586, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -12.603271484375, "logits_per_token": -8.506765365600586, "logits_per_char": -0.9451961517333984, "num_chars": 9}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 962, "native_id": "MCAS_2003_8_21", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.972187995910645, "incorrect_loss_raw": 9.398490905761719, "correct_loss_per_char": 1.1215234994888306, "incorrect_loss_per_char": 0.7902033704541105, "correct_loss_per_token": 2.243046998977661, "incorrect_loss_per_token": 3.1328303019205728, "correct_loss_uncond": -8.900067329406738, "incorrect_loss_uncond": -9.959073384602865}, "model_output": [{"sum_logits": -11.013065338134766, "num_tokens": 3, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -19.22406578063965, "logits_per_token": -3.6710217793782554, "logits_per_char": -0.7866475241524833, "num_chars": 14}, {"sum_logits": -8.972187995910645, "num_tokens": 4, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -17.872255325317383, "logits_per_token": -2.243046998977661, "logits_per_char": -1.1215234994888306, "num_chars": 8}, {"sum_logits": -11.363687515258789, "num_tokens": 3, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -20.767419815063477, "logits_per_token": -3.787895838419596, "logits_per_char": -1.136368751525879, "num_chars": 10}, {"sum_logits": -5.818719863891602, "num_tokens": 3, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -18.081207275390625, "logits_per_token": -1.9395732879638672, "logits_per_char": -0.4475938356839694, "num_chars": 13}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 963, "native_id": "NYSEDREGENTS_2015_8_9", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 13.205926895141602, "incorrect_loss_raw": 11.375710805257162, "correct_loss_per_char": 0.3773121970040458, "incorrect_loss_per_char": 0.5100052914012033, "correct_loss_per_token": 2.6411853790283204, "incorrect_loss_per_token": 3.974888722101847, "correct_loss_uncond": -19.064741134643555, "incorrect_loss_uncond": -14.67621103922526}, "model_output": [{"sum_logits": -8.875611305236816, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -24.44145965576172, "logits_per_token": -2.9585371017456055, "logits_per_char": -0.38589614370594855, "num_chars": 23}, {"sum_logits": -13.205926895141602, "num_tokens": 5, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -32.270668029785156, "logits_per_token": -2.6411853790283204, "logits_per_char": -0.3773121970040458, "num_chars": 35}, {"sum_logits": -14.63852596282959, "num_tokens": 4, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -28.344608306884766, "logits_per_token": -3.6596314907073975, "logits_per_char": -0.5855410385131836, "num_chars": 25}, {"sum_logits": -10.612995147705078, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -25.36969757080078, "logits_per_token": -5.306497573852539, "logits_per_char": -0.5585786919844778, "num_chars": 19}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 964, "native_id": "Mercury_7064750", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.018966674804688, "incorrect_loss_raw": 24.554133733113606, "correct_loss_per_char": 0.5686128789728339, "incorrect_loss_per_char": 0.5387055978890558, "correct_loss_per_token": 3.127370834350586, "incorrect_loss_per_token": 3.11291303079595, "correct_loss_uncond": -17.453224182128906, "incorrect_loss_uncond": -18.505238215128582}, "model_output": [{"sum_logits": -27.09967041015625, "num_tokens": 8, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -41.85930633544922, "logits_per_token": -3.3874588012695312, "logits_per_char": -0.6452302478608631, "num_chars": 42}, {"sum_logits": -24.495771408081055, "num_tokens": 7, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -46.3144416809082, "logits_per_token": -3.4993959154401506, "logits_per_char": -0.5696691025135129, "num_chars": 43}, {"sum_logits": -22.066959381103516, "num_tokens": 9, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -41.00436782836914, "logits_per_token": -2.4518843756781683, "logits_per_char": -0.4012174432927912, "num_chars": 55}, {"sum_logits": -25.018966674804688, "num_tokens": 8, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -42.472190856933594, "logits_per_token": -3.127370834350586, "logits_per_char": -0.5686128789728339, "num_chars": 44}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 965, "native_id": "TIMSS_2007_8_pg113", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 4.882721424102783, "incorrect_loss_raw": 6.481823126475017, "correct_loss_per_char": 0.6975316320146833, "incorrect_loss_per_char": 0.8133841407247436, "correct_loss_per_token": 4.882721424102783, "incorrect_loss_per_token": 6.481823126475017, "correct_loss_uncond": -8.430375576019287, "incorrect_loss_uncond": -7.304152647654216}, "model_output": [{"sum_logits": -5.728456974029541, "num_tokens": 1, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -13.52247428894043, "logits_per_token": -5.728456974029541, "logits_per_char": -0.8183509962899345, "num_chars": 7}, {"sum_logits": -7.214905738830566, "num_tokens": 1, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -14.705488204956055, "logits_per_token": -7.214905738830566, "logits_per_char": -1.0307008198329382, "num_chars": 7}, {"sum_logits": -4.882721424102783, "num_tokens": 1, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -13.31309700012207, "logits_per_token": -4.882721424102783, "logits_per_char": -0.6975316320146833, "num_chars": 7}, {"sum_logits": -6.502106666564941, "num_tokens": 1, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -13.129964828491211, "logits_per_token": -6.502106666564941, "logits_per_char": -0.5911006060513583, "num_chars": 11}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 966, "native_id": "Mercury_7173583", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 37.264339447021484, "incorrect_loss_raw": 22.45282491048177, "correct_loss_per_char": 0.7452867889404297, "incorrect_loss_per_char": 0.5943789197138263, "correct_loss_per_token": 2.8664876497708836, "incorrect_loss_per_token": 2.2742428602995695, "correct_loss_uncond": -5.163921356201172, "incorrect_loss_uncond": -7.965726852416992}, "model_output": [{"sum_logits": -19.64879608154297, "num_tokens": 9, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -29.08100700378418, "logits_per_token": -2.1831995646158853, "logits_per_char": -0.5954180630770597, "num_chars": 33}, {"sum_logits": -23.89400863647461, "num_tokens": 9, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -31.216087341308594, "logits_per_token": -2.654889848497179, "logits_per_char": -0.7466877698898315, "num_chars": 32}, {"sum_logits": -37.264339447021484, "num_tokens": 13, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -42.428260803222656, "logits_per_token": -2.8664876497708836, "logits_per_char": -0.7452867889404297, "num_chars": 50}, {"sum_logits": -23.815670013427734, "num_tokens": 12, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -30.958560943603516, "logits_per_token": -1.9846391677856445, "logits_per_char": -0.44103092617458767, "num_chars": 54}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 967, "native_id": "Mercury_403930", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.7327938079834, "incorrect_loss_raw": 14.520442962646484, "correct_loss_per_char": 0.6820305310762845, "incorrect_loss_per_char": 0.5458326268351801, "correct_loss_per_token": 2.9554656346639, "incorrect_loss_per_token": 2.0314952676946465, "correct_loss_uncond": -10.304418563842773, "incorrect_loss_uncond": -13.36892318725586}, "model_output": [{"sum_logits": -14.557890892028809, "num_tokens": 5, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -28.358177185058594, "logits_per_token": -2.9115781784057617, "logits_per_char": -0.6932328996204195, "num_chars": 21}, {"sum_logits": -16.022789001464844, "num_tokens": 8, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -26.954946517944336, "logits_per_token": -2.0028486251831055, "logits_per_char": -0.5934366296838831, "num_chars": 27}, {"sum_logits": -17.7327938079834, "num_tokens": 6, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -28.037212371826172, "logits_per_token": -2.9554656346639, "logits_per_char": -0.6820305310762845, "num_chars": 26}, {"sum_logits": -12.9806489944458, "num_tokens": 11, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -28.3549747467041, "logits_per_token": -1.1800589994950728, "logits_per_char": -0.35082835120123784, "num_chars": 37}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 968, "native_id": "Mercury_417118", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.097838401794434, "incorrect_loss_raw": 12.24540646870931, "correct_loss_per_char": 0.3955581913823667, "incorrect_loss_per_char": 0.53336586845408, "correct_loss_per_token": 3.0326128005981445, "incorrect_loss_per_token": 4.081802156236437, "correct_loss_uncond": -15.539341926574707, "incorrect_loss_uncond": -9.155621210734049}, "model_output": [{"sum_logits": -13.787378311157227, "num_tokens": 3, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -21.59635353088379, "logits_per_token": -4.595792770385742, "logits_per_char": -0.5994512309198794, "num_chars": 23}, {"sum_logits": -11.215450286865234, "num_tokens": 3, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -20.96670150756836, "logits_per_token": -3.738483428955078, "logits_per_char": -0.46731042861938477, "num_chars": 24}, {"sum_logits": -9.097838401794434, "num_tokens": 3, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -24.63718032836914, "logits_per_token": -3.0326128005981445, "logits_per_char": -0.3955581913823667, "num_chars": 23}, {"sum_logits": -11.733390808105469, "num_tokens": 3, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -21.64002799987793, "logits_per_token": -3.9111302693684897, "logits_per_char": -0.5333359458229758, "num_chars": 22}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 969, "native_id": "Mercury_7143010", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.366920471191406, "incorrect_loss_raw": 21.939856847127277, "correct_loss_per_char": 0.5765209197998047, "incorrect_loss_per_char": 0.54052804046207, "correct_loss_per_token": 3.623845781598772, "incorrect_loss_per_token": 3.047708125341506, "correct_loss_uncond": -13.564380645751953, "incorrect_loss_uncond": -10.46135393778483}, "model_output": [{"sum_logits": -24.527881622314453, "num_tokens": 7, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -39.29209899902344, "logits_per_token": -3.5039830889020647, "logits_per_char": -0.5450640360514323, "num_chars": 45}, {"sum_logits": -25.366920471191406, "num_tokens": 7, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -38.93130111694336, "logits_per_token": -3.623845781598772, "logits_per_char": -0.5765209197998047, "num_chars": 44}, {"sum_logits": -26.750089645385742, "num_tokens": 7, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -32.708404541015625, "logits_per_token": -3.8214413779122487, "logits_per_char": -0.6220951080322266, "num_chars": 43}, {"sum_logits": -14.54159927368164, "num_tokens": 8, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -25.203128814697266, "logits_per_token": -1.817699909210205, "logits_per_char": -0.45442497730255127, "num_chars": 32}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 970, "native_id": "Mercury_SC_401801", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 11.003268241882324, "incorrect_loss_raw": 21.1780522664388, "correct_loss_per_char": 0.5501634120941162, "incorrect_loss_per_char": 0.5716025946681301, "correct_loss_per_token": 3.6677560806274414, "incorrect_loss_per_token": 3.4028944060915993, "correct_loss_uncond": -13.500971794128418, "incorrect_loss_uncond": -10.485280354817709}, "model_output": [{"sum_logits": -11.003268241882324, "num_tokens": 3, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -24.504240036010742, "logits_per_token": -3.6677560806274414, "logits_per_char": -0.5501634120941162, "num_chars": 20}, {"sum_logits": -14.832500457763672, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -23.55223846435547, "logits_per_token": -3.708125114440918, "logits_per_char": -0.5114655330263335, "num_chars": 29}, {"sum_logits": -23.119659423828125, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -35.927528381347656, "logits_per_token": -3.3028084891183034, "logits_per_char": -0.6084120901007402, "num_chars": 38}, {"sum_logits": -25.58199691772461, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -35.510231018066406, "logits_per_token": -3.197749614715576, "logits_per_char": -0.5949301608773165, "num_chars": 43}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 971, "native_id": "Mercury_410334", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 19.024883270263672, "incorrect_loss_raw": 22.878103892008465, "correct_loss_per_char": 0.4756220817565918, "incorrect_loss_per_char": 0.5376970486443547, "correct_loss_per_token": 2.7178404671805247, "incorrect_loss_per_token": 2.7694032532828197, "correct_loss_uncond": -18.493637084960938, "incorrect_loss_uncond": -15.232205708821615}, "model_output": [{"sum_logits": -21.83945083618164, "num_tokens": 7, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -38.08688735961914, "logits_per_token": -3.119921548025949, "logits_per_char": -0.64233678929946, "num_chars": 34}, {"sum_logits": -19.024883270263672, "num_tokens": 7, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -37.51852035522461, "logits_per_token": -2.7178404671805247, "logits_per_char": -0.4756220817565918, "num_chars": 40}, {"sum_logits": -20.35208511352539, "num_tokens": 8, "num_tokens_all": 234, "is_greedy": false, "sum_logits_uncond": -36.5688591003418, "logits_per_token": -2.544010639190674, "logits_per_char": -0.452268558078342, "num_chars": 45}, {"sum_logits": -26.44277572631836, "num_tokens": 10, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -39.6751823425293, "logits_per_token": -2.644277572631836, "logits_per_char": -0.518485798555262, "num_chars": 51}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 972, "native_id": "NAEP_2000_4_S12+3", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.830228805541992, "incorrect_loss_raw": 4.984403769175212, "correct_loss_per_char": 1.7660457611083984, "incorrect_loss_per_char": 0.9968807538350424, "correct_loss_per_token": 8.830228805541992, "incorrect_loss_per_token": 3.9907498359680176, "correct_loss_uncond": -1.2439289093017578, "incorrect_loss_uncond": -5.917553106943767}, "model_output": [{"sum_logits": -5.961923599243164, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -10.95141887664795, "logits_per_token": -2.980961799621582, "logits_per_char": -1.1923847198486328, "num_chars": 5}, {"sum_logits": -8.830228805541992, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -10.07415771484375, "logits_per_token": -8.830228805541992, "logits_per_char": -1.7660457611083984, "num_chars": 5}, {"sum_logits": -4.891180992126465, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -10.796552658081055, "logits_per_token": -4.891180992126465, "logits_per_char": -0.978236198425293, "num_chars": 5}, {"sum_logits": -4.100106716156006, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -10.95789909362793, "logits_per_token": -4.100106716156006, "logits_per_char": -0.8200213432312011, "num_chars": 5}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 973, "native_id": "Mercury_7218015", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.269004821777344, "incorrect_loss_raw": 13.566046396891275, "correct_loss_per_char": 0.3160436879033628, "incorrect_loss_per_char": 0.5631842348310683, "correct_loss_per_token": 1.817251205444336, "incorrect_loss_per_token": 3.682875262366401, "correct_loss_uncond": -15.478439331054688, "incorrect_loss_uncond": -9.066698710123697}, "model_output": [{"sum_logits": -11.531211853027344, "num_tokens": 4, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -22.476337432861328, "logits_per_token": -2.882802963256836, "logits_per_char": -0.41182899475097656, "num_chars": 28}, {"sum_logits": -18.67783546447754, "num_tokens": 4, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -23.41185760498047, "logits_per_token": -4.669458866119385, "logits_per_char": -0.7782431443532308, "num_chars": 24}, {"sum_logits": -10.489091873168945, "num_tokens": 3, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -22.010040283203125, "logits_per_token": -3.496363957722982, "logits_per_char": -0.4994805653889974, "num_chars": 21}, {"sum_logits": -7.269004821777344, "num_tokens": 4, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -22.74744415283203, "logits_per_token": -1.817251205444336, "logits_per_char": -0.3160436879033628, "num_chars": 23}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 974, "native_id": "Mercury_7109603", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.832633972167969, "incorrect_loss_raw": 29.57140858968099, "correct_loss_per_char": 0.20055311817233845, "incorrect_loss_per_char": 0.5148690894797997, "correct_loss_per_token": 1.0756939974698154, "incorrect_loss_per_token": 2.7621238149777803, "correct_loss_uncond": -14.216226577758789, "incorrect_loss_uncond": -10.207090377807617}, "model_output": [{"sum_logits": -11.832633972167969, "num_tokens": 11, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -26.048860549926758, "logits_per_token": -1.0756939974698154, "logits_per_char": -0.20055311817233845, "num_chars": 59}, {"sum_logits": -13.56155014038086, "num_tokens": 12, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -25.463449478149414, "logits_per_token": -1.1301291783650715, "logits_per_char": -0.22602583567301432, "num_chars": 60}, {"sum_logits": -39.49278259277344, "num_tokens": 11, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -48.12860870361328, "logits_per_token": -3.5902529629794033, "logits_per_char": -0.6582130432128906, "num_chars": 60}, {"sum_logits": -35.65989303588867, "num_tokens": 10, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -45.743438720703125, "logits_per_token": -3.565989303588867, "logits_per_char": -0.660368389553494, "num_chars": 54}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 975, "native_id": "NYSEDREGENTS_2008_8_42", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.905046463012695, "incorrect_loss_raw": 17.356101989746094, "correct_loss_per_char": 0.2976261615753174, "incorrect_loss_per_char": 0.6979354135953266, "correct_loss_per_token": 1.488130807876587, "incorrect_loss_per_token": 3.621588706970215, "correct_loss_uncond": -23.191438674926758, "incorrect_loss_uncond": -16.527506510416668}, "model_output": [{"sum_logits": -15.228233337402344, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -32.11225509643555, "logits_per_token": -3.807058334350586, "logits_per_char": -0.8014859651264391, "num_chars": 19}, {"sum_logits": -19.62158966064453, "num_tokens": 4, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -33.85193634033203, "logits_per_token": -4.905397415161133, "logits_per_char": -0.8918904391202059, "num_chars": 22}, {"sum_logits": -11.905046463012695, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -35.09648513793945, "logits_per_token": -1.488130807876587, "logits_per_char": -0.2976261615753174, "num_chars": 40}, {"sum_logits": -17.218482971191406, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -35.6866340637207, "logits_per_token": -2.152310371398926, "logits_per_char": -0.400429836539335, "num_chars": 43}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 976, "native_id": "NAEP_2000_8_S11+11", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.464282989501953, "incorrect_loss_raw": 22.210306803385418, "correct_loss_per_char": 0.3419311664722584, "incorrect_loss_per_char": 0.3182142586197551, "correct_loss_per_token": 1.5386902491251628, "incorrect_loss_per_token": 1.467187279354281, "correct_loss_uncond": -20.33692169189453, "incorrect_loss_uncond": -19.33258056640625}, "model_output": [{"sum_logits": -22.103809356689453, "num_tokens": 12, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -40.14830780029297, "logits_per_token": -1.8419841130574544, "logits_per_char": -0.40932980290165655, "num_chars": 54}, {"sum_logits": -18.464282989501953, "num_tokens": 12, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -38.801204681396484, "logits_per_token": -1.5386902491251628, "logits_per_char": -0.3419311664722584, "num_chars": 54}, {"sum_logits": -18.646129608154297, "num_tokens": 13, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -35.588172912597656, "logits_per_token": -1.4343176621657152, "logits_per_char": -0.3056742558713819, "num_chars": 61}, {"sum_logits": -25.8809814453125, "num_tokens": 23, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -48.892181396484375, "logits_per_token": -1.1252600628396738, "logits_per_char": -0.23963871708622686, "num_chars": 108}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 977, "native_id": "Mercury_7271670", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.753028869628906, "incorrect_loss_raw": 19.918053309122723, "correct_loss_per_char": 0.5938257217407227, "incorrect_loss_per_char": 0.5731128491442202, "correct_loss_per_token": 3.3932898385184154, "incorrect_loss_per_token": 2.7383358705611456, "correct_loss_uncond": -10.870006561279297, "incorrect_loss_uncond": -13.74173672993978}, "model_output": [{"sum_logits": -17.116222381591797, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -32.2698974609375, "logits_per_token": -2.4451746259416853, "logits_per_char": -0.5521362058577999, "num_chars": 31}, {"sum_logits": -17.99285316467285, "num_tokens": 8, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -33.019527435302734, "logits_per_token": -2.2491066455841064, "logits_per_char": -0.5804146182152533, "num_chars": 31}, {"sum_logits": -24.645084381103516, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -35.689945220947266, "logits_per_token": -3.520726340157645, "logits_per_char": -0.5867877233596075, "num_chars": 42}, {"sum_logits": -23.753028869628906, "num_tokens": 7, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -34.6230354309082, "logits_per_token": -3.3932898385184154, "logits_per_char": -0.5938257217407227, "num_chars": 40}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 978, "native_id": "ACTAAP_2009_5_8", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.4632341861724854, "incorrect_loss_raw": 4.121363242467244, "correct_loss_per_char": 0.24632341861724855, "incorrect_loss_per_char": 0.38949635113988607, "correct_loss_per_token": 1.2316170930862427, "incorrect_loss_per_token": 3.2617250283559165, "correct_loss_uncond": -13.93905234336853, "incorrect_loss_uncond": -11.204638242721558}, "model_output": [{"sum_logits": -3.1778652667999268, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -15.608772277832031, "logits_per_token": -3.1778652667999268, "logits_per_char": -0.39723315834999084, "num_chars": 8}, {"sum_logits": -2.4632341861724854, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -16.402286529541016, "logits_per_token": -1.2316170930862427, "logits_per_char": -0.24632341861724855, "num_chars": 10}, {"sum_logits": -4.028395175933838, "num_tokens": 1, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -16.407533645629883, "logits_per_token": -4.028395175933838, "logits_per_char": -0.40283951759338377, "num_chars": 10}, {"sum_logits": -5.157829284667969, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -13.961698532104492, "logits_per_token": -2.5789146423339844, "logits_per_char": -0.3684163774762835, "num_chars": 14}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 979, "native_id": "NYSEDREGENTS_2012_4_1", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.46725082397461, "incorrect_loss_raw": 12.267269134521484, "correct_loss_per_char": 1.446725082397461, "incorrect_loss_per_char": 1.00482936227794, "correct_loss_per_token": 7.233625411987305, "incorrect_loss_per_token": 6.133634567260742, "correct_loss_uncond": -3.465839385986328, "incorrect_loss_uncond": -7.167594909667969}, "model_output": [{"sum_logits": -12.264266967773438, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -19.36490249633789, "logits_per_token": -6.132133483886719, "logits_per_char": -0.9434051513671875, "num_chars": 13}, {"sum_logits": -14.46725082397461, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -17.933090209960938, "logits_per_token": -7.233625411987305, "logits_per_char": -1.446725082397461, "num_chars": 10}, {"sum_logits": -13.125957489013672, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -22.099275588989258, "logits_per_token": -6.562978744506836, "logits_per_char": -1.1932688626376065, "num_chars": 11}, {"sum_logits": -11.411582946777344, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -16.84041404724121, "logits_per_token": -5.705791473388672, "logits_per_char": -0.8778140728290265, "num_chars": 13}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 980, "native_id": "Mercury_SC_409030", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.99094009399414, "incorrect_loss_raw": 17.09159978230794, "correct_loss_per_char": 1.262681057578639, "incorrect_loss_per_char": 1.156796819607095, "correct_loss_per_token": 5.997735023498535, "incorrect_loss_per_token": 6.92031733194987, "correct_loss_uncond": 3.106403350830078, "incorrect_loss_uncond": -0.3314231236775716}, "model_output": [{"sum_logits": -15.159133911132812, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -16.111740112304688, "logits_per_token": -7.579566955566406, "logits_per_char": -1.1660872239332933, "num_chars": 13}, {"sum_logits": -16.609874725341797, "num_tokens": 2, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -14.84533977508545, "logits_per_token": -8.304937362670898, "logits_per_char": -1.2776826711801381, "num_chars": 13}, {"sum_logits": -19.50579071044922, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -21.311988830566406, "logits_per_token": -4.876447677612305, "logits_per_char": -1.0266205637078536, "num_chars": 19}, {"sum_logits": -23.99094009399414, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -20.884536743164062, "logits_per_token": -5.997735023498535, "logits_per_char": -1.262681057578639, "num_chars": 19}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 981, "native_id": "MEA_2013_8_8", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.571561336517334, "incorrect_loss_raw": 3.6222667694091797, "correct_loss_per_char": 1.190520445505778, "incorrect_loss_per_char": 1.2074222564697266, "correct_loss_per_token": 3.571561336517334, "incorrect_loss_per_token": 3.6222667694091797, "correct_loss_uncond": -4.018347263336182, "incorrect_loss_uncond": -4.159836769104004}, "model_output": [{"sum_logits": -4.563753604888916, "num_tokens": 1, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -7.743196487426758, "logits_per_token": -4.563753604888916, "logits_per_char": -1.5212512016296387, "num_chars": 3}, {"sum_logits": -2.5948266983032227, "num_tokens": 1, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -7.763138294219971, "logits_per_token": -2.5948266983032227, "logits_per_char": -0.8649422327677408, "num_chars": 3}, {"sum_logits": -3.7082200050354004, "num_tokens": 1, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -7.839975833892822, "logits_per_token": -3.7082200050354004, "logits_per_char": -1.2360733350118, "num_chars": 3}, {"sum_logits": -3.571561336517334, "num_tokens": 1, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -7.589908599853516, "logits_per_token": -3.571561336517334, "logits_per_char": -1.190520445505778, "num_chars": 3}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 982, "native_id": "Mercury_7140333", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 29.982160568237305, "incorrect_loss_raw": 28.3336607615153, "correct_loss_per_char": 0.5169338029006432, "incorrect_loss_per_char": 0.5784569643545843, "correct_loss_per_token": 2.7256509607488457, "incorrect_loss_per_token": 3.0038292006611425, "correct_loss_uncond": -16.415098190307617, "incorrect_loss_uncond": -12.83070437113444}, "model_output": [{"sum_logits": -20.326967239379883, "num_tokens": 8, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -37.83885955810547, "logits_per_token": -2.5408709049224854, "logits_per_char": -0.44189059216043225, "num_chars": 46}, {"sum_logits": -29.26245880126953, "num_tokens": 9, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -39.70205307006836, "logits_per_token": -3.25138431125217, "logits_per_char": -0.5852491760253906, "num_chars": 50}, {"sum_logits": -35.411556243896484, "num_tokens": 11, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -45.95218276977539, "logits_per_token": -3.2192323858087715, "logits_per_char": -0.7082311248779297, "num_chars": 50}, {"sum_logits": -29.982160568237305, "num_tokens": 11, "num_tokens_all": 226, "is_greedy": false, "sum_logits_uncond": -46.39725875854492, "logits_per_token": -2.7256509607488457, "logits_per_char": -0.5169338029006432, "num_chars": 58}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 983, "native_id": "Mercury_SC_LBS10664", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.824991703033447, "incorrect_loss_raw": 5.201512336730957, "correct_loss_per_char": 0.7281239628791809, "incorrect_loss_per_char": 0.7085627853555024, "correct_loss_per_token": 5.824991703033447, "incorrect_loss_per_token": 5.201512336730957, "correct_loss_uncond": -6.768197536468506, "incorrect_loss_uncond": -6.333644549051921}, "model_output": [{"sum_logits": -6.29768705368042, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -12.27128791809082, "logits_per_token": -6.29768705368042, "logits_per_char": -0.6997430059644911, "num_chars": 9}, {"sum_logits": -5.25824499130249, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -12.445188522338867, "logits_per_token": -5.25824499130249, "logits_per_char": -0.7511778559003558, "num_chars": 7}, {"sum_logits": -5.824991703033447, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -12.593189239501953, "logits_per_token": -5.824991703033447, "logits_per_char": -0.7281239628791809, "num_chars": 8}, {"sum_logits": -4.048604965209961, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -4.048604965209961, "logits_per_char": -0.6747674942016602, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 984, "native_id": "Mercury_7171430", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 2.757258176803589, "incorrect_loss_raw": 5.26278813680013, "correct_loss_per_char": 0.39389402525765554, "incorrect_loss_per_char": 0.77284926838345, "correct_loss_per_token": 1.3786290884017944, "incorrect_loss_per_token": 2.631394068400065, "correct_loss_uncond": -14.450198411941528, "incorrect_loss_uncond": -11.45852533976237}, "model_output": [{"sum_logits": -7.508310317993164, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -18.426288604736328, "logits_per_token": -3.754155158996582, "logits_per_char": -0.9385387897491455, "num_chars": 8}, {"sum_logits": -2.757258176803589, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -17.207456588745117, "logits_per_token": -1.3786290884017944, "logits_per_char": -0.39389402525765554, "num_chars": 7}, {"sum_logits": -6.124814987182617, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -16.82785987854004, "logits_per_token": -3.0624074935913086, "logits_per_char": -1.0208024978637695, "num_chars": 6}, {"sum_logits": -2.1552391052246094, "num_tokens": 2, "num_tokens_all": 214, "is_greedy": true, "sum_logits_uncond": -14.909791946411133, "logits_per_token": -1.0776195526123047, "logits_per_char": -0.3592065175374349, "num_chars": 6}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 985, "native_id": "Mercury_SC_407572", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.051312446594238, "incorrect_loss_raw": 11.43609650929769, "correct_loss_per_char": 0.2971360262702493, "incorrect_loss_per_char": 0.472532087507702, "correct_loss_per_token": 1.6837708155314128, "incorrect_loss_per_token": 3.165843645731608, "correct_loss_uncond": -18.063244819641113, "incorrect_loss_uncond": -8.390254020690918}, "model_output": [{"sum_logits": -15.211080551147461, "num_tokens": 4, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -18.984785079956055, "logits_per_token": -3.8027701377868652, "logits_per_char": -0.543252876826695, "num_chars": 28}, {"sum_logits": -8.051706314086914, "num_tokens": 4, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -22.692081451416016, "logits_per_token": -2.0129265785217285, "logits_per_char": -0.32206825256347654, "num_chars": 25}, {"sum_logits": -11.045502662658691, "num_tokens": 3, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -17.80218505859375, "logits_per_token": -3.6818342208862305, "logits_per_char": -0.5522751331329345, "num_chars": 20}, {"sum_logits": -5.051312446594238, "num_tokens": 3, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -23.11455726623535, "logits_per_token": -1.6837708155314128, "logits_per_char": -0.2971360262702493, "num_chars": 17}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 986, "native_id": "VASoL_2009_3_2", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.117359161376953, "incorrect_loss_raw": 15.843960762023926, "correct_loss_per_char": 1.162873781644381, "incorrect_loss_per_char": 1.2424251320015671, "correct_loss_per_token": 5.039119720458984, "incorrect_loss_per_token": 5.281320254007976, "correct_loss_uncond": -2.9465160369873047, "incorrect_loss_uncond": -3.637836774190267}, "model_output": [{"sum_logits": -11.07237434387207, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -15.525430679321289, "logits_per_token": -3.690791447957357, "logits_per_char": -0.9226978619893392, "num_chars": 12}, {"sum_logits": -20.698396682739258, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -24.125425338745117, "logits_per_token": -6.899465560913086, "logits_per_char": -1.5921843602107122, "num_chars": 13}, {"sum_logits": -15.76111125946045, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -18.794536590576172, "logits_per_token": -5.253703753153483, "logits_per_char": -1.2123931738046498, "num_chars": 13}, {"sum_logits": -15.117359161376953, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -18.063875198364258, "logits_per_token": -5.039119720458984, "logits_per_char": -1.162873781644381, "num_chars": 13}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 987, "native_id": "Mercury_SC_407383", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.9007568359375, "incorrect_loss_raw": 24.620627721150715, "correct_loss_per_char": 0.8436091496394231, "incorrect_loss_per_char": 0.5909340756038522, "correct_loss_per_token": 3.6556396484375, "incorrect_loss_per_token": 2.238238883740974, "correct_loss_uncond": -7.538749694824219, "incorrect_loss_uncond": -8.027624130249023}, "model_output": [{"sum_logits": -20.65873146057129, "num_tokens": 11, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -32.23764419555664, "logits_per_token": -1.8780664964155718, "logits_per_char": -0.5738536516825358, "num_chars": 36}, {"sum_logits": -26.764728546142578, "num_tokens": 11, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -29.74938201904297, "logits_per_token": -2.433157140558416, "logits_per_char": -0.5694623094923953, "num_chars": 47}, {"sum_logits": -26.43842315673828, "num_tokens": 11, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -35.95772933959961, "logits_per_token": -2.4034930142489346, "logits_per_char": -0.6294862656366258, "num_chars": 42}, {"sum_logits": -32.9007568359375, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -40.43950653076172, "logits_per_token": -3.6556396484375, "logits_per_char": -0.8436091496394231, "num_chars": 39}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 988, "native_id": "Mercury_7218400", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 29.682950973510742, "incorrect_loss_raw": 21.178682963053387, "correct_loss_per_char": 0.5207535258510656, "incorrect_loss_per_char": 0.5631006192990101, "correct_loss_per_token": 3.298105663723416, "incorrect_loss_per_token": 3.01084344651964, "correct_loss_uncond": -20.181787490844727, "incorrect_loss_uncond": -10.878657658894857}, "model_output": [{"sum_logits": -23.3521671295166, "num_tokens": 10, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -39.232688903808594, "logits_per_token": -2.3352167129516603, "logits_per_char": -0.3766478569276871, "num_chars": 62}, {"sum_logits": -29.682950973510742, "num_tokens": 9, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -49.86473846435547, "logits_per_token": -3.298105663723416, "logits_per_char": -0.5207535258510656, "num_chars": 57}, {"sum_logits": -24.932113647460938, "num_tokens": 6, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -27.105987548828125, "logits_per_token": -4.155352274576823, "logits_per_char": -0.804261730563256, "num_chars": 31}, {"sum_logits": -15.251768112182617, "num_tokens": 6, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -29.833345413208008, "logits_per_token": -2.541961352030436, "logits_per_char": -0.5083922704060873, "num_chars": 30}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 989, "native_id": "Mercury_184818", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.614555358886719, "incorrect_loss_raw": 13.276564598083496, "correct_loss_per_char": 0.7307277679443359, "incorrect_loss_per_char": 0.737351120536637, "correct_loss_per_token": 3.6536388397216797, "incorrect_loss_per_token": 3.319141149520874, "correct_loss_uncond": -12.578622817993164, "incorrect_loss_uncond": -13.259151140848795}, "model_output": [{"sum_logits": -11.807119369506836, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -26.599796295166016, "logits_per_token": -2.951779842376709, "logits_per_char": -0.6945364335004021, "num_chars": 17}, {"sum_logits": -13.438124656677246, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -25.947914123535156, "logits_per_token": -3.3595311641693115, "logits_per_char": -0.7072697187724867, "num_chars": 19}, {"sum_logits": -14.584449768066406, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -27.059436798095703, "logits_per_token": -3.6461124420166016, "logits_per_char": -0.8102472093370225, "num_chars": 18}, {"sum_logits": -14.614555358886719, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -27.193178176879883, "logits_per_token": -3.6536388397216797, "logits_per_char": -0.7307277679443359, "num_chars": 20}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 990, "native_id": "Mercury_SC_405931", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 29.78791618347168, "incorrect_loss_raw": 29.954846064249676, "correct_loss_per_char": 0.5415984760631215, "incorrect_loss_per_char": 0.6361715158121631, "correct_loss_per_token": 3.309768464830187, "incorrect_loss_per_token": 3.735433945580134, "correct_loss_uncond": -7.44883918762207, "incorrect_loss_uncond": -9.203926722208658}, "model_output": [{"sum_logits": -25.726200103759766, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -33.7215576171875, "logits_per_token": -3.675171443394252, "logits_per_char": -0.5846863659945402, "num_chars": 44}, {"sum_logits": -29.13468360900879, "num_tokens": 8, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -40.99658203125, "logits_per_token": -3.6418354511260986, "logits_per_char": -0.5945853797756896, "num_chars": 49}, {"sum_logits": -35.00365447998047, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -42.7581787109375, "logits_per_token": -3.8892949422200522, "logits_per_char": -0.7292428016662598, "num_chars": 48}, {"sum_logits": -29.78791618347168, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -37.23675537109375, "logits_per_token": -3.309768464830187, "logits_per_char": -0.5415984760631215, "num_chars": 55}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 991, "native_id": "Mercury_SC_416177", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 11.959205627441406, "incorrect_loss_raw": 16.71915054321289, "correct_loss_per_char": 0.47836822509765625, "incorrect_loss_per_char": 0.6164104387955965, "correct_loss_per_token": 1.9932009379069011, "incorrect_loss_per_token": 3.0695610470241967, "correct_loss_uncond": -21.71230697631836, "incorrect_loss_uncond": -14.23561159769694}, "model_output": [{"sum_logits": -24.684215545654297, "num_tokens": 6, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -34.74755859375, "logits_per_token": -4.114035924275716, "logits_per_char": -0.8228071848551433, "num_chars": 30}, {"sum_logits": -12.591997146606445, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -29.98012924194336, "logits_per_token": -2.5183994293212892, "logits_per_char": -0.4663702646891276, "num_chars": 27}, {"sum_logits": -11.959205627441406, "num_tokens": 6, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -33.671512603759766, "logits_per_token": -1.9932009379069011, "logits_per_char": -0.47836822509765625, "num_chars": 25}, {"sum_logits": -12.88123893737793, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -28.136598587036133, "logits_per_token": -2.576247787475586, "logits_per_char": -0.5600538668425187, "num_chars": 23}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 992, "native_id": "Mercury_SC_406625", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.724594116210938, "incorrect_loss_raw": 20.153109868367512, "correct_loss_per_char": 0.7908198038736979, "incorrect_loss_per_char": 0.8208267309345724, "correct_loss_per_token": 3.389227730887277, "incorrect_loss_per_token": 3.593101183573405, "correct_loss_uncond": -4.606868743896484, "incorrect_loss_uncond": -7.5083974202473955}, "model_output": [{"sum_logits": -16.540584564208984, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -29.56511688232422, "logits_per_token": -2.7567640940348306, "logits_per_char": -0.5907351630074638, "num_chars": 28}, {"sum_logits": -23.724594116210938, "num_tokens": 7, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -28.331462860107422, "logits_per_token": -3.389227730887277, "logits_per_char": -0.7908198038736979, "num_chars": 30}, {"sum_logits": -22.836286544799805, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -28.39010238647461, "logits_per_token": -3.8060477574666343, "logits_per_char": -0.9134514617919922, "num_chars": 25}, {"sum_logits": -21.08245849609375, "num_tokens": 5, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -25.0293025970459, "logits_per_token": -4.21649169921875, "logits_per_char": -0.9582935680042614, "num_chars": 22}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 993, "native_id": "MCAS_2014_8_16", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.854262351989746, "incorrect_loss_raw": 13.206568400065104, "correct_loss_per_char": 0.16934660502842494, "incorrect_loss_per_char": 0.18866526285807292, "correct_loss_per_token": 1.0776602138172497, "incorrect_loss_per_token": 1.2005971272786458, "correct_loss_uncond": -25.29335880279541, "incorrect_loss_uncond": -23.812278747558594}, "model_output": [{"sum_logits": -13.953460693359375, "num_tokens": 11, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -36.876426696777344, "logits_per_token": -1.2684964266690342, "logits_per_char": -0.19933515276227678, "num_chars": 70}, {"sum_logits": -11.885961532592773, "num_tokens": 11, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -36.171356201171875, "logits_per_token": -1.0805419575084338, "logits_per_char": -0.16979945046561104, "num_chars": 70}, {"sum_logits": -13.780282974243164, "num_tokens": 11, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -38.008758544921875, "logits_per_token": -1.2527529976584695, "logits_per_char": -0.1968611853463309, "num_chars": 70}, {"sum_logits": -11.854262351989746, "num_tokens": 11, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -37.147621154785156, "logits_per_token": -1.0776602138172497, "logits_per_char": -0.16934660502842494, "num_chars": 70}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 994, "native_id": "Mercury_7138460", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 4.114333152770996, "incorrect_loss_raw": 4.259639898935954, "correct_loss_per_char": 0.5142916440963745, "incorrect_loss_per_char": 0.45131961544756133, "correct_loss_per_token": 4.114333152770996, "incorrect_loss_per_token": 4.259639898935954, "correct_loss_uncond": -10.186675071716309, "incorrect_loss_uncond": -8.651297092437744}, "model_output": [{"sum_logits": -2.757215976715088, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -12.414356231689453, "logits_per_token": -2.757215976715088, "logits_per_char": -0.30635733074612087, "num_chars": 9}, {"sum_logits": -6.016602516174316, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -13.054412841796875, "logits_per_token": -6.016602516174316, "logits_per_char": -0.546963865106756, "num_chars": 11}, {"sum_logits": -4.114333152770996, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -14.301008224487305, "logits_per_token": -4.114333152770996, "logits_per_char": -0.5142916440963745, "num_chars": 8}, {"sum_logits": -4.005101203918457, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -13.264041900634766, "logits_per_token": -4.005101203918457, "logits_per_char": -0.5006376504898071, "num_chars": 8}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 995, "native_id": "Mercury_7129640", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.002357482910156, "incorrect_loss_raw": 19.377167065938313, "correct_loss_per_char": 0.538521986741286, "incorrect_loss_per_char": 0.4870872813887912, "correct_loss_per_token": 3.0003367832728793, "incorrect_loss_per_token": 2.4977936769919418, "correct_loss_uncond": -10.691463470458984, "incorrect_loss_uncond": -13.546381314595541}, "model_output": [{"sum_logits": -25.55025291442871, "num_tokens": 9, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -42.83616638183594, "logits_per_token": -2.838916990492079, "logits_per_char": -0.5806875662370161, "num_chars": 44}, {"sum_logits": -21.002357482910156, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -31.69382095336914, "logits_per_token": -3.0003367832728793, "logits_per_char": -0.538521986741286, "num_chars": 39}, {"sum_logits": -14.766314506530762, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -28.023332595825195, "logits_per_token": -2.109473500932966, "logits_per_char": -0.39908958125758814, "num_chars": 37}, {"sum_logits": -17.81493377685547, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -27.91114616394043, "logits_per_token": -2.5449905395507812, "logits_per_char": -0.4814846966717694, "num_chars": 37}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 996, "native_id": "Mercury_7024290", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.904876708984375, "incorrect_loss_raw": 3.2160131136576333, "correct_loss_per_char": 1.1131095886230469, "incorrect_loss_per_char": 0.40910938177159223, "correct_loss_per_token": 8.904876708984375, "incorrect_loss_per_token": 3.2160131136576333, "correct_loss_uncond": -8.041126251220703, "incorrect_loss_uncond": -11.046230792999268}, "model_output": [{"sum_logits": -3.6045477390289307, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -14.513389587402344, "logits_per_token": -3.6045477390289307, "logits_per_char": -0.5149353912898472, "num_chars": 7}, {"sum_logits": -8.904876708984375, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -16.946002960205078, "logits_per_token": -8.904876708984375, "logits_per_char": -1.1131095886230469, "num_chars": 8}, {"sum_logits": -2.944345474243164, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.608772277832031, "logits_per_token": -2.944345474243164, "logits_per_char": -0.3680431842803955, "num_chars": 8}, {"sum_logits": -3.0991461277008057, "num_tokens": 1, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -12.664569854736328, "logits_per_token": -3.0991461277008057, "logits_per_char": -0.34434956974453396, "num_chars": 9}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 997, "native_id": "NYSEDREGENTS_2008_4_28", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 27.05853271484375, "incorrect_loss_raw": 26.602729161580402, "correct_loss_per_char": 0.6764633178710937, "incorrect_loss_per_char": 0.756515194373085, "correct_loss_per_token": 2.705853271484375, "incorrect_loss_per_token": 2.830843398868035, "correct_loss_uncond": -10.726570129394531, "incorrect_loss_uncond": -5.081006368001302}, "model_output": [{"sum_logits": -20.60900115966797, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -26.209596633911133, "logits_per_token": -2.576125144958496, "logits_per_char": -0.6061470929314109, "num_chars": 34}, {"sum_logits": -32.73347473144531, "num_tokens": 11, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -35.784976959228516, "logits_per_token": -2.975770430131392, "logits_per_char": -0.8614072297748766, "num_chars": 38}, {"sum_logits": -27.05853271484375, "num_tokens": 10, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -37.78510284423828, "logits_per_token": -2.705853271484375, "logits_per_char": -0.6764633178710937, "num_chars": 40}, {"sum_logits": -26.46571159362793, "num_tokens": 9, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -33.05663299560547, "logits_per_token": -2.9406346215142145, "logits_per_char": -0.8019912604129675, "num_chars": 33}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 998, "native_id": "Mercury_SC_414339", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 13.034698486328125, "incorrect_loss_raw": 15.159403800964355, "correct_loss_per_char": 0.28965996636284724, "incorrect_loss_per_char": 0.32740923939397526, "correct_loss_per_token": 1.1849725896661931, "incorrect_loss_per_token": 1.378127618269487, "correct_loss_uncond": -16.44208526611328, "incorrect_loss_uncond": -16.476045926411945}, "model_output": [{"sum_logits": -15.848833084106445, "num_tokens": 11, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -30.725749969482422, "logits_per_token": -1.4408030076460405, "logits_per_char": -0.33720921455545627, "num_chars": 47}, {"sum_logits": -14.18332290649414, "num_tokens": 11, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -31.175121307373047, "logits_per_token": -1.2893929914994673, "logits_per_char": -0.30177282779774767, "num_chars": 47}, {"sum_logits": -13.034698486328125, "num_tokens": 11, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -29.476783752441406, "logits_per_token": -1.1849725896661931, "logits_per_char": -0.28965996636284724, "num_chars": 45}, {"sum_logits": -15.44605541229248, "num_tokens": 11, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -33.00547790527344, "logits_per_token": -1.4041868556629529, "logits_per_char": -0.3432456758287218, "num_chars": 45}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 999, "native_id": "LEAP_2000_8_2", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 30.367454528808594, "incorrect_loss_raw": 29.265262603759766, "correct_loss_per_char": 0.6326553026835123, "incorrect_loss_per_char": 0.6044602703374657, "correct_loss_per_token": 2.0244969685872394, "incorrect_loss_per_token": 2.269273955634887, "correct_loss_uncond": -24.3212890625, "incorrect_loss_uncond": -23.369222005208332}, "model_output": [{"sum_logits": -30.367454528808594, "num_tokens": 15, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -54.688743591308594, "logits_per_token": -2.0244969685872394, "logits_per_char": -0.6326553026835123, "num_chars": 48}, {"sum_logits": -27.963790893554688, "num_tokens": 11, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -49.80530548095703, "logits_per_token": -2.5421628085049717, "logits_per_char": -0.6990947723388672, "num_chars": 40}, {"sum_logits": -26.993778228759766, "num_tokens": 13, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -54.52492904663086, "logits_per_token": -2.0764444791353664, "logits_per_char": -0.4575216648942333, "num_chars": 59}, {"sum_logits": -32.838218688964844, "num_tokens": 15, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -53.573219299316406, "logits_per_token": -2.189214579264323, "logits_per_char": -0.6567643737792969, "num_chars": 50}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1000, "native_id": "Mercury_7172270", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 23.12493324279785, "incorrect_loss_raw": 35.726765950520836, "correct_loss_per_char": 0.4719374131183235, "incorrect_loss_per_char": 0.6392310294113231, "correct_loss_per_token": 2.569437026977539, "incorrect_loss_per_token": 3.830399181224682, "correct_loss_uncond": -14.287580490112305, "incorrect_loss_uncond": -20.889484405517578}, "model_output": [{"sum_logits": -23.12493324279785, "num_tokens": 9, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -37.412513732910156, "logits_per_token": -2.569437026977539, "logits_per_char": -0.4719374131183235, "num_chars": 49}, {"sum_logits": -37.59519958496094, "num_tokens": 10, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -57.47307586669922, "logits_per_token": -3.7595199584960937, "logits_per_char": -0.7093433883954894, "num_chars": 53}, {"sum_logits": -28.453506469726562, "num_tokens": 9, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -54.74437713623047, "logits_per_token": -3.161500718858507, "logits_per_char": -0.49918432403029056, "num_chars": 57}, {"sum_logits": -41.131591796875, "num_tokens": 9, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -57.63129806518555, "logits_per_token": -4.570176866319445, "logits_per_char": -0.7091653758081896, "num_chars": 58}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1001, "native_id": "Mercury_184205", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.257020950317383, "incorrect_loss_raw": 16.83787790934245, "correct_loss_per_char": 0.4606494903564453, "incorrect_loss_per_char": 0.3304391859067824, "correct_loss_per_token": 2.6257020950317385, "incorrect_loss_per_token": 1.4399558227626006, "correct_loss_uncond": -12.262243270874023, "incorrect_loss_uncond": -18.24871826171875}, "model_output": [{"sum_logits": -9.02029800415039, "num_tokens": 9, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -25.63890838623047, "logits_per_token": -1.002255333794488, "logits_per_char": -0.23737626326711556, "num_chars": 38}, {"sum_logits": -19.627464294433594, "num_tokens": 12, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -38.839683532714844, "logits_per_token": -1.6356220245361328, "logits_per_char": -0.3703295149893131, "num_chars": 53}, {"sum_logits": -26.257020950317383, "num_tokens": 10, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -38.519264221191406, "logits_per_token": -2.6257020950317385, "logits_per_char": -0.4606494903564453, "num_chars": 57}, {"sum_logits": -21.86587142944336, "num_tokens": 13, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -40.78119659423828, "logits_per_token": -1.6819901099571815, "logits_per_char": -0.3836117794639186, "num_chars": 57}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1002, "native_id": "Mercury_SC_400683", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.696891784667969, "incorrect_loss_raw": 13.865544001261393, "correct_loss_per_char": 0.40248440473507613, "incorrect_loss_per_char": 0.4501457540894986, "correct_loss_per_token": 2.242413112095424, "incorrect_loss_per_token": 1.9511432420639767, "correct_loss_uncond": -15.596502304077148, "incorrect_loss_uncond": -12.657109578450521}, "model_output": [{"sum_logits": -15.696891784667969, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -31.293394088745117, "logits_per_token": -2.242413112095424, "logits_per_char": -0.40248440473507613, "num_chars": 39}, {"sum_logits": -20.35232162475586, "num_tokens": 8, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -29.65361785888672, "logits_per_token": -2.5440402030944824, "logits_per_char": -0.4963980884086795, "num_chars": 41}, {"sum_logits": -9.715812683105469, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -23.711355209350586, "logits_per_token": -1.3879732404436385, "logits_per_char": -0.37368510319636417, "num_chars": 26}, {"sum_logits": -11.528497695922852, "num_tokens": 6, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -26.202987670898438, "logits_per_token": -1.9214162826538086, "logits_per_char": -0.48035407066345215, "num_chars": 24}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1003, "native_id": "Mercury_7182210", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 22.688064575195312, "incorrect_loss_raw": 29.609521230061848, "correct_loss_per_char": 0.37813440958658856, "incorrect_loss_per_char": 0.47381192037215863, "correct_loss_per_token": 2.2688064575195312, "incorrect_loss_per_token": 2.6261150861027267, "correct_loss_uncond": -14.811725616455078, "incorrect_loss_uncond": -18.0477778116862}, "model_output": [{"sum_logits": -26.001190185546875, "num_tokens": 12, "num_tokens_all": 249, "is_greedy": false, "sum_logits_uncond": -43.390846252441406, "logits_per_token": -2.1667658487955728, "logits_per_char": -0.4333531697591146, "num_chars": 60}, {"sum_logits": -22.688064575195312, "num_tokens": 10, "num_tokens_all": 247, "is_greedy": false, "sum_logits_uncond": -37.49979019165039, "logits_per_token": -2.2688064575195312, "logits_per_char": -0.37813440958658856, "num_chars": 60}, {"sum_logits": -33.93547821044922, "num_tokens": 11, "num_tokens_all": 248, "is_greedy": false, "sum_logits_uncond": -51.353118896484375, "logits_per_token": -3.0850434736772017, "logits_per_char": -0.5220842801607573, "num_chars": 65}, {"sum_logits": -28.891895294189453, "num_tokens": 11, "num_tokens_all": 248, "is_greedy": false, "sum_logits_uncond": -48.22793197631836, "logits_per_token": -2.626535935835405, "logits_per_char": -0.46599831119660406, "num_chars": 62}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1004, "native_id": "Mercury_7238945", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.9238221645355225, "incorrect_loss_raw": 2.876751979192098, "correct_loss_per_char": 1.3079407215118408, "incorrect_loss_per_char": 1.1388535499572754, "correct_loss_per_token": 1.9619110822677612, "incorrect_loss_per_token": 2.4343510468800864, "correct_loss_uncond": -10.146229028701782, "incorrect_loss_uncond": -7.409682830174764}, "model_output": [{"sum_logits": -3.238852024078369, "num_tokens": 1, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -5.797092437744141, "logits_per_token": -3.238852024078369, "logits_per_char": -1.6194260120391846, "num_chars": 2}, {"sum_logits": -2.6544055938720703, "num_tokens": 2, "num_tokens_all": 239, "is_greedy": true, "sum_logits_uncond": -14.114334106445312, "logits_per_token": -1.3272027969360352, "logits_per_char": -0.8848018646240234, "num_chars": 3}, {"sum_logits": -3.9238221645355225, "num_tokens": 2, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -14.070051193237305, "logits_per_token": -1.9619110822677612, "logits_per_char": -1.3079407215118408, "num_chars": 3}, {"sum_logits": -2.7369983196258545, "num_tokens": 1, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -10.947877883911133, "logits_per_token": -2.7369983196258545, "logits_per_char": -0.9123327732086182, "num_chars": 3}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1005, "native_id": "Mercury_SC_408748", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.226207733154297, "incorrect_loss_raw": 17.685277938842773, "correct_loss_per_char": 0.41423199393532495, "incorrect_loss_per_char": 0.47077552700591196, "correct_loss_per_token": 2.0251341925726996, "incorrect_loss_per_token": 2.526468276977539, "correct_loss_uncond": -9.35096549987793, "incorrect_loss_uncond": -9.715574264526367}, "model_output": [{"sum_logits": -18.88847541809082, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -27.346893310546875, "logits_per_token": -2.6983536311558316, "logits_per_char": -0.55554339464973, "num_chars": 34}, {"sum_logits": -18.734668731689453, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -27.390247344970703, "logits_per_token": -2.6763812473842075, "logits_per_char": -0.4803761213253706, "num_chars": 39}, {"sum_logits": -15.432689666748047, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -27.465415954589844, "logits_per_token": -2.204669952392578, "logits_per_char": -0.37640706504263527, "num_chars": 41}, {"sum_logits": -18.226207733154297, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -27.577173233032227, "logits_per_token": -2.0251341925726996, "logits_per_char": -0.41423199393532495, "num_chars": 44}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1006, "native_id": "MEA_2016_5_4", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.95342445373535, "incorrect_loss_raw": 16.028010368347168, "correct_loss_per_char": 0.5129549843924386, "incorrect_loss_per_char": 0.4811771434766275, "correct_loss_per_token": 1.6321294957941228, "incorrect_loss_per_token": 1.7178558632179541, "correct_loss_uncond": -13.484136581420898, "incorrect_loss_uncond": -15.22783120473226}, "model_output": [{"sum_logits": -11.735526084899902, "num_tokens": 9, "num_tokens_all": 250, "is_greedy": false, "sum_logits_uncond": -27.368785858154297, "logits_per_token": -1.303947342766656, "logits_per_char": -0.36673519015312195, "num_chars": 32}, {"sum_logits": -19.32927703857422, "num_tokens": 9, "num_tokens_all": 250, "is_greedy": false, "sum_logits_uncond": -32.08859634399414, "logits_per_token": -2.1476974487304688, "logits_per_char": -0.6040399074554443, "num_chars": 32}, {"sum_logits": -17.019227981567383, "num_tokens": 10, "num_tokens_all": 251, "is_greedy": false, "sum_logits_uncond": -34.310142517089844, "logits_per_token": -1.7019227981567382, "logits_per_char": -0.4727563328213162, "num_chars": 36}, {"sum_logits": -17.95342445373535, "num_tokens": 11, "num_tokens_all": 252, "is_greedy": false, "sum_logits_uncond": -31.43756103515625, "logits_per_token": -1.6321294957941228, "logits_per_char": -0.5129549843924386, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1007, "native_id": "Mercury_7271513", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 18.66850471496582, "incorrect_loss_raw": 24.769105911254883, "correct_loss_per_char": 0.3809898921421596, "incorrect_loss_per_char": 0.5856416432463488, "correct_loss_per_token": 2.074278301662869, "incorrect_loss_per_token": 3.0142839371211942, "correct_loss_uncond": -16.936288833618164, "incorrect_loss_uncond": -9.820306142171225}, "model_output": [{"sum_logits": -12.594572067260742, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -21.879865646362305, "logits_per_token": -1.799224581037249, "logits_per_char": -0.37042859021355123, "num_chars": 34}, {"sum_logits": -33.87355041503906, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -40.22977066040039, "logits_per_token": -3.763727823893229, "logits_per_char": -0.8065131051199776, "num_chars": 42}, {"sum_logits": -27.839195251464844, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -41.658599853515625, "logits_per_token": -3.4798994064331055, "logits_per_char": -0.5799832344055176, "num_chars": 48}, {"sum_logits": -18.66850471496582, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -35.604793548583984, "logits_per_token": -2.074278301662869, "logits_per_char": -0.3809898921421596, "num_chars": 49}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1008, "native_id": "Mercury_7189000", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.944559097290039, "incorrect_loss_raw": 12.245890617370605, "correct_loss_per_char": 0.5153296240444841, "incorrect_loss_per_char": 0.551420760402669, "correct_loss_per_token": 3.7361397743225098, "incorrect_loss_per_token": 2.5924168480767147, "correct_loss_uncond": -16.76292610168457, "incorrect_loss_uncond": -13.282695452372232}, "model_output": [{"sum_logits": -14.944559097290039, "num_tokens": 4, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -31.70748519897461, "logits_per_token": -3.7361397743225098, "logits_per_char": -0.5153296240444841, "num_chars": 29}, {"sum_logits": -7.243344306945801, "num_tokens": 6, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -25.832347869873047, "logits_per_token": -1.2072240511576335, "logits_per_char": -0.24977049334295864, "num_chars": 29}, {"sum_logits": -13.423219680786133, "num_tokens": 4, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -22.40426254272461, "logits_per_token": -3.355804920196533, "logits_per_char": -0.639200937180292, "num_chars": 21}, {"sum_logits": -16.071107864379883, "num_tokens": 5, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -28.34914779663086, "logits_per_token": -3.2142215728759767, "logits_per_char": -0.7652908506847563, "num_chars": 21}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1009, "native_id": "Mercury_SC_401585", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.7956427335739136, "incorrect_loss_raw": 3.3326830863952637, "correct_loss_per_char": 0.2565203905105591, "incorrect_loss_per_char": 0.5550841603960309, "correct_loss_per_token": 1.7956427335739136, "incorrect_loss_per_token": 3.3326830863952637, "correct_loss_uncond": -11.230610013008118, "incorrect_loss_uncond": -9.864983081817627}, "model_output": [{"sum_logits": -1.7956427335739136, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": true, "sum_logits_uncond": -13.026252746582031, "logits_per_token": -1.7956427335739136, "logits_per_char": -0.2565203905105591, "num_chars": 7}, {"sum_logits": -4.146795272827148, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -12.807668685913086, "logits_per_token": -4.146795272827148, "logits_per_char": -0.8293590545654297, "num_chars": 5}, {"sum_logits": -3.067934036254883, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -13.331348419189453, "logits_per_token": -3.067934036254883, "logits_per_char": -0.4382762908935547, "num_chars": 7}, {"sum_logits": -2.7833199501037598, "num_tokens": 1, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -13.453981399536133, "logits_per_token": -2.7833199501037598, "logits_per_char": -0.39761713572910856, "num_chars": 7}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1010, "native_id": "Mercury_188528", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 6.550230979919434, "incorrect_loss_raw": 5.272996425628662, "correct_loss_per_char": 0.31191576094854445, "incorrect_loss_per_char": 0.481109593853806, "correct_loss_per_token": 2.183410326639811, "incorrect_loss_per_token": 3.4706429640452066, "correct_loss_uncond": -10.254368782043457, "incorrect_loss_uncond": -10.028520425160727}, "model_output": [{"sum_logits": -5.229840278625488, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -13.339221954345703, "logits_per_token": -2.614920139312744, "logits_per_char": -0.5229840278625488, "num_chars": 10}, {"sum_logits": -5.004868507385254, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -14.035669326782227, "logits_per_token": -5.004868507385254, "logits_per_char": -0.4549880461259322, "num_chars": 11}, {"sum_logits": -5.584280490875244, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -18.529659271240234, "logits_per_token": -2.792140245437622, "logits_per_char": -0.465356707572937, "num_chars": 12}, {"sum_logits": -6.550230979919434, "num_tokens": 3, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.80459976196289, "logits_per_token": -2.183410326639811, "logits_per_char": -0.31191576094854445, "num_chars": 21}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1011, "native_id": "Mercury_SC_415719", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.56879711151123, "incorrect_loss_raw": 17.554656346638996, "correct_loss_per_char": 0.34687612170264837, "incorrect_loss_per_char": 0.41796800825330943, "correct_loss_per_token": 1.4568797111511231, "incorrect_loss_per_token": 1.7554656346639, "correct_loss_uncond": -28.697957038879395, "incorrect_loss_uncond": -31.17274284362793}, "model_output": [{"sum_logits": -14.56879711151123, "num_tokens": 10, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -43.266754150390625, "logits_per_token": -1.4568797111511231, "logits_per_char": -0.34687612170264837, "num_chars": 42}, {"sum_logits": -19.076662063598633, "num_tokens": 10, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -48.87709045410156, "logits_per_token": -1.9076662063598633, "logits_per_char": -0.45420623960949125, "num_chars": 42}, {"sum_logits": -14.096529006958008, "num_tokens": 10, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -47.74378967285156, "logits_per_token": -1.4096529006958007, "logits_per_char": -0.33563164302280973, "num_chars": 42}, {"sum_logits": -19.49077796936035, "num_tokens": 10, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -49.561317443847656, "logits_per_token": -1.9490777969360351, "logits_per_char": -0.4640661421276274, "num_chars": 42}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1012, "native_id": "Mercury_SC_407072", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 8.418128967285156, "incorrect_loss_raw": 14.03373654683431, "correct_loss_per_char": 0.2806042989095052, "incorrect_loss_per_char": 0.44858130812062313, "correct_loss_per_token": 1.4030214945475261, "incorrect_loss_per_token": 2.3389560911390515, "correct_loss_uncond": -24.322952270507812, "incorrect_loss_uncond": -21.978207270304363}, "model_output": [{"sum_logits": -8.418128967285156, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -32.74108123779297, "logits_per_token": -1.4030214945475261, "logits_per_char": -0.2806042989095052, "num_chars": 30}, {"sum_logits": -12.288097381591797, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -35.50357437133789, "logits_per_token": -2.0480162302652993, "logits_per_char": -0.39639023811586444, "num_chars": 31}, {"sum_logits": -15.155593872070312, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -31.84674835205078, "logits_per_token": -2.5259323120117188, "logits_per_char": -0.5051864624023438, "num_chars": 30}, {"sum_logits": -14.65751838684082, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -40.685508728027344, "logits_per_token": -2.4429197311401367, "logits_per_char": -0.4441672238436612, "num_chars": 33}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1013, "native_id": "Mercury_7091823", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.319986343383789, "incorrect_loss_raw": 9.396927038828531, "correct_loss_per_char": 0.6654533039439808, "incorrect_loss_per_char": 0.5734150499148458, "correct_loss_per_token": 3.6599931716918945, "incorrect_loss_per_token": 3.531941387388441, "correct_loss_uncond": -8.703908920288086, "incorrect_loss_uncond": -8.240519046783447}, "model_output": [{"sum_logits": -7.319986343383789, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.023895263671875, "logits_per_token": -3.6599931716918945, "logits_per_char": -0.6654533039439808, "num_chars": 11}, {"sum_logits": -11.880230903625488, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -17.92151641845703, "logits_per_token": -3.9600769678751626, "logits_per_char": -0.7920153935750326, "num_chars": 15}, {"sum_logits": -7.193382740020752, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -17.871158599853516, "logits_per_token": -3.596691370010376, "logits_per_char": -0.5138130528586251, "num_chars": 14}, {"sum_logits": -9.117167472839355, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -17.11966323852539, "logits_per_token": -3.039055824279785, "logits_per_char": -0.4144167033108798, "num_chars": 22}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1014, "native_id": "Mercury_7040985", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.492531776428223, "incorrect_loss_raw": 15.382926305135092, "correct_loss_per_char": 0.48414161801338196, "incorrect_loss_per_char": 0.5009686043059903, "correct_loss_per_token": 2.2132188252040317, "incorrect_loss_per_token": 2.06121536537453, "correct_loss_uncond": -22.430541038513184, "incorrect_loss_uncond": -17.60624059041341}, "model_output": [{"sum_logits": -11.880044937133789, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -28.0540714263916, "logits_per_token": -2.376008987426758, "logits_per_char": -0.4950018723805745, "num_chars": 24}, {"sum_logits": -15.492531776428223, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -37.923072814941406, "logits_per_token": -2.2132188252040317, "logits_per_char": -0.48414161801338196, "num_chars": 32}, {"sum_logits": -17.373538970947266, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -34.87117004394531, "logits_per_token": -1.9303932189941406, "logits_per_char": -0.5109864403219784, "num_chars": 34}, {"sum_logits": -16.89519500732422, "num_tokens": 9, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -36.042259216308594, "logits_per_token": -1.877243889702691, "logits_per_char": -0.4969175002154182, "num_chars": 34}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1015, "native_id": "Mercury_SC_409383", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.21369743347168, "incorrect_loss_raw": 18.345558484395344, "correct_loss_per_char": 0.33808216518825956, "incorrect_loss_per_char": 0.576786254257733, "correct_loss_per_token": 1.90171217918396, "incorrect_loss_per_token": 2.8825590875413685, "correct_loss_uncond": -12.116411209106445, "incorrect_loss_uncond": -9.893582979838053}, "model_output": [{"sum_logits": -17.5635929107666, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -27.984539031982422, "logits_per_token": -2.927265485127767, "logits_per_char": -0.6505034411395038, "num_chars": 27}, {"sum_logits": -15.41879940032959, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -25.245512008666992, "logits_per_token": -2.5697999000549316, "logits_per_char": -0.467236345464533, "num_chars": 33}, {"sum_logits": -22.054283142089844, "num_tokens": 7, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -31.48737335205078, "logits_per_token": -3.1506118774414062, "logits_per_char": -0.6126189761691623, "num_chars": 36}, {"sum_logits": -15.21369743347168, "num_tokens": 8, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -27.330108642578125, "logits_per_token": -1.90171217918396, "logits_per_char": -0.33808216518825956, "num_chars": 45}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1016, "native_id": "Mercury_SC_407080", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.75921630859375, "incorrect_loss_raw": 23.240692138671875, "correct_loss_per_char": 1.031634012858073, "incorrect_loss_per_char": 1.0595145887164978, "correct_loss_per_token": 4.95184326171875, "incorrect_loss_per_token": 6.464416291978623, "correct_loss_uncond": -0.7370452880859375, "incorrect_loss_uncond": -4.122601826985677}, "model_output": [{"sum_logits": -23.33501434326172, "num_tokens": 4, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -31.537689208984375, "logits_per_token": -5.83375358581543, "logits_per_char": -1.06068247014826, "num_chars": 22}, {"sum_logits": -22.834304809570312, "num_tokens": 4, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -26.157012939453125, "logits_per_token": -5.708576202392578, "logits_per_char": -0.8782424926757812, "num_chars": 26}, {"sum_logits": -23.552757263183594, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -24.395179748535156, "logits_per_token": -7.850919087727864, "logits_per_char": -1.2396188033254523, "num_chars": 19}, {"sum_logits": -24.75921630859375, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -25.496261596679688, "logits_per_token": -4.95184326171875, "logits_per_char": -1.031634012858073, "num_chars": 24}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1017, "native_id": "MCAS_2000_4_34", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.32305145263672, "incorrect_loss_raw": 19.972448984781902, "correct_loss_per_char": 0.4200663359268852, "incorrect_loss_per_char": 0.37285048057562403, "correct_loss_per_token": 1.932305145263672, "incorrect_loss_per_token": 1.8394763583228704, "correct_loss_uncond": -18.170875549316406, "incorrect_loss_uncond": -21.062650044759113}, "model_output": [{"sum_logits": -19.32305145263672, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -37.493927001953125, "logits_per_token": -1.932305145263672, "logits_per_char": -0.4200663359268852, "num_chars": 46}, {"sum_logits": -12.788414001464844, "num_tokens": 9, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -33.6588134765625, "logits_per_token": -1.4209348890516493, "logits_per_char": -0.31191253662109375, "num_chars": 41}, {"sum_logits": -28.704158782958984, "num_tokens": 14, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -45.5532341003418, "logits_per_token": -2.050297055925642, "logits_per_char": -0.38789403760755387, "num_chars": 74}, {"sum_logits": -18.424774169921875, "num_tokens": 9, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -43.89324951171875, "logits_per_token": -2.0471971299913196, "logits_per_char": -0.4187448674982244, "num_chars": 44}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1018, "native_id": "Mercury_7032498", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.670812606811523, "incorrect_loss_raw": 24.772651354471844, "correct_loss_per_char": 0.8414672669910249, "incorrect_loss_per_char": 0.7571003370222481, "correct_loss_per_token": 4.417703151702881, "incorrect_loss_per_token": 3.600704170408703, "correct_loss_uncond": -7.732889175415039, "incorrect_loss_uncond": -7.189923286437988}, "model_output": [{"sum_logits": -38.6092643737793, "num_tokens": 8, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -43.799041748046875, "logits_per_token": -4.826158046722412, "logits_per_char": -0.8978898691576581, "num_chars": 43}, {"sum_logits": -20.40121078491211, "num_tokens": 7, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -28.80105972290039, "logits_per_token": -2.914458683558873, "logits_per_char": -0.7846619532658503, "num_chars": 26}, {"sum_logits": -15.307478904724121, "num_tokens": 5, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -23.287622451782227, "logits_per_token": -3.061495780944824, "logits_per_char": -0.5887491886432354, "num_chars": 26}, {"sum_logits": -17.670812606811523, "num_tokens": 4, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -25.403701782226562, "logits_per_token": -4.417703151702881, "logits_per_char": -0.8414672669910249, "num_chars": 21}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1019, "native_id": "TAKS_2009_5_30", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 10.421113967895508, "incorrect_loss_raw": 8.077119509379068, "correct_loss_per_char": 0.8016241513765775, "incorrect_loss_per_char": 1.0282802279033358, "correct_loss_per_token": 2.605278491973877, "incorrect_loss_per_token": 4.038559754689534, "correct_loss_uncond": -6.4766845703125, "incorrect_loss_uncond": -6.45441468556722}, "model_output": [{"sum_logits": -10.421113967895508, "num_tokens": 4, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -16.897798538208008, "logits_per_token": -2.605278491973877, "logits_per_char": -0.8016241513765775, "num_chars": 13}, {"sum_logits": -6.416389465332031, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -13.314080238342285, "logits_per_token": -3.2081947326660156, "logits_per_char": -0.8020486831665039, "num_chars": 8}, {"sum_logits": -9.555556297302246, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -14.789718627929688, "logits_per_token": -4.777778148651123, "logits_per_char": -1.365079471043178, "num_chars": 7}, {"sum_logits": -8.25941276550293, "num_tokens": 2, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -15.490803718566895, "logits_per_token": -4.129706382751465, "logits_per_char": -0.9177125295003256, "num_chars": 9}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1020, "native_id": "Mercury_SC_415761", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 16.744842529296875, "incorrect_loss_raw": 11.737871487935385, "correct_loss_per_char": 0.3159404250810731, "incorrect_loss_per_char": 0.44846215120264415, "correct_loss_per_token": 1.5222584117542615, "incorrect_loss_per_token": 1.989530070622762, "correct_loss_uncond": -18.571327209472656, "incorrect_loss_uncond": -13.248293240865072}, "model_output": [{"sum_logits": -6.297228813171387, "num_tokens": 4, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -20.23843765258789, "logits_per_token": -1.5743072032928467, "logits_per_char": -0.3704252243041992, "num_chars": 17}, {"sum_logits": -10.017629623413086, "num_tokens": 4, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -20.71915626525879, "logits_per_token": -2.5044074058532715, "logits_per_char": -0.5892723307890051, "num_chars": 17}, {"sum_logits": -16.744842529296875, "num_tokens": 11, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -35.31616973876953, "logits_per_token": -1.5222584117542615, "logits_per_char": -0.3159404250810731, "num_chars": 53}, {"sum_logits": -18.89875602722168, "num_tokens": 10, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -34.00090026855469, "logits_per_token": -1.889875602722168, "logits_per_char": -0.38568889851472815, "num_chars": 49}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1021, "native_id": "ACTAAP_2008_5_10", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 18.84393882751465, "incorrect_loss_raw": 15.782620747884115, "correct_loss_per_char": 0.49589312703985916, "incorrect_loss_per_char": 0.46394256437667697, "correct_loss_per_token": 2.355492353439331, "incorrect_loss_per_token": 2.2858937846289744, "correct_loss_uncond": -8.473793029785156, "incorrect_loss_uncond": -12.915865580240885}, "model_output": [{"sum_logits": -8.758901596069336, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -27.486330032348633, "logits_per_token": -1.4598169326782227, "logits_per_char": -0.2654212604869496, "num_chars": 33}, {"sum_logits": -13.781864166259766, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -26.848098754882812, "logits_per_token": -2.2969773610432944, "logits_per_char": -0.4176322474624171, "num_chars": 33}, {"sum_logits": -24.807096481323242, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -31.761030197143555, "logits_per_token": -3.1008870601654053, "logits_per_char": -0.708774185180664, "num_chars": 35}, {"sum_logits": -18.84393882751465, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -27.317731857299805, "logits_per_token": -2.355492353439331, "logits_per_char": -0.49589312703985916, "num_chars": 38}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1022, "native_id": "Mercury_416671", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.457542419433594, "incorrect_loss_raw": 8.979835828145346, "correct_loss_per_char": 0.24250108767778444, "incorrect_loss_per_char": 0.3570408425760643, "correct_loss_per_token": 1.1821928024291992, "incorrect_loss_per_token": 2.108182446161906, "correct_loss_uncond": -16.353351593017578, "incorrect_loss_uncond": -13.716379483540853}, "model_output": [{"sum_logits": -9.842910766601562, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -22.54364013671875, "logits_per_token": -2.4607276916503906, "logits_per_char": -0.46871003650483634, "num_chars": 21}, {"sum_logits": -8.20659065246582, "num_tokens": 5, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -23.968791961669922, "logits_per_token": -1.641318130493164, "logits_per_char": -0.31563810201791614, "num_chars": 26}, {"sum_logits": -8.890006065368652, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -21.576213836669922, "logits_per_token": -2.222501516342163, "logits_per_char": -0.2867743892054404, "num_chars": 31}, {"sum_logits": -9.457542419433594, "num_tokens": 8, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -25.810894012451172, "logits_per_token": -1.1821928024291992, "logits_per_char": -0.24250108767778444, "num_chars": 39}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1023, "native_id": "Mercury_400803", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.759294509887695, "incorrect_loss_raw": 4.600624720255534, "correct_loss_per_char": 0.47592945098876954, "incorrect_loss_per_char": 0.46006247202555334, "correct_loss_per_token": 4.759294509887695, "incorrect_loss_per_token": 4.600624720255534, "correct_loss_uncond": -9.055318832397461, "incorrect_loss_uncond": -9.466527938842773}, "model_output": [{"sum_logits": -4.6167097091674805, "num_tokens": 1, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -13.844646453857422, "logits_per_token": -4.6167097091674805, "logits_per_char": -0.461670970916748, "num_chars": 10}, {"sum_logits": -4.682583332061768, "num_tokens": 1, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -14.10135269165039, "logits_per_token": -4.682583332061768, "logits_per_char": -0.46825833320617677, "num_chars": 10}, {"sum_logits": -4.5025811195373535, "num_tokens": 1, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -14.25545883178711, "logits_per_token": -4.5025811195373535, "logits_per_char": -0.45025811195373533, "num_chars": 10}, {"sum_logits": -4.759294509887695, "num_tokens": 1, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -13.814613342285156, "logits_per_token": -4.759294509887695, "logits_per_char": -0.47592945098876954, "num_chars": 10}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1024, "native_id": "Mercury_7005880", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.80628204345703, "incorrect_loss_raw": 23.125393549601238, "correct_loss_per_char": 0.4620603104712258, "incorrect_loss_per_char": 0.45300084523150136, "correct_loss_per_token": 2.1870854695638022, "incorrect_loss_per_token": 2.183896359530362, "correct_loss_uncond": -10.784675598144531, "incorrect_loss_uncond": -9.540233612060547}, "model_output": [{"sum_logits": -26.923992156982422, "num_tokens": 10, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -34.94072723388672, "logits_per_token": -2.6923992156982424, "logits_per_char": -0.5728508969570728, "num_chars": 47}, {"sum_logits": -22.040340423583984, "num_tokens": 11, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -29.756200790405273, "logits_per_token": -2.0036673112349077, "logits_per_char": -0.4081544522885923, "num_chars": 54}, {"sum_logits": -32.80628204345703, "num_tokens": 15, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -43.59095764160156, "logits_per_token": -2.1870854695638022, "logits_per_char": -0.4620603104712258, "num_chars": 71}, {"sum_logits": -20.411848068237305, "num_tokens": 11, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -33.29995346069336, "logits_per_token": -1.8556225516579368, "logits_per_char": -0.37799718644883895, "num_chars": 54}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1025, "native_id": "Mercury_7210508", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 44.312225341796875, "incorrect_loss_raw": 25.85071563720703, "correct_loss_per_char": 0.671397353663589, "incorrect_loss_per_char": 0.6159420958402556, "correct_loss_per_token": 3.6926854451497397, "incorrect_loss_per_token": 2.76692553385516, "correct_loss_uncond": -16.014801025390625, "incorrect_loss_uncond": -14.575439453125}, "model_output": [{"sum_logits": -22.878662109375, "num_tokens": 7, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -36.03338623046875, "logits_per_token": -3.2683803013392856, "logits_per_char": -0.7889193830818966, "num_chars": 29}, {"sum_logits": -24.1822509765625, "num_tokens": 9, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -37.723426818847656, "logits_per_token": -2.686916775173611, "logits_per_char": -0.5898109994283537, "num_chars": 41}, {"sum_logits": -44.312225341796875, "num_tokens": 12, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -60.3270263671875, "logits_per_token": -3.6926854451497397, "logits_per_char": -0.671397353663589, "num_chars": 66}, {"sum_logits": -30.491233825683594, "num_tokens": 13, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -47.52165222167969, "logits_per_token": -2.345479525052584, "logits_per_char": -0.46909590501051684, "num_chars": 65}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1026, "native_id": "NYSEDREGENTS_2013_4_1", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 20.874042510986328, "incorrect_loss_raw": 23.327184041341145, "correct_loss_per_char": 0.7731126855920862, "incorrect_loss_per_char": 0.8240673608460334, "correct_loss_per_token": 2.982006072998047, "incorrect_loss_per_token": 3.744491286504836, "correct_loss_uncond": -8.685340881347656, "incorrect_loss_uncond": -5.745154062906901}, "model_output": [{"sum_logits": -21.631912231445312, "num_tokens": 5, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -25.11768341064453, "logits_per_token": -4.326382446289062, "logits_per_char": -0.721063741048177, "num_chars": 30}, {"sum_logits": -18.42030906677246, "num_tokens": 7, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -25.14816665649414, "logits_per_token": -2.631472723824637, "logits_per_char": -0.6822336691397207, "num_chars": 27}, {"sum_logits": -20.874042510986328, "num_tokens": 7, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -29.559383392333984, "logits_per_token": -2.982006072998047, "logits_per_char": -0.7731126855920862, "num_chars": 27}, {"sum_logits": -29.929330825805664, "num_tokens": 7, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -36.95116424560547, "logits_per_token": -4.275618689400809, "logits_per_char": -1.0689046723502023, "num_chars": 28}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1027, "native_id": "NYSEDREGENTS_2008_4_12", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.259081840515137, "incorrect_loss_raw": 11.788705825805664, "correct_loss_per_char": 0.6199600800223972, "incorrect_loss_per_char": 0.5732912403798243, "correct_loss_per_token": 2.8518163681030275, "incorrect_loss_per_token": 2.6144999927944608, "correct_loss_uncond": -19.824620246887207, "incorrect_loss_uncond": -17.068578720092773}, "model_output": [{"sum_logits": -4.819208145141602, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -26.57568359375, "logits_per_token": -1.2048020362854004, "logits_per_char": -0.2536425339548211, "num_chars": 19}, {"sum_logits": -9.2852783203125, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -24.426868438720703, "logits_per_token": -3.0950927734375, "logits_per_char": -0.5803298950195312, "num_chars": 16}, {"sum_logits": -14.259081840515137, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -34.083702087402344, "logits_per_token": -2.8518163681030275, "logits_per_char": -0.6199600800223972, "num_chars": 23}, {"sum_logits": -21.26163101196289, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -35.56930160522461, "logits_per_token": -3.543605168660482, "logits_per_char": -0.8859012921651205, "num_chars": 24}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1028, "native_id": "Mercury_400091", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 2.973374605178833, "incorrect_loss_raw": 3.1993605693181357, "correct_loss_per_char": 0.7433436512947083, "incorrect_loss_per_char": 0.9079490469561682, "correct_loss_per_token": 1.4866873025894165, "incorrect_loss_per_token": 1.5996802846590679, "correct_loss_uncond": -7.799912214279175, "incorrect_loss_uncond": -6.9474614063898725}, "model_output": [{"sum_logits": -3.891920566558838, "num_tokens": 2, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -10.252274513244629, "logits_per_token": -1.945960283279419, "logits_per_char": -1.2973068555196126, "num_chars": 3}, {"sum_logits": -2.973374605178833, "num_tokens": 2, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -10.773286819458008, "logits_per_token": -1.4866873025894165, "logits_per_char": -0.7433436512947083, "num_chars": 4}, {"sum_logits": -1.9840844869613647, "num_tokens": 2, "num_tokens_all": 213, "is_greedy": true, "sum_logits_uncond": -9.886903762817383, "logits_per_token": -0.9920422434806824, "logits_per_char": -0.4960211217403412, "num_chars": 4}, {"sum_logits": -3.722076654434204, "num_tokens": 2, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -10.301287651062012, "logits_per_token": -1.861038327217102, "logits_per_char": -0.930519163608551, "num_chars": 4}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1029, "native_id": "Mercury_SC_402257", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 2.7745301723480225, "incorrect_loss_raw": 3.262697458267212, "correct_loss_per_char": 0.5549060344696045, "incorrect_loss_per_char": 0.543782909711202, "correct_loss_per_token": 2.7745301723480225, "incorrect_loss_per_token": 3.262697458267212, "correct_loss_uncond": -9.29880166053772, "incorrect_loss_uncond": -8.344040950139364}, "model_output": [{"sum_logits": -2.3092195987701416, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": true, "sum_logits_uncond": -12.399616241455078, "logits_per_token": -2.3092195987701416, "logits_per_char": -0.38486993312835693, "num_chars": 6}, {"sum_logits": -4.110267639160156, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -12.531604766845703, "logits_per_token": -4.110267639160156, "logits_per_char": -0.6850446065266927, "num_chars": 6}, {"sum_logits": -3.368605136871338, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -3.368605136871338, "logits_per_char": -0.5614341894785563, "num_chars": 6}, {"sum_logits": -2.7745301723480225, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -12.073331832885742, "logits_per_token": -2.7745301723480225, "logits_per_char": -0.5549060344696045, "num_chars": 5}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1030, "native_id": "Mercury_7227815", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 29.203147888183594, "incorrect_loss_raw": 23.783091862996418, "correct_loss_per_char": 0.6213435720890126, "incorrect_loss_per_char": 0.5066244338160971, "correct_loss_per_token": 3.650393486022949, "incorrect_loss_per_token": 2.3783091862996417, "correct_loss_uncond": -8.281364440917969, "incorrect_loss_uncond": -12.489278793334961}, "model_output": [{"sum_logits": -30.36438751220703, "num_tokens": 10, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -42.339439392089844, "logits_per_token": -3.036438751220703, "logits_per_char": -0.6900997161865234, "num_chars": 44}, {"sum_logits": -20.649507522583008, "num_tokens": 10, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -34.91887283325195, "logits_per_token": -2.0649507522583006, "logits_per_char": -0.39710591389582706, "num_chars": 52}, {"sum_logits": -20.33538055419922, "num_tokens": 10, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -31.558799743652344, "logits_per_token": -2.033538055419922, "logits_per_char": -0.4326676713659408, "num_chars": 47}, {"sum_logits": -29.203147888183594, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -37.48451232910156, "logits_per_token": -3.650393486022949, "logits_per_char": -0.6213435720890126, "num_chars": 47}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1031, "native_id": "ACTAAP_2010_7_3", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 20.553544998168945, "incorrect_loss_raw": 27.05318768819173, "correct_loss_per_char": 0.6045160293579102, "incorrect_loss_per_char": 0.5762082563660105, "correct_loss_per_token": 2.569193124771118, "incorrect_loss_per_token": 2.841868543624878, "correct_loss_uncond": -8.580968856811523, "incorrect_loss_uncond": -6.658974329630534}, "model_output": [{"sum_logits": -20.553544998168945, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -29.13451385498047, "logits_per_token": -2.569193124771118, "logits_per_char": -0.6045160293579102, "num_chars": 34}, {"sum_logits": -16.38597297668457, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -25.312164306640625, "logits_per_token": -2.0482466220855713, "logits_per_char": -0.4201531532483223, "num_chars": 39}, {"sum_logits": -31.849544525146484, "num_tokens": 10, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -37.35113525390625, "logits_per_token": -3.1849544525146483, "logits_per_char": -0.6499907045948262, "num_chars": 49}, {"sum_logits": -32.92404556274414, "num_tokens": 10, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -38.47318649291992, "logits_per_token": -3.292404556274414, "logits_per_char": -0.6584809112548828, "num_chars": 50}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1032, "native_id": "Mercury_SC_410905", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 17.332490921020508, "incorrect_loss_raw": 29.148915608723957, "correct_loss_per_char": 0.49521402631487166, "incorrect_loss_per_char": 0.7229947529141865, "correct_loss_per_token": 2.476070131574358, "incorrect_loss_per_token": 4.422862824939546, "correct_loss_uncond": -10.372949600219727, "incorrect_loss_uncond": -5.138529459635417}, "model_output": [{"sum_logits": -28.578968048095703, "num_tokens": 7, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -32.69260787963867, "logits_per_token": -4.082709721156529, "logits_per_char": -0.6804516201927548, "num_chars": 42}, {"sum_logits": -32.60023498535156, "num_tokens": 6, "num_tokens_all": 227, "is_greedy": false, "sum_logits_uncond": -35.557891845703125, "logits_per_token": -5.433372497558594, "logits_per_char": -0.815005874633789, "num_chars": 40}, {"sum_logits": -26.26754379272461, "num_tokens": 7, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -34.61183547973633, "logits_per_token": -3.7525062561035156, "logits_per_char": -0.6735267639160156, "num_chars": 39}, {"sum_logits": -17.332490921020508, "num_tokens": 7, "num_tokens_all": 228, "is_greedy": false, "sum_logits_uncond": -27.705440521240234, "logits_per_token": -2.476070131574358, "logits_per_char": -0.49521402631487166, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1033, "native_id": "OHAT_2010_5_18", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.541126251220703, "incorrect_loss_raw": 12.738421122233072, "correct_loss_per_char": 0.4709432197339607, "incorrect_loss_per_char": 0.44487973681560017, "correct_loss_per_token": 2.220160893031529, "incorrect_loss_per_token": 2.0050652519104974, "correct_loss_uncond": -20.454883575439453, "incorrect_loss_uncond": -16.229618072509766}, "model_output": [{"sum_logits": -12.554010391235352, "num_tokens": 6, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -28.261783599853516, "logits_per_token": -2.092335065205892, "logits_per_char": -0.482846553509052, "num_chars": 26}, {"sum_logits": -10.792631149291992, "num_tokens": 6, "num_tokens_all": 235, "is_greedy": false, "sum_logits_uncond": -24.625782012939453, "logits_per_token": -1.798771858215332, "logits_per_char": -0.3721596948031721, "num_chars": 29}, {"sum_logits": -14.868621826171875, "num_tokens": 7, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -34.01655197143555, "logits_per_token": -2.124088832310268, "logits_per_char": -0.4796329621345766, "num_chars": 31}, {"sum_logits": -15.541126251220703, "num_tokens": 7, "num_tokens_all": 236, "is_greedy": false, "sum_logits_uncond": -35.996009826660156, "logits_per_token": -2.220160893031529, "logits_per_char": -0.4709432197339607, "num_chars": 33}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1034, "native_id": "NAEP_2000_8_S11+10", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 3.4332737922668457, "incorrect_loss_raw": 11.040923436482748, "correct_loss_per_char": 0.3814748658074273, "incorrect_loss_per_char": 0.8250478190234584, "correct_loss_per_token": 1.7166368961334229, "incorrect_loss_per_token": 4.084521696302626, "correct_loss_uncond": -10.95907735824585, "incorrect_loss_uncond": -9.531964619954428}, "model_output": [{"sum_logits": -3.4332737922668457, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.392351150512695, "logits_per_token": -1.7166368961334229, "logits_per_char": -0.3814748658074273, "num_chars": 9}, {"sum_logits": -12.7255859375, "num_tokens": 3, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -19.142040252685547, "logits_per_token": -4.241861979166667, "logits_per_char": -1.0604654947916667, "num_chars": 12}, {"sum_logits": -15.481851577758789, "num_tokens": 5, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -32.68762969970703, "logits_per_token": -3.096370315551758, "logits_per_char": -0.5954558299137995, "num_chars": 26}, {"sum_logits": -4.915332794189453, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -4.915332794189453, "logits_per_char": -0.8192221323649088, "num_chars": 6}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1035, "native_id": "MCAS_2003_8_29", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 23.66167640686035, "incorrect_loss_raw": 13.108805338541666, "correct_loss_per_char": 0.788722546895345, "incorrect_loss_per_char": 0.498092281076061, "correct_loss_per_token": 3.943612734476725, "incorrect_loss_per_token": 2.3876144409179685, "correct_loss_uncond": -14.63560676574707, "incorrect_loss_uncond": -19.793010075887043}, "model_output": [{"sum_logits": -21.073196411132812, "num_tokens": 6, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -38.18822479248047, "logits_per_token": -3.5121994018554688, "logits_per_char": -0.7526141575404576, "num_chars": 28}, {"sum_logits": -23.66167640686035, "num_tokens": 6, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -38.29728317260742, "logits_per_token": -3.943612734476725, "logits_per_char": -0.788722546895345, "num_chars": 30}, {"sum_logits": -12.588163375854492, "num_tokens": 5, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -35.118309020996094, "logits_per_token": -2.5176326751708986, "logits_per_char": -0.4841601298405574, "num_chars": 26}, {"sum_logits": -5.665056228637695, "num_tokens": 5, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -25.39891242980957, "logits_per_token": -1.133011245727539, "logits_per_char": -0.25750255584716797, "num_chars": 22}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1036, "native_id": "Mercury_401433", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.074731826782227, "incorrect_loss_raw": 10.34890619913737, "correct_loss_per_char": 1.674970202975803, "incorrect_loss_per_char": 1.902709994997297, "correct_loss_per_token": 3.0149463653564452, "incorrect_loss_per_token": 3.6617833773295083, "correct_loss_uncond": -14.05744743347168, "incorrect_loss_uncond": -9.161452611287435}, "model_output": [{"sum_logits": -18.152036666870117, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -24.776409149169922, "logits_per_token": -4.538009166717529, "logits_per_char": -2.5931480952671597, "num_chars": 7}, {"sum_logits": -10.72091007232666, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -16.86349105834961, "logits_per_token": -5.36045503616333, "logits_per_char": -2.680227518081665, "num_chars": 4}, {"sum_logits": -2.173771858215332, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": true, "sum_logits_uncond": -16.891176223754883, "logits_per_token": -1.086885929107666, "logits_per_char": -0.4347543716430664, "num_chars": 5}, {"sum_logits": -15.074731826782227, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -29.132179260253906, "logits_per_token": -3.0149463653564452, "logits_per_char": -1.674970202975803, "num_chars": 9}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1037, "native_id": "TIMSS_1995_8_N4", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.440483570098877, "incorrect_loss_raw": 6.346510887145996, "correct_loss_per_char": 0.6044981744554307, "incorrect_loss_per_char": 0.9337310223352342, "correct_loss_per_token": 5.440483570098877, "incorrect_loss_per_token": 6.346510887145996, "correct_loss_uncond": -8.093289852142334, "incorrect_loss_uncond": -5.50289249420166}, "model_output": [{"sum_logits": -8.385772705078125, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -12.445281982421875, "logits_per_token": -8.385772705078125, "logits_per_char": -1.197967529296875, "num_chars": 7}, {"sum_logits": -5.440483570098877, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -13.533773422241211, "logits_per_token": -5.440483570098877, "logits_per_char": -0.6044981744554307, "num_chars": 9}, {"sum_logits": -4.508726119995117, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -12.686746597290039, "logits_per_token": -4.508726119995117, "logits_per_char": -0.5635907649993896, "num_chars": 8}, {"sum_logits": -4.559449672698975, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -12.376590728759766, "logits_per_token": -4.559449672698975, "logits_per_char": -0.6513499532427106, "num_chars": 7}, {"sum_logits": -7.932095050811768, "num_tokens": 1, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -7.932095050811768, "logits_per_char": -1.3220158418019612, "num_chars": 6}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1038, "native_id": "Mercury_SC_405885", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.922317504882812, "incorrect_loss_raw": 15.684519449869791, "correct_loss_per_char": 0.35233282273815525, "incorrect_loss_per_char": 0.5671321049293093, "correct_loss_per_token": 1.820386250813802, "incorrect_loss_per_token": 3.253502941131592, "correct_loss_uncond": -19.384458541870117, "incorrect_loss_uncond": -13.709203720092773}, "model_output": [{"sum_logits": -15.329275131225586, "num_tokens": 6, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -29.21107292175293, "logits_per_token": -2.5548791885375977, "logits_per_char": -0.589587505047138, "num_chars": 26}, {"sum_logits": -17.2154598236084, "num_tokens": 4, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -28.888504028320312, "logits_per_token": -4.3038649559021, "logits_per_char": -0.5936365456416689, "num_chars": 29}, {"sum_logits": -10.922317504882812, "num_tokens": 6, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -30.30677604675293, "logits_per_token": -1.820386250813802, "logits_per_char": -0.35233282273815525, "num_chars": 31}, {"sum_logits": -14.50882339477539, "num_tokens": 5, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -30.081592559814453, "logits_per_token": -2.901764678955078, "logits_per_char": -0.5181722640991211, "num_chars": 28}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1039, "native_id": "Mercury_7263638", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.984004974365234, "incorrect_loss_raw": 21.96401532491048, "correct_loss_per_char": 0.5307501554489136, "incorrect_loss_per_char": 0.622976751992562, "correct_loss_per_token": 2.830667495727539, "incorrect_loss_per_token": 3.637227321806408, "correct_loss_uncond": -11.647003173828125, "incorrect_loss_uncond": -8.344677607218424}, "model_output": [{"sum_logits": -18.11125946044922, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -33.005043029785156, "logits_per_token": -3.018543243408203, "logits_per_char": -0.5842341761435231, "num_chars": 31}, {"sum_logits": -16.984004974365234, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -28.63100814819336, "logits_per_token": -2.830667495727539, "logits_per_char": -0.5307501554489136, "num_chars": 32}, {"sum_logits": -18.677961349487305, "num_tokens": 5, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -26.549304962158203, "logits_per_token": -3.735592269897461, "logits_per_char": -0.5188322597079806, "num_chars": 36}, {"sum_logits": -29.102825164794922, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -31.37173080444336, "logits_per_token": -4.15754645211356, "logits_per_char": -0.7658638201261821, "num_chars": 38}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1040, "native_id": "Mercury_401428", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 39.497615814208984, "incorrect_loss_raw": 38.34672164916992, "correct_loss_per_char": 0.7744630551805683, "incorrect_loss_per_char": 0.7759529155290047, "correct_loss_per_token": 3.5906923467462715, "incorrect_loss_per_token": 4.113633052611248, "correct_loss_uncond": -8.846843719482422, "incorrect_loss_uncond": -9.015309651692709}, "model_output": [{"sum_logits": -36.4167594909668, "num_tokens": 10, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -44.375389099121094, "logits_per_token": -3.6416759490966797, "logits_per_char": -0.7431991732850367, "num_chars": 49}, {"sum_logits": -39.497615814208984, "num_tokens": 11, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -48.344459533691406, "logits_per_token": -3.5906923467462715, "logits_per_char": -0.7744630551805683, "num_chars": 51}, {"sum_logits": -29.86908721923828, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -37.72799301147461, "logits_per_token": -4.267012459891183, "logits_per_char": -0.6095732085558833, "num_chars": 49}, {"sum_logits": -48.75431823730469, "num_tokens": 11, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -59.98271179199219, "logits_per_token": -4.432210748845881, "logits_per_char": -0.9750863647460938, "num_chars": 50}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1041, "native_id": "Mercury_SC_402121", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 12.648122787475586, "incorrect_loss_raw": 16.580581665039062, "correct_loss_per_char": 0.30114578065418063, "incorrect_loss_per_char": 0.3453977407816968, "correct_loss_per_token": 1.8068746839250838, "incorrect_loss_per_token": 2.292185579027448, "correct_loss_uncond": -13.293092727661133, "incorrect_loss_uncond": -17.770719528198242}, "model_output": [{"sum_logits": -12.846782684326172, "num_tokens": 8, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -23.3593807220459, "logits_per_token": -1.6058478355407715, "logits_per_char": -0.3058757781982422, "num_chars": 42}, {"sum_logits": -12.648122787475586, "num_tokens": 7, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -25.94121551513672, "logits_per_token": -1.8068746839250838, "logits_per_char": -0.30114578065418063, "num_chars": 42}, {"sum_logits": -17.56136703491211, "num_tokens": 7, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -41.32651901245117, "logits_per_token": -2.5087667192731584, "logits_per_char": -0.3512273406982422, "num_chars": 50}, {"sum_logits": -19.333595275878906, "num_tokens": 7, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -38.368003845214844, "logits_per_token": -2.7619421822684154, "logits_per_char": -0.379090103448606, "num_chars": 51}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1042, "native_id": "NYSEDREGENTS_2015_4_7", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 0.8571354150772095, "incorrect_loss_raw": 3.2379595041275024, "correct_loss_per_char": 0.09523726834191217, "incorrect_loss_per_char": 0.23751833372645909, "correct_loss_per_token": 0.8571354150772095, "incorrect_loss_per_token": 1.5613202651341758, "correct_loss_uncond": -11.55843198299408, "incorrect_loss_uncond": -10.597278952598572}, "model_output": [{"sum_logits": -0.8571354150772095, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": true, "sum_logits_uncond": -12.415567398071289, "logits_per_token": -0.8571354150772095, "logits_per_char": -0.09523726834191217, "num_chars": 9}, {"sum_logits": -1.4460012912750244, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -11.784217834472656, "logits_per_token": -1.4460012912750244, "logits_per_char": -0.1606668101416694, "num_chars": 9}, {"sum_logits": -5.0299177169799805, "num_tokens": 3, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -15.886259078979492, "logits_per_token": -1.676639238993327, "logits_per_char": -0.3143698573112488, "num_chars": 16}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1043, "native_id": "MCAS_2012_5_23614", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 8.365734100341797, "incorrect_loss_raw": 15.282275517781576, "correct_loss_per_char": 0.26142919063568115, "incorrect_loss_per_char": 0.44608552720811634, "correct_loss_per_token": 1.3942890167236328, "incorrect_loss_per_token": 2.388153878469316, "correct_loss_uncond": -20.657760620117188, "incorrect_loss_uncond": -17.272872924804688}, "model_output": [{"sum_logits": -20.020397186279297, "num_tokens": 7, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -33.547245025634766, "logits_per_token": -2.8600567408970425, "logits_per_char": -0.5561221440633138, "num_chars": 36}, {"sum_logits": -16.536773681640625, "num_tokens": 6, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -31.062204360961914, "logits_per_token": -2.756128946940104, "logits_per_char": -0.472479248046875, "num_chars": 35}, {"sum_logits": -8.365734100341797, "num_tokens": 6, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -29.023494720458984, "logits_per_token": -1.3942890167236328, "logits_per_char": -0.26142919063568115, "num_chars": 32}, {"sum_logits": -9.289655685424805, "num_tokens": 6, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -33.05599594116211, "logits_per_token": -1.5482759475708008, "logits_per_char": -0.30965518951416016, "num_chars": 30}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1044, "native_id": "Mercury_407262", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 27.137557983398438, "incorrect_loss_raw": 37.22844696044922, "correct_loss_per_char": 0.798163470099954, "incorrect_loss_per_char": 0.957429798250276, "correct_loss_per_token": 3.876793997628348, "incorrect_loss_per_token": 5.59878406827412, "correct_loss_uncond": -7.739505767822266, "incorrect_loss_uncond": -5.364968617757161}, "model_output": [{"sum_logits": -27.137557983398438, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -34.8770637512207, "logits_per_token": -3.876793997628348, "logits_per_char": -0.798163470099954, "num_chars": 34}, {"sum_logits": -39.73807144165039, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -42.374603271484375, "logits_per_token": -5.676867348807199, "logits_per_char": -0.9934517860412597, "num_chars": 40}, {"sum_logits": -36.61252212524414, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -48.21402359008789, "logits_per_token": -5.2303603036063055, "logits_per_char": -1.017014503479004, "num_chars": 36}, {"sum_logits": -35.334747314453125, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -37.191619873046875, "logits_per_token": -5.8891245524088545, "logits_per_char": -0.861823105230564, "num_chars": 41}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1045, "native_id": "MCAS_2014_8_6", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 36.78092956542969, "incorrect_loss_raw": 38.1880989074707, "correct_loss_per_char": 0.5330569502236187, "incorrect_loss_per_char": 0.5732685589979954, "correct_loss_per_token": 2.452061971028646, "incorrect_loss_per_token": 3.01201308486808, "correct_loss_uncond": -16.729198455810547, "incorrect_loss_uncond": -12.991820017496744}, "model_output": [{"sum_logits": -34.850563049316406, "num_tokens": 12, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -45.659332275390625, "logits_per_token": -2.9042135874430337, "logits_per_char": -0.5713207057264985, "num_chars": 61}, {"sum_logits": -36.78092956542969, "num_tokens": 15, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -53.510128021240234, "logits_per_token": -2.452061971028646, "logits_per_char": -0.5330569502236187, "num_chars": 69}, {"sum_logits": -43.51177215576172, "num_tokens": 13, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -56.24937438964844, "logits_per_token": -3.3470593965970554, "logits_per_char": -0.6592692750872988, "num_chars": 66}, {"sum_logits": -36.201961517333984, "num_tokens": 13, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -51.63105010986328, "logits_per_token": -2.7847662705641527, "logits_per_char": -0.48921569618018895, "num_chars": 74}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1046, "native_id": "Mercury_7032515", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 10.91868782043457, "incorrect_loss_raw": 10.740111351013184, "correct_loss_per_char": 0.6824179887771606, "incorrect_loss_per_char": 1.0294577047881053, "correct_loss_per_token": 2.183737564086914, "incorrect_loss_per_token": 4.183404816521539, "correct_loss_uncond": -10.424236297607422, "incorrect_loss_uncond": -5.835453033447266}, "model_output": [{"sum_logits": -10.860618591308594, "num_tokens": 2, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -15.29796314239502, "logits_per_token": -5.430309295654297, "logits_per_char": -0.9873289628462358, "num_chars": 11}, {"sum_logits": -11.558443069458008, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -16.685619354248047, "logits_per_token": -3.8528143564860025, "logits_per_char": -1.284271452162001, "num_chars": 9}, {"sum_logits": -9.80127239227295, "num_tokens": 3, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -17.74311065673828, "logits_per_token": -3.2670907974243164, "logits_per_char": -0.8167726993560791, "num_chars": 12}, {"sum_logits": -10.91868782043457, "num_tokens": 5, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -21.342924118041992, "logits_per_token": -2.183737564086914, "logits_per_char": -0.6824179887771606, "num_chars": 16}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1047, "native_id": "Mercury_7270165", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.209102630615234, "incorrect_loss_raw": 12.969112714131674, "correct_loss_per_char": 0.16444826126098633, "incorrect_loss_per_char": 0.23527271390831248, "correct_loss_per_token": 0.7674252192179362, "incorrect_loss_per_token": 1.1906058899079912, "correct_loss_uncond": -14.133516311645508, "incorrect_loss_uncond": -11.825210889180502}, "model_output": [{"sum_logits": -11.571064949035645, "num_tokens": 11, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -24.61472511291504, "logits_per_token": -1.0519149953668767, "logits_per_char": -0.2030011394567657, "num_chars": 57}, {"sum_logits": -14.512794494628906, "num_tokens": 10, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -25.8717041015625, "logits_per_token": -1.4512794494628907, "logits_per_char": -0.2738263112194133, "num_chars": 53}, {"sum_logits": -9.209102630615234, "num_tokens": 12, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -23.342618942260742, "logits_per_token": -0.7674252192179362, "logits_per_char": -0.16444826126098633, "num_chars": 56}, {"sum_logits": -12.823478698730469, "num_tokens": 12, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -23.896541595458984, "logits_per_token": -1.0686232248942058, "logits_per_char": -0.22899069104875838, "num_chars": 56}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1048, "native_id": "Mercury_7017045", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 5.020405292510986, "incorrect_loss_raw": 7.457711060841878, "correct_loss_per_char": 0.3861850225008451, "incorrect_loss_per_char": 0.4717609890398548, "correct_loss_per_token": 1.6734684308369954, "incorrect_loss_per_token": 2.485903686947293, "correct_loss_uncond": -12.828423976898193, "incorrect_loss_uncond": -14.310314019521078}, "model_output": [{"sum_logits": -6.6831231117248535, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -20.901451110839844, "logits_per_token": -2.2277077039082847, "logits_per_char": -0.5140863932096041, "num_chars": 13}, {"sum_logits": -5.020405292510986, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -17.84882926940918, "logits_per_token": -1.6734684308369954, "logits_per_char": -0.3861850225008451, "num_chars": 13}, {"sum_logits": -7.641199111938477, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -22.026491165161133, "logits_per_token": -2.5470663706461587, "logits_per_char": -0.4775749444961548, "num_chars": 16}, {"sum_logits": -8.048810958862305, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -22.37613296508789, "logits_per_token": -2.682936986287435, "logits_per_char": -0.4236216294138055, "num_chars": 19}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1049, "native_id": "Mercury_SC_400386", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 23.29787826538086, "incorrect_loss_raw": 21.188905080159504, "correct_loss_per_char": 0.862884380199291, "incorrect_loss_per_char": 0.8106044924393561, "correct_loss_per_token": 3.3282683236258372, "incorrect_loss_per_token": 3.3639270540267696, "correct_loss_uncond": -8.705337524414062, "incorrect_loss_uncond": -5.189261118570964}, "model_output": [{"sum_logits": -23.29787826538086, "num_tokens": 7, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -32.00321578979492, "logits_per_token": -3.3282683236258372, "logits_per_char": -0.862884380199291, "num_chars": 27}, {"sum_logits": -17.21379280090332, "num_tokens": 6, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -23.55771827697754, "logits_per_token": -2.86896546681722, "logits_per_char": -0.7484257739523182, "num_chars": 23}, {"sum_logits": -21.112197875976562, "num_tokens": 7, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -28.614336013793945, "logits_per_token": -3.016028267996652, "logits_per_char": -0.7819332546657987, "num_chars": 27}, {"sum_logits": -25.240724563598633, "num_tokens": 6, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -26.962444305419922, "logits_per_token": -4.2067874272664385, "logits_per_char": -0.9014544486999512, "num_chars": 28}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1050, "native_id": "Mercury_400750", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.194721221923828, "incorrect_loss_raw": 3.4392776489257812, "correct_loss_per_char": 2.097360610961914, "incorrect_loss_per_char": 1.7196388244628906, "correct_loss_per_token": 4.194721221923828, "incorrect_loss_per_token": 3.4392776489257812, "correct_loss_uncond": -2.621243953704834, "incorrect_loss_uncond": -2.721587657928467}, "model_output": [{"sum_logits": -3.16202449798584, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -5.77780818939209, "logits_per_token": -3.16202449798584, "logits_per_char": -1.58101224899292, "num_chars": 2}, {"sum_logits": -4.382983684539795, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -6.237893581390381, "logits_per_token": -4.382983684539795, "logits_per_char": -2.1914918422698975, "num_chars": 2}, {"sum_logits": -2.772824764251709, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -6.466894149780273, "logits_per_token": -2.772824764251709, "logits_per_char": -1.3864123821258545, "num_chars": 2}, {"sum_logits": -4.194721221923828, "num_tokens": 1, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -6.815965175628662, "logits_per_token": -4.194721221923828, "logits_per_char": -2.097360610961914, "num_chars": 2}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1051, "native_id": "MCAS_2006_9_28-v1", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 33.58848190307617, "incorrect_loss_raw": 32.86306381225586, "correct_loss_per_char": 0.44784642537434893, "incorrect_loss_per_char": 0.4823449566630738, "correct_loss_per_token": 2.0992801189422607, "incorrect_loss_per_token": 2.4704297240506348, "correct_loss_uncond": -16.214275360107422, "incorrect_loss_uncond": -17.699289957682293}, "model_output": [{"sum_logits": -28.235614776611328, "num_tokens": 11, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -45.03879928588867, "logits_per_token": -2.56687407060103, "logits_per_char": -0.47059357961018883, "num_chars": 60}, {"sum_logits": -32.377098083496094, "num_tokens": 14, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -48.12326431274414, "logits_per_token": -2.3126498631068637, "logits_per_char": -0.4832402699029268, "num_chars": 67}, {"sum_logits": -33.58848190307617, "num_tokens": 16, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -49.802757263183594, "logits_per_token": -2.0992801189422607, "logits_per_char": -0.44784642537434893, "num_chars": 75}, {"sum_logits": -37.976478576660156, "num_tokens": 15, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -58.52499771118164, "logits_per_token": -2.5317652384440104, "logits_per_char": -0.4932010204761059, "num_chars": 77}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1052, "native_id": "Mercury_416376", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 7.811324119567871, "incorrect_loss_raw": 6.217747529347737, "correct_loss_per_char": 0.8679249021742079, "incorrect_loss_per_char": 0.6676087732668275, "correct_loss_per_token": 7.811324119567871, "incorrect_loss_per_token": 3.527410904566447, "correct_loss_uncond": -4.682721138000488, "incorrect_loss_uncond": -8.372185866038004}, "model_output": [{"sum_logits": -2.5112228393554688, "num_tokens": 1, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -12.540550231933594, "logits_per_token": -2.5112228393554688, "logits_per_char": -0.20926856994628906, "num_chars": 12}, {"sum_logits": -5.552131175994873, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -15.723690032958984, "logits_per_token": -2.7760655879974365, "logits_per_char": -0.6169034639994303, "num_chars": 9}, {"sum_logits": -10.589888572692871, "num_tokens": 2, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -15.505559921264648, "logits_per_token": -5.2949442863464355, "logits_per_char": -1.1766542858547635, "num_chars": 9}, {"sum_logits": -7.811324119567871, "num_tokens": 1, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -12.49404525756836, "logits_per_token": -7.811324119567871, "logits_per_char": -0.8679249021742079, "num_chars": 9}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1053, "native_id": "Mercury_7086520", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 6.594603538513184, "incorrect_loss_raw": 11.684514999389648, "correct_loss_per_char": 0.2997547062960538, "incorrect_loss_per_char": 0.4263596740188043, "correct_loss_per_token": 1.0991005897521973, "incorrect_loss_per_token": 1.862829087272523, "correct_loss_uncond": -19.067769050598145, "incorrect_loss_uncond": -17.06454849243164}, "model_output": [{"sum_logits": -6.594603538513184, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -25.662372589111328, "logits_per_token": -1.0991005897521973, "logits_per_char": -0.2997547062960538, "num_chars": 22}, {"sum_logits": -8.161954879760742, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -27.904634475708008, "logits_per_token": -1.360325813293457, "logits_per_char": -0.35486760346785834, "num_chars": 23}, {"sum_logits": -16.233240127563477, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -30.00767707824707, "logits_per_token": -2.7055400212605796, "logits_per_char": -0.6012311158356843, "num_chars": 27}, {"sum_logits": -10.658349990844727, "num_tokens": 7, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -28.33487892150879, "logits_per_token": -1.5226214272635323, "logits_per_char": -0.3229803027528705, "num_chars": 33}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1054, "native_id": "Mercury_7014333", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.224239349365234, "incorrect_loss_raw": 12.043893178304037, "correct_loss_per_char": 0.9294763044877485, "incorrect_loss_per_char": 0.8614915173004429, "correct_loss_per_token": 3.4080797831217446, "incorrect_loss_per_token": 3.668850395414564, "correct_loss_uncond": -6.304409027099609, "incorrect_loss_uncond": -6.706991831461589}, "model_output": [{"sum_logits": -10.224239349365234, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -16.528648376464844, "logits_per_token": -3.4080797831217446, "logits_per_char": -0.9294763044877485, "num_chars": 11}, {"sum_logits": -10.329768180847168, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -18.030113220214844, "logits_per_token": -3.443256060282389, "logits_per_char": -0.9390698346224698, "num_chars": 11}, {"sum_logits": -13.35380744934082, "num_tokens": 3, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -17.179340362548828, "logits_per_token": -4.451269149780273, "logits_per_char": -0.95384338923863, "num_chars": 14}, {"sum_logits": -12.448103904724121, "num_tokens": 4, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -21.043201446533203, "logits_per_token": -3.1120259761810303, "logits_per_char": -0.691561328040229, "num_chars": 18}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1055, "native_id": "Mercury_SC_406623", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.05820083618164, "incorrect_loss_raw": 19.027090072631836, "correct_loss_per_char": 0.35852859133765813, "incorrect_loss_per_char": 0.5058645445203024, "correct_loss_per_token": 2.151171548025949, "incorrect_loss_per_token": 2.6161599159240723, "correct_loss_uncond": -23.173137664794922, "incorrect_loss_uncond": -18.21367009480794}, "model_output": [{"sum_logits": -15.05820083618164, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -38.23133850097656, "logits_per_token": -2.151171548025949, "logits_per_char": -0.35852859133765813, "num_chars": 42}, {"sum_logits": -17.135295867919922, "num_tokens": 8, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -38.6187858581543, "logits_per_token": -2.1419119834899902, "logits_per_char": -0.40798323495047434, "num_chars": 42}, {"sum_logits": -21.350740432739258, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -42.454803466796875, "logits_per_token": -3.050105776105608, "logits_per_char": -0.593076123131646, "num_chars": 36}, {"sum_logits": -18.595233917236328, "num_tokens": 7, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -30.648691177368164, "logits_per_token": -2.6564619881766185, "logits_per_char": -0.5165342754787869, "num_chars": 36}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1056, "native_id": "Mercury_7042648", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.539024353027344, "incorrect_loss_raw": 7.795979181925456, "correct_loss_per_char": 0.567378044128418, "incorrect_loss_per_char": 0.6724068941893401, "correct_loss_per_token": 4.539024353027344, "incorrect_loss_per_token": 5.774517854054769, "correct_loss_uncond": -9.478899002075195, "incorrect_loss_uncond": -7.348766009012858}, "model_output": [{"sum_logits": -4.539024353027344, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -14.017923355102539, "logits_per_token": -4.539024353027344, "logits_per_char": -0.567378044128418, "num_chars": 8}, {"sum_logits": -2.456131935119629, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -15.023412704467773, "logits_per_token": -2.456131935119629, "logits_per_char": -0.27290354834662545, "num_chars": 9}, {"sum_logits": -12.128767967224121, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -15.73462200164795, "logits_per_token": -6.0643839836120605, "logits_per_char": -1.0107306639353435, "num_chars": 12}, {"sum_logits": -8.803037643432617, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -14.676200866699219, "logits_per_token": -8.803037643432617, "logits_per_char": -0.7335864702860514, "num_chars": 12}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1057, "native_id": "MCAS_2004_8_23", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.148427963256836, "incorrect_loss_raw": 17.105236371358234, "correct_loss_per_char": 1.5589479966597124, "incorrect_loss_per_char": 1.5015390087859801, "correct_loss_per_token": 3.429685592651367, "incorrect_loss_per_token": 3.2570810635884606, "correct_loss_uncond": -15.010229110717773, "incorrect_loss_uncond": -16.491809527079266}, "model_output": [{"sum_logits": -15.379688262939453, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -31.450847625732422, "logits_per_token": -3.0759376525878905, "logits_per_char": -1.3981534784490413, "num_chars": 11}, {"sum_logits": -17.148427963256836, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -32.15865707397461, "logits_per_token": -3.429685592651367, "logits_per_char": -1.5589479966597124, "num_chars": 11}, {"sum_logits": -14.756958961486816, "num_tokens": 6, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -33.26573944091797, "logits_per_token": -2.4594931602478027, "logits_per_char": -1.3415417237715288, "num_chars": 11}, {"sum_logits": -21.179061889648438, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -36.07455062866211, "logits_per_token": -4.235812377929688, "logits_per_char": -1.7649218241373699, "num_chars": 12}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1058, "native_id": "MCAS_2013_8_29425", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.92422103881836, "incorrect_loss_raw": 18.597925821940105, "correct_loss_per_char": 0.40831335996970153, "incorrect_loss_per_char": 0.44599557706541176, "correct_loss_per_token": 1.76935789320204, "incorrect_loss_per_token": 2.066436202437789, "correct_loss_uncond": -12.941917419433594, "incorrect_loss_uncond": -11.951621373494467}, "model_output": [{"sum_logits": -15.92422103881836, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -28.866138458251953, "logits_per_token": -1.76935789320204, "logits_per_char": -0.40831335996970153, "num_chars": 39}, {"sum_logits": -19.728023529052734, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -31.21287727355957, "logits_per_token": -2.192002614339193, "logits_per_char": -0.5058467571551983, "num_chars": 39}, {"sum_logits": -14.150585174560547, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -25.9534912109375, "logits_per_token": -1.5722872416178386, "logits_per_char": -0.34513622376976943, "num_chars": 41}, {"sum_logits": -21.91516876220703, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -34.48227310180664, "logits_per_token": -2.4350187513563366, "logits_per_char": -0.48700375027126735, "num_chars": 45}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1059, "native_id": "MEAP_2005_5_15", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 18.463001251220703, "incorrect_loss_raw": 36.53802490234375, "correct_loss_per_char": 0.3483585141739755, "incorrect_loss_per_char": 0.6334895534515982, "correct_loss_per_token": 1.678454659201882, "incorrect_loss_per_token": 2.9685591715582866, "correct_loss_uncond": -4.106180191040039, "incorrect_loss_uncond": -11.001871744791666}, "model_output": [{"sum_logits": -32.53953552246094, "num_tokens": 11, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -42.393959045410156, "logits_per_token": -2.958139592950994, "logits_per_char": -0.664072153519611, "num_chars": 49}, {"sum_logits": -18.463001251220703, "num_tokens": 11, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -22.569181442260742, "logits_per_token": -1.678454659201882, "logits_per_char": -0.3483585141739755, "num_chars": 53}, {"sum_logits": -37.14595031738281, "num_tokens": 12, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -48.24146270751953, "logits_per_token": -3.095495859781901, "logits_per_char": -0.6404474192652209, "num_chars": 58}, {"sum_logits": -39.9285888671875, "num_tokens": 14, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -51.98426818847656, "logits_per_token": -2.8520420619419644, "logits_per_char": -0.5959490875699627, "num_chars": 67}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1060, "native_id": "Mercury_7016258", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 11.227585792541504, "incorrect_loss_raw": 13.611608823140463, "correct_loss_per_char": 0.3118773831261529, "incorrect_loss_per_char": 0.3915005339201165, "correct_loss_per_token": 1.603940827505929, "incorrect_loss_per_token": 2.2686014705234103, "correct_loss_uncond": -21.706664085388184, "incorrect_loss_uncond": -15.285405158996582}, "model_output": [{"sum_logits": -8.229050636291504, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -28.335805892944336, "logits_per_token": -1.3715084393819172, "logits_per_char": -0.2571578323841095, "num_chars": 32}, {"sum_logits": -11.227585792541504, "num_tokens": 7, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -32.93424987792969, "logits_per_token": -1.603940827505929, "logits_per_char": -0.3118773831261529, "num_chars": 36}, {"sum_logits": -14.650995254516602, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -27.95104217529297, "logits_per_token": -2.4418325424194336, "logits_per_char": -0.41859986441476005, "num_chars": 35}, {"sum_logits": -17.95478057861328, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -30.404193878173828, "logits_per_token": -2.9924634297688804, "logits_per_char": -0.49874390496148, "num_chars": 36}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1061, "native_id": "NCEOGA_2013_8_5", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 22.93010139465332, "incorrect_loss_raw": 17.957529703776043, "correct_loss_per_char": 0.5211386680603027, "incorrect_loss_per_char": 0.6040306333511595, "correct_loss_per_token": 3.8216835657755532, "incorrect_loss_per_token": 4.171266937255859, "correct_loss_uncond": -9.969480514526367, "incorrect_loss_uncond": -10.593666076660156}, "model_output": [{"sum_logits": -22.93010139465332, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -32.89958190917969, "logits_per_token": -3.8216835657755532, "logits_per_char": -0.5211386680603027, "num_chars": 44}, {"sum_logits": -16.334671020507812, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -27.378238677978516, "logits_per_token": -4.083667755126953, "logits_per_char": -0.583381107875279, "num_chars": 28}, {"sum_logits": -18.45098876953125, "num_tokens": 4, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -29.57184600830078, "logits_per_token": -4.6127471923828125, "logits_per_char": -0.6833699544270834, "num_chars": 27}, {"sum_logits": -19.086929321289062, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -28.703502655029297, "logits_per_token": -3.8173858642578127, "logits_per_char": -0.5453408377511161, "num_chars": 35}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1062, "native_id": "Mercury_7015540", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 7.5874738693237305, "incorrect_loss_raw": 11.118603070576986, "correct_loss_per_char": 0.32989016823146655, "incorrect_loss_per_char": 0.4446541866995685, "correct_loss_per_token": 1.8968684673309326, "incorrect_loss_per_token": 2.0947184668646917, "correct_loss_uncond": -13.375784873962402, "incorrect_loss_uncond": -14.53577963511149}, "model_output": [{"sum_logits": -10.175333023071289, "num_tokens": 5, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -26.112350463867188, "logits_per_token": -2.035066604614258, "logits_per_char": -0.44240578361179517, "num_chars": 23}, {"sum_logits": -7.5874738693237305, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -20.963258743286133, "logits_per_token": -1.8968684673309326, "logits_per_char": -0.32989016823146655, "num_chars": 23}, {"sum_logits": -11.570282936096191, "num_tokens": 5, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -27.279333114624023, "logits_per_token": -2.3140565872192385, "logits_per_char": -0.4450108821575458, "num_chars": 26}, {"sum_logits": -11.610193252563477, "num_tokens": 6, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -23.57146453857422, "logits_per_token": -1.9350322087605794, "logits_per_char": -0.4465458943293645, "num_chars": 26}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1063, "native_id": "Mercury_SC_414001", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 29.578838348388672, "incorrect_loss_raw": 30.87686538696289, "correct_loss_per_char": 0.41660335701955875, "incorrect_loss_per_char": 0.4269148576447106, "correct_loss_per_token": 1.9719225565592449, "incorrect_loss_per_token": 2.058457692464193, "correct_loss_uncond": -22.24768829345703, "incorrect_loss_uncond": -20.632788340250652}, "model_output": [{"sum_logits": -30.440044403076172, "num_tokens": 15, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -50.64046096801758, "logits_per_token": -2.0293362935384116, "logits_per_char": -0.41698690963118046, "num_chars": 73}, {"sum_logits": -32.735107421875, "num_tokens": 15, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -51.941612243652344, "logits_per_token": -2.1823404947916667, "logits_per_char": -0.4546542697482639, "num_chars": 72}, {"sum_logits": -29.4554443359375, "num_tokens": 15, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -51.9468879699707, "logits_per_token": -1.9636962890625, "logits_per_char": -0.4091033935546875, "num_chars": 72}, {"sum_logits": -29.578838348388672, "num_tokens": 15, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -51.8265266418457, "logits_per_token": -1.9719225565592449, "logits_per_char": -0.41660335701955875, "num_chars": 71}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1064, "native_id": "Mercury_7017973", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 18.757448196411133, "incorrect_loss_raw": 16.584584554036457, "correct_loss_per_char": 0.8155412259309188, "incorrect_loss_per_char": 0.9996351578656365, "correct_loss_per_token": 4.689362049102783, "incorrect_loss_per_token": 6.455811818440755, "correct_loss_uncond": -8.149908065795898, "incorrect_loss_uncond": -5.464317321777344}, "model_output": [{"sum_logits": -15.081104278564453, "num_tokens": 3, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -23.630962371826172, "logits_per_token": -5.027034759521484, "logits_per_char": -0.8871237810920266, "num_chars": 17}, {"sum_logits": -17.975543975830078, "num_tokens": 3, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -22.76665496826172, "logits_per_token": -5.991847991943359, "logits_per_char": -0.9986413319905599, "num_chars": 18}, {"sum_logits": -16.697105407714844, "num_tokens": 2, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -19.749088287353516, "logits_per_token": -8.348552703857422, "logits_per_char": -1.1131403605143229, "num_chars": 15}, {"sum_logits": -18.757448196411133, "num_tokens": 4, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -26.90735626220703, "logits_per_token": -4.689362049102783, "logits_per_char": -0.8155412259309188, "num_chars": 23}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1065, "native_id": "Mercury_407097", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 25.05532455444336, "incorrect_loss_raw": 26.084204991658527, "correct_loss_per_char": 0.41074302548267805, "incorrect_loss_per_char": 0.45915520756997763, "correct_loss_per_token": 2.08794371287028, "incorrect_loss_per_token": 2.0931909561157225, "correct_loss_uncond": -18.414989471435547, "incorrect_loss_uncond": -15.424965540568033}, "model_output": [{"sum_logits": -15.168272018432617, "num_tokens": 10, "num_tokens_all": 232, "is_greedy": false, "sum_logits_uncond": -30.454097747802734, "logits_per_token": -1.5168272018432618, "logits_per_char": -0.32272919188154503, "num_chars": 47}, {"sum_logits": -33.4273681640625, "num_tokens": 12, "num_tokens_all": 234, "is_greedy": false, "sum_logits_uncond": -48.19601821899414, "logits_per_token": -2.785614013671875, "logits_per_char": -0.6428340031550481, "num_chars": 52}, {"sum_logits": -25.05532455444336, "num_tokens": 12, "num_tokens_all": 234, "is_greedy": false, "sum_logits_uncond": -43.470314025878906, "logits_per_token": -2.08794371287028, "logits_per_char": -0.41074302548267805, "num_chars": 61}, {"sum_logits": -29.65697479248047, "num_tokens": 15, "num_tokens_all": 237, "is_greedy": false, "sum_logits_uncond": -45.87739562988281, "logits_per_token": -1.9771316528320313, "logits_per_char": -0.41190242767333984, "num_chars": 72}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1066, "native_id": "Mercury_SC_406794", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.499677658081055, "incorrect_loss_raw": 20.624465306599934, "correct_loss_per_char": 0.21110394795735676, "incorrect_loss_per_char": 0.4587929143878117, "correct_loss_per_token": 0.9499677658081055, "incorrect_loss_per_token": 2.407725802174321, "correct_loss_uncond": -17.28072166442871, "incorrect_loss_uncond": -9.055915196736654}, "model_output": [{"sum_logits": -20.46633529663086, "num_tokens": 9, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -27.934730529785156, "logits_per_token": -2.2740372551812067, "logits_per_char": -0.4651439840143377, "num_chars": 44}, {"sum_logits": -25.081605911254883, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -30.581172943115234, "logits_per_token": -3.1352007389068604, "logits_per_char": -0.5225334564844767, "num_chars": 48}, {"sum_logits": -9.499677658081055, "num_tokens": 10, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -26.780399322509766, "logits_per_token": -0.9499677658081055, "logits_per_char": -0.21110394795735676, "num_chars": 45}, {"sum_logits": -16.325454711914062, "num_tokens": 9, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -30.525238037109375, "logits_per_token": -1.8139394124348958, "logits_per_char": -0.38870130266462055, "num_chars": 42}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1067, "native_id": "Mercury_7227710", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.246922969818115, "incorrect_loss_raw": 6.972916126251221, "correct_loss_per_char": 0.4769929972561923, "incorrect_loss_per_char": 0.757381631354405, "correct_loss_per_token": 1.748974323272705, "incorrect_loss_per_token": 2.9842390060424804, "correct_loss_uncond": -12.345203876495361, "incorrect_loss_uncond": -9.782705783843994}, "model_output": [{"sum_logits": -8.033175468444824, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -16.553110122680664, "logits_per_token": -4.016587734222412, "logits_per_char": -0.7302886789495294, "num_chars": 11}, {"sum_logits": -7.863382339477539, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -15.444108009338379, "logits_per_token": -3.9316911697387695, "logits_per_char": -1.123340334211077, "num_chars": 7}, {"sum_logits": -5.022190570831299, "num_tokens": 5, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -18.2696475982666, "logits_per_token": -1.0044381141662597, "logits_per_char": -0.4185158809026082, "num_chars": 12}, {"sum_logits": -5.246922969818115, "num_tokens": 3, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -17.592126846313477, "logits_per_token": -1.748974323272705, "logits_per_char": -0.4769929972561923, "num_chars": 11}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1068, "native_id": "Mercury_SC_406710", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.844449996948242, "incorrect_loss_raw": 11.196218490600586, "correct_loss_per_char": 0.7229633331298828, "incorrect_loss_per_char": 0.8497489689391253, "correct_loss_per_token": 2.7111124992370605, "incorrect_loss_per_token": 3.1379703150855165, "correct_loss_uncond": -12.631603240966797, "incorrect_loss_uncond": -11.068458557128906}, "model_output": [{"sum_logits": -12.20096492767334, "num_tokens": 3, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -19.712369918823242, "logits_per_token": -4.066988309224446, "logits_per_char": -1.1091786297884854, "num_chars": 11}, {"sum_logits": -7.164741516113281, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -22.254749298095703, "logits_per_token": -1.7911853790283203, "logits_per_char": -0.5511339627779447, "num_chars": 13}, {"sum_logits": -10.844449996948242, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -23.47605323791504, "logits_per_token": -2.7111124992370605, "logits_per_char": -0.7229633331298828, "num_chars": 15}, {"sum_logits": -14.222949028015137, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -24.82691192626953, "logits_per_token": -3.555737257003784, "logits_per_char": -0.888934314250946, "num_chars": 16}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1069, "native_id": "Mercury_401926", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.21065902709961, "incorrect_loss_raw": 5.610649188359578, "correct_loss_per_char": 0.7105329513549805, "incorrect_loss_per_char": 0.5013198326795528, "correct_loss_per_token": 4.736886342366536, "incorrect_loss_per_token": 3.225737081633674, "correct_loss_uncond": -9.729118347167969, "incorrect_loss_uncond": -9.904698928197226}, "model_output": [{"sum_logits": -5.156352996826172, "num_tokens": 1, "num_tokens_all": 245, "is_greedy": false, "sum_logits_uncond": -14.149662017822266, "logits_per_token": -5.156352996826172, "logits_per_char": -0.6445441246032715, "num_chars": 8}, {"sum_logits": -7.901634216308594, "num_tokens": 3, "num_tokens_all": 247, "is_greedy": false, "sum_logits_uncond": -18.173580169677734, "logits_per_token": -2.6338780721028647, "logits_per_char": -0.6078180166391226, "num_chars": 13}, {"sum_logits": -3.7739603519439697, "num_tokens": 2, "num_tokens_all": 246, "is_greedy": false, "sum_logits_uncond": -14.22280216217041, "logits_per_token": -1.8869801759719849, "logits_per_char": -0.25159735679626466, "num_chars": 15}, {"sum_logits": -14.21065902709961, "num_tokens": 3, "num_tokens_all": 247, "is_greedy": false, "sum_logits_uncond": -23.939777374267578, "logits_per_token": -4.736886342366536, "logits_per_char": -0.7105329513549805, "num_chars": 20}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1070, "native_id": "MCAS_2014_5_15", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 17.12124252319336, "incorrect_loss_raw": 23.356674830118816, "correct_loss_per_char": 0.31706004672580296, "incorrect_loss_per_char": 0.4320594426863705, "correct_loss_per_token": 1.9023602803548176, "incorrect_loss_per_token": 2.1170069376627603, "correct_loss_uncond": -10.396951675415039, "incorrect_loss_uncond": -12.71868896484375}, "model_output": [{"sum_logits": -26.021984100341797, "num_tokens": 12, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -37.445228576660156, "logits_per_token": -2.168498675028483, "logits_per_char": -0.49098083208192067, "num_chars": 53}, {"sum_logits": -18.42667579650879, "num_tokens": 9, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -31.489168167114258, "logits_per_token": -2.04740842183431, "logits_per_char": -0.3476731282360149, "num_chars": 53}, {"sum_logits": -25.62136459350586, "num_tokens": 12, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -39.29169464111328, "logits_per_token": -2.1351137161254883, "logits_per_char": -0.45752436774117605, "num_chars": 56}, {"sum_logits": -17.12124252319336, "num_tokens": 9, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -27.5181941986084, "logits_per_token": -1.9023602803548176, "logits_per_char": -0.31706004672580296, "num_chars": 54}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1071, "native_id": "Mercury_LBS10151", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.6401515007019043, "incorrect_loss_raw": 5.5024309158325195, "correct_loss_per_char": 0.9100378751754761, "incorrect_loss_per_char": 0.670276333036877, "correct_loss_per_token": 3.6401515007019043, "incorrect_loss_per_token": 4.493315696716309, "correct_loss_uncond": -8.037262439727783, "incorrect_loss_uncond": -8.182546933492025}, "model_output": [{"sum_logits": -3.6401515007019043, "num_tokens": 1, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -11.677413940429688, "logits_per_token": -3.6401515007019043, "logits_per_char": -0.9100378751754761, "num_chars": 4}, {"sum_logits": -4.786073684692383, "num_tokens": 1, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -13.44937515258789, "logits_per_token": -4.786073684692383, "logits_per_char": -0.7976789474487305, "num_chars": 6}, {"sum_logits": -5.66652774810791, "num_tokens": 1, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -13.479253768920898, "logits_per_token": -5.66652774810791, "logits_per_char": -0.8095039640154157, "num_chars": 7}, {"sum_logits": -6.054691314697266, "num_tokens": 2, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -14.126304626464844, "logits_per_token": -3.027345657348633, "logits_per_char": -0.40364608764648435, "num_chars": 15}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1072, "native_id": "ACTAAP_2013_5_8", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.923652648925781, "incorrect_loss_raw": 3.0433714787165322, "correct_loss_per_char": 0.6581836276584201, "incorrect_loss_per_char": 0.35742832201498526, "correct_loss_per_token": 5.923652648925781, "incorrect_loss_per_token": 2.7144784728686013, "correct_loss_uncond": -7.260749816894531, "incorrect_loss_uncond": -9.012813925743103}, "model_output": [{"sum_logits": -2.619586944580078, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -2.619586944580078, "logits_per_char": -0.4365978240966797, "num_chars": 6}, {"sum_logits": -5.923652648925781, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -13.184402465820312, "logits_per_token": -5.923652648925781, "logits_per_char": -0.6581836276584201, "num_chars": 9}, {"sum_logits": -4.537169456481934, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -14.025976181030273, "logits_per_token": -4.537169456481934, "logits_per_char": -0.5041299396091037, "num_chars": 9}, {"sum_logits": -1.9733580350875854, "num_tokens": 2, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -12.253585815429688, "logits_per_token": -0.9866790175437927, "logits_per_char": -0.13155720233917237, "num_chars": 15}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1073, "native_id": "Mercury_SC_407592", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.354538917541504, "incorrect_loss_raw": 3.8195362091064453, "correct_loss_per_char": 0.8709077835083008, "incorrect_loss_per_char": 0.6246613449520534, "correct_loss_per_token": 4.354538917541504, "incorrect_loss_per_token": 3.8195362091064453, "correct_loss_uncond": -8.109126091003418, "incorrect_loss_uncond": -8.419477462768555}, "model_output": [{"sum_logits": -4.354538917541504, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -12.463665008544922, "logits_per_token": -4.354538917541504, "logits_per_char": -0.8709077835083008, "num_chars": 5}, {"sum_logits": -2.8583836555480957, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -14.025976181030273, "logits_per_token": -2.8583836555480957, "logits_per_char": -0.3175981839497884, "num_chars": 9}, {"sum_logits": -4.909774303436279, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -4.909774303436279, "logits_per_char": -0.8182957172393799, "num_chars": 6}, {"sum_logits": -3.690450668334961, "num_tokens": 1, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -12.802070617675781, "logits_per_token": -3.690450668334961, "logits_per_char": -0.7380901336669922, "num_chars": 5}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1074, "native_id": "TIMSS_1995_8_L6", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 36.18370819091797, "incorrect_loss_raw": 40.080588022867836, "correct_loss_per_char": 0.3849330658608295, "incorrect_loss_per_char": 0.45906731585721666, "correct_loss_per_token": 1.8091854095458983, "incorrect_loss_per_token": 2.0070288314562457, "correct_loss_uncond": -20.25762176513672, "incorrect_loss_uncond": -17.281880696614582}, "model_output": [{"sum_logits": -44.923526763916016, "num_tokens": 22, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -63.09601593017578, "logits_per_token": -2.0419784892689097, "logits_per_char": -0.47287922909385277, "num_chars": 95}, {"sum_logits": -38.37530517578125, "num_tokens": 18, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -51.4471435546875, "logits_per_token": -2.131961398654514, "logits_per_char": -0.4796913146972656, "num_chars": 80}, {"sum_logits": -36.94293212890625, "num_tokens": 20, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -57.544246673583984, "logits_per_token": -1.8471466064453126, "logits_per_char": -0.4246314037805316, "num_chars": 87}, {"sum_logits": -36.18370819091797, "num_tokens": 20, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -56.44132995605469, "logits_per_token": -1.8091854095458983, "logits_per_char": -0.3849330658608295, "num_chars": 94}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1075, "native_id": "Mercury_7233398", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.076112747192383, "incorrect_loss_raw": 19.13845952351888, "correct_loss_per_char": 0.5155706147889834, "incorrect_loss_per_char": 0.5388415673849613, "correct_loss_per_token": 2.384514093399048, "incorrect_loss_per_token": 2.4004820528484525, "correct_loss_uncond": -15.789709091186523, "incorrect_loss_uncond": -17.373775482177734}, "model_output": [{"sum_logits": -14.959026336669922, "num_tokens": 7, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -31.926589965820312, "logits_per_token": -2.1370037623814175, "logits_per_char": -0.4274007524762835, "num_chars": 35}, {"sum_logits": -17.467317581176758, "num_tokens": 9, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -37.242576599121094, "logits_per_token": -1.9408130645751953, "logits_per_char": -0.5137446347404929, "num_chars": 34}, {"sum_logits": -19.076112747192383, "num_tokens": 8, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -34.865821838378906, "logits_per_token": -2.384514093399048, "logits_per_char": -0.5155706147889834, "num_chars": 37}, {"sum_logits": -24.98903465270996, "num_tokens": 8, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -40.36753845214844, "logits_per_token": -3.123629331588745, "logits_per_char": -0.6753793149381071, "num_chars": 37}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1076, "native_id": "Mercury_407664", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.07220458984375, "incorrect_loss_raw": 14.225175857543945, "correct_loss_per_char": 0.8410388577368951, "incorrect_loss_per_char": 0.8500407327924456, "correct_loss_per_token": 5.21444091796875, "incorrect_loss_per_token": 4.037299346923828, "correct_loss_uncond": -4.124881744384766, "incorrect_loss_uncond": -6.413389205932617}, "model_output": [{"sum_logits": -13.165609359741211, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -19.217113494873047, "logits_per_token": -4.38853645324707, "logits_per_char": -0.9404006685529437, "num_chars": 14}, {"sum_logits": -13.660334587097168, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -19.450624465942383, "logits_per_token": -4.553444862365723, "logits_per_char": -0.9757381847926548, "num_chars": 14}, {"sum_logits": -15.849583625793457, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -23.247957229614258, "logits_per_token": -3.1699167251586915, "logits_per_char": -0.6339833450317383, "num_chars": 25}, {"sum_logits": -26.07220458984375, "num_tokens": 5, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -30.197086334228516, "logits_per_token": -5.21444091796875, "logits_per_char": -0.8410388577368951, "num_chars": 31}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1077, "native_id": "Mercury_SC_408657", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 19.219161987304688, "incorrect_loss_raw": 18.399538040161133, "correct_loss_per_char": 0.5491189139229911, "incorrect_loss_per_char": 0.6398672467812725, "correct_loss_per_token": 3.8438323974609374, "incorrect_loss_per_token": 3.239167513166155, "correct_loss_uncond": -12.416946411132812, "incorrect_loss_uncond": -10.079348882039389}, "model_output": [{"sum_logits": -16.494400024414062, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -26.722169876098633, "logits_per_token": -3.2988800048828124, "logits_per_char": -0.6597760009765625, "num_chars": 25}, {"sum_logits": -15.565359115600586, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -27.162763595581055, "logits_per_token": -3.113071823120117, "logits_per_char": -0.5367365212276064, "num_chars": 29}, {"sum_logits": -23.13885498046875, "num_tokens": 7, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -31.551727294921875, "logits_per_token": -3.3055507114955356, "logits_per_char": -0.7230892181396484, "num_chars": 32}, {"sum_logits": -19.219161987304688, "num_tokens": 5, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -31.6361083984375, "logits_per_token": -3.8438323974609374, "logits_per_char": -0.5491189139229911, "num_chars": 35}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1078, "native_id": "Mercury_7142800", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 29.177839279174805, "incorrect_loss_raw": 27.880318959554035, "correct_loss_per_char": 0.8581717435051414, "incorrect_loss_per_char": 0.7564343668383157, "correct_loss_per_token": 4.862973213195801, "incorrect_loss_per_token": 3.2110546475365047, "correct_loss_uncond": -12.768949508666992, "incorrect_loss_uncond": -13.606197357177734}, "model_output": [{"sum_logits": -19.183700561523438, "num_tokens": 8, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -31.859371185302734, "logits_per_token": -2.3979625701904297, "logits_per_char": -0.6615069159146013, "num_chars": 29}, {"sum_logits": -29.177839279174805, "num_tokens": 6, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -41.9467887878418, "logits_per_token": -4.862973213195801, "logits_per_char": -0.8581717435051414, "num_chars": 34}, {"sum_logits": -18.421100616455078, "num_tokens": 7, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -27.395862579345703, "logits_per_token": -2.6315858023507253, "logits_per_char": -0.5116972393459744, "num_chars": 36}, {"sum_logits": -46.036155700683594, "num_tokens": 10, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -65.20431518554688, "logits_per_token": -4.603615570068359, "logits_per_char": -1.0960989452543712, "num_chars": 42}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1079, "native_id": "Mercury_SC_410837", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 17.29927635192871, "incorrect_loss_raw": 26.013036092122395, "correct_loss_per_char": 0.3680697096155045, "incorrect_loss_per_char": 0.5218154802776519, "correct_loss_per_token": 1.4416063626607258, "incorrect_loss_per_token": 2.7892236709594727, "correct_loss_uncond": -17.638898849487305, "incorrect_loss_uncond": -16.28972880045573}, "model_output": [{"sum_logits": -29.6781063079834, "num_tokens": 9, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -41.68765640258789, "logits_per_token": -3.297567367553711, "logits_per_char": -0.6595134735107422, "num_chars": 45}, {"sum_logits": -17.29927635192871, "num_tokens": 12, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -34.938175201416016, "logits_per_token": -1.4416063626607258, "logits_per_char": -0.3680697096155045, "num_chars": 47}, {"sum_logits": -19.76036834716797, "num_tokens": 8, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -36.74187469482422, "logits_per_token": -2.470046043395996, "logits_per_char": -0.3952073669433594, "num_chars": 50}, {"sum_logits": -28.60063362121582, "num_tokens": 11, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -48.478763580322266, "logits_per_token": -2.600057601928711, "logits_per_char": -0.510725600378854, "num_chars": 56}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1080, "native_id": "Mercury_7154315", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.411500930786133, "incorrect_loss_raw": 28.63377316792806, "correct_loss_per_char": 0.5079134794381949, "incorrect_loss_per_char": 0.5273976313323493, "correct_loss_per_token": 2.6411500930786134, "incorrect_loss_per_token": 2.9861725893887603, "correct_loss_uncond": -10.573251724243164, "incorrect_loss_uncond": -12.33030637105306}, "model_output": [{"sum_logits": -26.411500930786133, "num_tokens": 10, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -36.9847526550293, "logits_per_token": -2.6411500930786134, "logits_per_char": -0.5079134794381949, "num_chars": 52}, {"sum_logits": -25.2032470703125, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -40.014976501464844, "logits_per_token": -2.8003607855902777, "logits_per_char": -0.5143519810267857, "num_chars": 49}, {"sum_logits": -31.68744468688965, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -43.57567596435547, "logits_per_token": -3.5208271874321833, "logits_per_char": -0.5761353579434482, "num_chars": 55}, {"sum_logits": -29.01062774658203, "num_tokens": 11, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -39.30158615112305, "logits_per_token": -2.637329795143821, "logits_per_char": -0.49170555502681407, "num_chars": 59}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1081, "native_id": "Mercury_7239628", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.560173988342285, "incorrect_loss_raw": 16.114739735921223, "correct_loss_per_char": 0.4550054371356964, "incorrect_loss_per_char": 0.6222082595402222, "correct_loss_per_token": 2.912034797668457, "incorrect_loss_per_token": 3.210038661956787, "correct_loss_uncond": -19.25039577484131, "incorrect_loss_uncond": -16.513487497965496}, "model_output": [{"sum_logits": -18.872953414916992, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -29.924636840820312, "logits_per_token": -4.718238353729248, "logits_per_char": -0.8578615188598633, "num_chars": 22}, {"sum_logits": -10.909688949584961, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -30.37767791748047, "logits_per_token": -1.8182814915974934, "logits_per_char": -0.2948564580968908, "num_chars": 37}, {"sum_logits": -18.56157684326172, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -37.582366943359375, "logits_per_token": -3.0935961405436196, "logits_per_char": -0.7139068016639123, "num_chars": 26}, {"sum_logits": -14.560173988342285, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -33.810569763183594, "logits_per_token": -2.912034797668457, "logits_per_char": -0.4550054371356964, "num_chars": 32}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1082, "native_id": "Mercury_401241", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.140069007873535, "incorrect_loss_raw": 13.034507751464844, "correct_loss_per_char": 0.5070034503936768, "incorrect_loss_per_char": 0.5480520398276193, "correct_loss_per_token": 2.535017251968384, "incorrect_loss_per_token": 3.258626937866211, "correct_loss_uncond": -9.713257789611816, "incorrect_loss_uncond": -8.963801701863607}, "model_output": [{"sum_logits": -10.140069007873535, "num_tokens": 4, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -19.85332679748535, "logits_per_token": -2.535017251968384, "logits_per_char": -0.5070034503936768, "num_chars": 20}, {"sum_logits": -13.399944305419922, "num_tokens": 4, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -18.20255470275879, "logits_per_token": -3.3499860763549805, "logits_per_char": -0.6699972152709961, "num_chars": 20}, {"sum_logits": -13.107253074645996, "num_tokens": 4, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -24.71080780029297, "logits_per_token": -3.276813268661499, "logits_per_char": -0.5242901229858399, "num_chars": 25}, {"sum_logits": -12.596325874328613, "num_tokens": 4, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -23.081565856933594, "logits_per_token": -3.1490814685821533, "logits_per_char": -0.4498687812260219, "num_chars": 28}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1083, "native_id": "Mercury_SC_408251", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.769591808319092, "incorrect_loss_raw": 11.079013188680014, "correct_loss_per_char": 0.21154974400997162, "incorrect_loss_per_char": 0.3112355371765781, "correct_loss_per_token": 0.8461989760398865, "incorrect_loss_per_token": 1.4779525768189206, "correct_loss_uncond": -19.01150369644165, "incorrect_loss_uncond": -18.517260869344074}, "model_output": [{"sum_logits": -6.769591808319092, "num_tokens": 8, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -25.781095504760742, "logits_per_token": -0.8461989760398865, "logits_per_char": -0.21154974400997162, "num_chars": 32}, {"sum_logits": -5.970368385314941, "num_tokens": 8, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -24.40557098388672, "logits_per_token": -0.7462960481643677, "logits_per_char": -0.1755990701563218, "num_chars": 34}, {"sum_logits": -11.629915237426758, "num_tokens": 8, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -30.786510467529297, "logits_per_token": -1.4537394046783447, "logits_per_char": -0.2982029548058143, "num_chars": 39}, {"sum_logits": -15.63675594329834, "num_tokens": 7, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -33.59674072265625, "logits_per_token": -2.2338222776140486, "logits_per_char": -0.45990458656759825, "num_chars": 34}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1084, "native_id": "Mercury_7175893", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.095457077026367, "incorrect_loss_raw": 8.487207253774008, "correct_loss_per_char": 0.23161168531938034, "incorrect_loss_per_char": 0.34506882643847736, "correct_loss_per_token": 1.0190914154052735, "incorrect_loss_per_token": 1.6974414507548012, "correct_loss_uncond": -13.611059188842773, "incorrect_loss_uncond": -12.415388902028402}, "model_output": [{"sum_logits": -9.250487327575684, "num_tokens": 5, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -21.75225067138672, "logits_per_token": -1.8500974655151368, "logits_per_char": -0.33037454741341726, "num_chars": 28}, {"sum_logits": -8.875004768371582, "num_tokens": 5, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -20.687116622924805, "logits_per_token": -1.7750009536743163, "logits_per_char": -0.3858697725378949, "num_chars": 23}, {"sum_logits": -7.336129665374756, "num_tokens": 5, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -20.268421173095703, "logits_per_token": -1.467225933074951, "logits_per_char": -0.3189621593641198, "num_chars": 23}, {"sum_logits": -5.095457077026367, "num_tokens": 5, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -18.70651626586914, "logits_per_token": -1.0190914154052735, "logits_per_char": -0.23161168531938034, "num_chars": 22}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1085, "native_id": "Mercury_7202843", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 28.673330307006836, "incorrect_loss_raw": 21.4061336517334, "correct_loss_per_char": 0.49436776391391096, "incorrect_loss_per_char": 0.47910437828455216, "correct_loss_per_token": 2.2056407928466797, "incorrect_loss_per_token": 2.513634427388509, "correct_loss_uncond": -14.768022537231445, "incorrect_loss_uncond": -9.108774185180664}, "model_output": [{"sum_logits": -28.673330307006836, "num_tokens": 13, "num_tokens_all": 224, "is_greedy": false, "sum_logits_uncond": -43.44135284423828, "logits_per_token": -2.2056407928466797, "logits_per_char": -0.49436776391391096, "num_chars": 58}, {"sum_logits": -22.5924129486084, "num_tokens": 10, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -32.532310485839844, "logits_per_token": -2.25924129486084, "logits_per_char": -0.470675269762675, "num_chars": 48}, {"sum_logits": -20.94459342956543, "num_tokens": 9, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -29.218528747558594, "logits_per_token": -2.327177047729492, "logits_per_char": -0.4363456964492798, "num_chars": 48}, {"sum_logits": -20.681394577026367, "num_tokens": 7, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -29.79388427734375, "logits_per_token": -2.9544849395751953, "logits_per_char": -0.5302921686417017, "num_chars": 39}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1086, "native_id": "Mercury_7159023", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 21.780445098876953, "incorrect_loss_raw": 26.785324414571125, "correct_loss_per_char": 0.4188547134399414, "incorrect_loss_per_char": 0.4862127089411017, "correct_loss_per_token": 2.4200494554307728, "incorrect_loss_per_token": 3.025269622353191, "correct_loss_uncond": -14.155567169189453, "incorrect_loss_uncond": -11.58528995513916}, "model_output": [{"sum_logits": -28.329120635986328, "num_tokens": 8, "num_tokens_all": 230, "is_greedy": false, "sum_logits_uncond": -40.03950500488281, "logits_per_token": -3.541140079498291, "logits_per_char": -0.5665824127197265, "num_chars": 50}, {"sum_logits": -12.181584358215332, "num_tokens": 11, "num_tokens_all": 233, "is_greedy": false, "sum_logits_uncond": -25.643402099609375, "logits_per_token": -1.1074167598377576, "logits_per_char": -0.23885459525912417, "num_chars": 51}, {"sum_logits": -21.780445098876953, "num_tokens": 9, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -35.936012268066406, "logits_per_token": -2.4200494554307728, "logits_per_char": -0.4188547134399414, "num_chars": 52}, {"sum_logits": -39.84526824951172, "num_tokens": 9, "num_tokens_all": 231, "is_greedy": false, "sum_logits_uncond": -49.42893600463867, "logits_per_token": -4.427252027723524, "logits_per_char": -0.6532011188444544, "num_chars": 61}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1087, "native_id": "MDSA_2008_8_3", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 11.550529479980469, "incorrect_loss_raw": 8.874917984008789, "correct_loss_per_char": 0.23101058959960938, "incorrect_loss_per_char": 0.3675638320372869, "correct_loss_per_token": 1.155052947998047, "incorrect_loss_per_token": 1.8101151254442003, "correct_loss_uncond": -17.357986450195312, "incorrect_loss_uncond": -14.37336794535319}, "model_output": [{"sum_logits": -8.246236801147461, "num_tokens": 4, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -25.720417022705078, "logits_per_token": -2.0615592002868652, "logits_per_char": -0.37482894550670276, "num_chars": 22}, {"sum_logits": -9.207517623901367, "num_tokens": 6, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -20.676788330078125, "logits_per_token": -1.5345862706502278, "logits_per_char": -0.4003268532131029, "num_chars": 23}, {"sum_logits": -9.170999526977539, "num_tokens": 5, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -23.347652435302734, "logits_per_token": -1.8341999053955078, "logits_per_char": -0.32753569739205496, "num_chars": 28}, {"sum_logits": -11.550529479980469, "num_tokens": 10, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -28.90851593017578, "logits_per_token": -1.155052947998047, "logits_per_char": -0.23101058959960938, "num_chars": 50}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1088, "native_id": "Mercury_7218348", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 8.083271980285645, "incorrect_loss_raw": 16.928030649820965, "correct_loss_per_char": 0.2886882850102016, "incorrect_loss_per_char": 0.4405217127729906, "correct_loss_per_token": 1.616654396057129, "incorrect_loss_per_token": 2.4629348156944157, "correct_loss_uncond": -14.640364646911621, "incorrect_loss_uncond": -8.630913416544596}, "model_output": [{"sum_logits": -8.083271980285645, "num_tokens": 5, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -22.723636627197266, "logits_per_token": -1.616654396057129, "logits_per_char": -0.2886882850102016, "num_chars": 28}, {"sum_logits": -16.480213165283203, "num_tokens": 7, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -22.907073974609375, "logits_per_token": -2.354316166469029, "logits_per_char": -0.4847121519200942, "num_chars": 34}, {"sum_logits": -16.38779640197754, "num_tokens": 8, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -25.234905242919922, "logits_per_token": -2.0484745502471924, "logits_per_char": -0.4201999077430138, "num_chars": 39}, {"sum_logits": -17.91608238220215, "num_tokens": 6, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -28.534852981567383, "logits_per_token": -2.986013730367025, "logits_per_char": -0.41665307865586393, "num_chars": 43}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1089, "native_id": "Mercury_SC_406458", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.074745178222656, "incorrect_loss_raw": 17.98782952626546, "correct_loss_per_char": 0.3032970788343897, "incorrect_loss_per_char": 0.3972408829390577, "correct_loss_per_token": 1.786082797580295, "incorrect_loss_per_token": 2.2519303661174876, "correct_loss_uncond": -16.623291015625, "incorrect_loss_uncond": -11.905625979105631}, "model_output": [{"sum_logits": -18.737712860107422, "num_tokens": 7, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -31.209712982177734, "logits_per_token": -2.6768161228724887, "logits_per_char": -0.45701738683188836, "num_chars": 41}, {"sum_logits": -11.879992485046387, "num_tokens": 8, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -25.634449005126953, "logits_per_token": -1.4849990606307983, "logits_per_char": -0.2582607061966606, "num_chars": 46}, {"sum_logits": -23.345783233642578, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -32.836204528808594, "logits_per_token": -2.5939759148491754, "logits_per_char": -0.47644455578862405, "num_chars": 49}, {"sum_logits": -16.074745178222656, "num_tokens": 9, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -32.698036193847656, "logits_per_token": -1.786082797580295, "logits_per_char": -0.3032970788343897, "num_chars": 53}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1090, "native_id": "LEAP_2007_4_10280", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.311914443969727, "incorrect_loss_raw": 17.186670621236164, "correct_loss_per_char": 0.7092136714769446, "incorrect_loss_per_char": 0.7435597563732284, "correct_loss_per_token": 3.2623828887939452, "incorrect_loss_per_token": 3.437334124247233, "correct_loss_uncond": -11.132965087890625, "incorrect_loss_uncond": -14.263727506001791}, "model_output": [{"sum_logits": -16.311914443969727, "num_tokens": 5, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -27.44487953186035, "logits_per_token": -3.2623828887939452, "logits_per_char": -0.7092136714769446, "num_chars": 23}, {"sum_logits": -21.7868595123291, "num_tokens": 5, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -38.866573333740234, "logits_per_token": -4.35737190246582, "logits_per_char": -0.9903117960149591, "num_chars": 22}, {"sum_logits": -14.214396476745605, "num_tokens": 5, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -27.38873863220215, "logits_per_token": -2.842879295349121, "logits_per_char": -0.6180172381193741, "num_chars": 23}, {"sum_logits": -15.558755874633789, "num_tokens": 5, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -28.095882415771484, "logits_per_token": -3.111751174926758, "logits_per_char": -0.6223502349853516, "num_chars": 25}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1091, "native_id": "Mercury_7216965", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.3519392013549805, "incorrect_loss_raw": 10.428030808766684, "correct_loss_per_char": 0.29732995563083225, "incorrect_loss_per_char": 1.0179704712697553, "correct_loss_per_token": 2.6759696006774902, "incorrect_loss_per_token": 7.1337441603342695, "correct_loss_uncond": -10.812891960144043, "incorrect_loss_uncond": -4.883231321970622}, "model_output": [{"sum_logits": -6.680568218231201, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -17.408260345458984, "logits_per_token": -3.3402841091156006, "logits_per_char": -0.3711426787906223, "num_chars": 18}, {"sum_logits": -13.085151672363281, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -14.124065399169922, "logits_per_token": -6.542575836181641, "logits_per_char": -1.6356439590454102, "num_chars": 8}, {"sum_logits": -5.3519392013549805, "num_tokens": 2, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -16.164831161499023, "logits_per_token": -2.6759696006774902, "logits_per_char": -0.29732995563083225, "num_chars": 18}, {"sum_logits": -11.518372535705566, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -14.401460647583008, "logits_per_token": -11.518372535705566, "logits_per_char": -1.0471247759732334, "num_chars": 11}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1092, "native_id": "NYSEDREGENTS_2010_8_42", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.20914649963379, "incorrect_loss_raw": 20.025389353434246, "correct_loss_per_char": 0.5438242692213792, "incorrect_loss_per_char": 0.5134715218829294, "correct_loss_per_token": 1.6314728076641376, "incorrect_loss_per_token": 1.5404145656487882, "correct_loss_uncond": -19.932252883911133, "incorrect_loss_uncond": -20.115514119466145}, "model_output": [{"sum_logits": -19.792531967163086, "num_tokens": 13, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -39.210384368896484, "logits_per_token": -1.522502459012545, "logits_per_char": -0.5075008196708484, "num_chars": 39}, {"sum_logits": -21.20914649963379, "num_tokens": 13, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -41.14139938354492, "logits_per_token": -1.6314728076641376, "logits_per_char": -0.5438242692213792, "num_chars": 39}, {"sum_logits": -20.402507781982422, "num_tokens": 13, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -40.76826477050781, "logits_per_token": -1.5694236755371094, "logits_per_char": -0.5231412251790365, "num_chars": 39}, {"sum_logits": -19.881128311157227, "num_tokens": 13, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -40.444061279296875, "logits_per_token": -1.5293175623967097, "logits_per_char": -0.5097725207989032, "num_chars": 39}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1093, "native_id": "LEAP__7_10351", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 17.598371505737305, "incorrect_loss_raw": 15.783425013224283, "correct_loss_per_char": 0.17598371505737304, "incorrect_loss_per_char": 0.21208940436243418, "correct_loss_per_token": 1.0351983238669002, "incorrect_loss_per_token": 1.2308888170454237, "correct_loss_uncond": -23.304624557495117, "incorrect_loss_uncond": -19.48050371805827}, "model_output": [{"sum_logits": -17.809045791625977, "num_tokens": 12, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -35.718345642089844, "logits_per_token": -1.4840871493021648, "logits_per_char": -0.26983402714584814, "num_chars": 66}, {"sum_logits": -17.388118743896484, "num_tokens": 12, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -36.7652702331543, "logits_per_token": -1.449009895324707, "logits_per_char": -0.24490308089995047, "num_chars": 71}, {"sum_logits": -17.598371505737305, "num_tokens": 17, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -40.90299606323242, "logits_per_token": -1.0351983238669002, "logits_per_char": -0.17598371505737304, "num_chars": 100}, {"sum_logits": -12.15311050415039, "num_tokens": 16, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -33.308170318603516, "logits_per_token": -0.7595694065093994, "logits_per_char": -0.12153110504150391, "num_chars": 100}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1094, "native_id": "Mercury_SC_400590", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.5825538635253906, "incorrect_loss_raw": 4.617693344751994, "correct_loss_per_char": 0.22390961647033691, "incorrect_loss_per_char": 0.3212244786913433, "correct_loss_per_token": 1.1941846211751301, "incorrect_loss_per_token": 1.5392311149173314, "correct_loss_uncond": -14.092823028564453, "incorrect_loss_uncond": -13.167563994725546}, "model_output": [{"sum_logits": -3.688122034072876, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -17.403579711914062, "logits_per_token": -1.2293740113576253, "logits_per_char": -0.26343728814806255, "num_chars": 14}, {"sum_logits": -4.740179061889648, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -18.03070831298828, "logits_per_token": -1.5800596872965496, "logits_per_char": -0.3385842187064035, "num_chars": 14}, {"sum_logits": -5.424778938293457, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -17.921483993530273, "logits_per_token": -1.808259646097819, "logits_per_char": -0.3616519292195638, "num_chars": 15}, {"sum_logits": -3.5825538635253906, "num_tokens": 3, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -17.675376892089844, "logits_per_token": -1.1941846211751301, "logits_per_char": -0.22390961647033691, "num_chars": 16}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1095, "native_id": "Mercury_7086608", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.967397212982178, "incorrect_loss_raw": 4.238917986551921, "correct_loss_per_char": 0.3483698606491089, "incorrect_loss_per_char": 0.33328363080310003, "correct_loss_per_token": 2.322465737660726, "incorrect_loss_per_token": 1.9248742792341444, "correct_loss_uncond": -14.020971775054932, "incorrect_loss_uncond": -12.831322034200033}, "model_output": [{"sum_logits": -3.5025248527526855, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -19.199729919433594, "logits_per_token": -1.1675082842508953, "logits_per_char": -0.17512624263763427, "num_chars": 20}, {"sum_logits": -6.967397212982178, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -20.98836898803711, "logits_per_token": -2.322465737660726, "logits_per_char": -0.3483698606491089, "num_chars": 20}, {"sum_logits": -5.8230485916137695, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -16.55515480041504, "logits_per_token": -2.9115242958068848, "logits_per_char": -0.44792681473952073, "num_chars": 13}, {"sum_logits": -3.3911805152893066, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -15.455835342407227, "logits_per_token": -1.6955902576446533, "logits_per_char": -0.3767978350321452, "num_chars": 9}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1096, "native_id": "Mercury_7187863", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 15.221846580505371, "incorrect_loss_raw": 14.38978131612142, "correct_loss_per_char": 0.47568270564079285, "incorrect_loss_per_char": 1.105109093154686, "correct_loss_per_token": 3.0443693161010743, "incorrect_loss_per_token": 5.718162218729655, "correct_loss_uncond": -9.81779956817627, "incorrect_loss_uncond": -7.219653447469075}, "model_output": [{"sum_logits": -11.577988624572754, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -19.9693546295166, "logits_per_token": -5.788994312286377, "logits_per_char": -1.1577988624572755, "num_chars": 10}, {"sum_logits": -13.870614051818848, "num_tokens": 2, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -20.065155029296875, "logits_per_token": -6.935307025909424, "logits_per_char": -1.3870614051818848, "num_chars": 10}, {"sum_logits": -17.720741271972656, "num_tokens": 4, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -24.793794631958008, "logits_per_token": -4.430185317993164, "logits_per_char": -0.7704670118248981, "num_chars": 23}, {"sum_logits": -15.221846580505371, "num_tokens": 5, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -25.03964614868164, "logits_per_token": -3.0443693161010743, "logits_per_char": -0.47568270564079285, "num_chars": 32}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1097, "native_id": "Mercury_7120873", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 15.916743278503418, "incorrect_loss_raw": 15.926616350809732, "correct_loss_per_char": 0.41886166522377416, "incorrect_loss_per_char": 0.48361944256830275, "correct_loss_per_token": 2.273820468357631, "incorrect_loss_per_token": 2.3960403010958715, "correct_loss_uncond": -9.92485523223877, "incorrect_loss_uncond": -13.694563865661621}, "model_output": [{"sum_logits": -12.794139862060547, "num_tokens": 5, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -24.902259826660156, "logits_per_token": -2.5588279724121095, "logits_per_char": -0.5117655944824219, "num_chars": 25}, {"sum_logits": -14.340439796447754, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -25.194252014160156, "logits_per_token": -2.0486342566353932, "logits_per_char": -0.4097268513270787, "num_chars": 35}, {"sum_logits": -15.916743278503418, "num_tokens": 7, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -25.841598510742188, "logits_per_token": -2.273820468357631, "logits_per_char": -0.41886166522377416, "num_chars": 38}, {"sum_logits": -20.6452693939209, "num_tokens": 8, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -38.76702880859375, "logits_per_token": -2.5806586742401123, "logits_per_char": -0.5293658818954077, "num_chars": 39}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1098, "native_id": "Mercury_184730", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 26.718860626220703, "incorrect_loss_raw": 25.049907684326172, "correct_loss_per_char": 0.42410889882890007, "incorrect_loss_per_char": 0.40883262984516033, "correct_loss_per_token": 1.9084900447300501, "incorrect_loss_per_token": 1.9392805996104183, "correct_loss_uncond": -15.778114318847656, "incorrect_loss_uncond": -13.556060791015625}, "model_output": [{"sum_logits": -26.25312042236328, "num_tokens": 12, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -41.02516174316406, "logits_per_token": -2.18776003519694, "logits_per_char": -0.4234374261671497, "num_chars": 62}, {"sum_logits": -26.718860626220703, "num_tokens": 14, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -42.49697494506836, "logits_per_token": -1.9084900447300501, "logits_per_char": -0.42410889882890007, "num_chars": 63}, {"sum_logits": -25.019046783447266, "num_tokens": 13, "num_tokens_all": 202, "is_greedy": false, "sum_logits_uncond": -36.75959777832031, "logits_per_token": -1.9245420602651744, "logits_per_char": -0.4240516403974113, "num_chars": 59}, {"sum_logits": -23.87755584716797, "num_tokens": 14, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -38.033145904541016, "logits_per_token": -1.7055397033691406, "logits_per_char": -0.37900882297092015, "num_chars": 63}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1099, "native_id": "Mercury_SC_401265", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 20.381065368652344, "incorrect_loss_raw": 25.32252375284831, "correct_loss_per_char": 0.4159401095643335, "incorrect_loss_per_char": 0.5713794104060771, "correct_loss_per_token": 2.547633171081543, "incorrect_loss_per_token": 3.3383486384437195, "correct_loss_uncond": -14.224754333496094, "incorrect_loss_uncond": -12.814974466959635}, "model_output": [{"sum_logits": -25.208349227905273, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -37.313270568847656, "logits_per_token": -3.151043653488159, "logits_per_char": -0.6001987911406017, "num_chars": 42}, {"sum_logits": -21.68964958190918, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -37.179283142089844, "logits_per_token": -2.7112061977386475, "logits_per_char": -0.4819922129313151, "num_chars": 45}, {"sum_logits": -29.06957244873047, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -39.91994094848633, "logits_per_token": -4.152796064104352, "logits_per_char": -0.6319472271463146, "num_chars": 46}, {"sum_logits": -20.381065368652344, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -34.60581970214844, "logits_per_token": -2.547633171081543, "logits_per_char": -0.4159401095643335, "num_chars": 49}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1100, "native_id": "OHAT_2009_8_34", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 15.125923156738281, "incorrect_loss_raw": 17.353506088256836, "correct_loss_per_char": 0.37814807891845703, "incorrect_loss_per_char": 0.4658240195808745, "correct_loss_per_token": 1.8907403945922852, "incorrect_loss_per_token": 2.380558649698893, "correct_loss_uncond": -14.646785736083984, "incorrect_loss_uncond": -13.007763544718424}, "model_output": [{"sum_logits": -15.218667984008789, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -27.261367797851562, "logits_per_token": -2.536444664001465, "logits_per_char": -0.47558337450027466, "num_chars": 32}, {"sum_logits": -18.633922576904297, "num_tokens": 8, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -30.121997833251953, "logits_per_token": -2.329240322113037, "logits_per_char": -0.4777928865872897, "num_chars": 39}, {"sum_logits": -18.207927703857422, "num_tokens": 8, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -33.700443267822266, "logits_per_token": -2.2759909629821777, "logits_per_char": -0.4440957976550591, "num_chars": 41}, {"sum_logits": -15.125923156738281, "num_tokens": 8, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -29.772708892822266, "logits_per_token": -1.8907403945922852, "logits_per_char": -0.37814807891845703, "num_chars": 40}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1101, "native_id": "Mercury_406639", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.992045402526855, "incorrect_loss_raw": 16.482271512349445, "correct_loss_per_char": 0.31558014217175934, "incorrect_loss_per_char": 0.40812990626911444, "correct_loss_per_token": 1.7131493432181222, "incorrect_loss_per_token": 2.1456171671549478, "correct_loss_uncond": -16.44747829437256, "incorrect_loss_uncond": -13.806792259216309}, "model_output": [{"sum_logits": -14.335982322692871, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -27.00363540649414, "logits_per_token": -2.04799747467041, "logits_per_char": -0.377262692702444, "num_chars": 38}, {"sum_logits": -11.992045402526855, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -28.439523696899414, "logits_per_token": -1.7131493432181222, "logits_per_char": -0.31558014217175934, "num_chars": 38}, {"sum_logits": -19.2086181640625, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -33.342315673828125, "logits_per_token": -2.4010772705078125, "logits_per_char": -0.4685028820503049, "num_chars": 41}, {"sum_logits": -15.902214050292969, "num_tokens": 8, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -30.521240234375, "logits_per_token": -1.987776756286621, "logits_per_char": -0.3786241440545945, "num_chars": 42}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1102, "native_id": "Mercury_7008610", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 21.43486976623535, "incorrect_loss_raw": 20.90619468688965, "correct_loss_per_char": 0.7938840654161241, "incorrect_loss_per_char": 0.747996058774634, "correct_loss_per_token": 3.5724782943725586, "incorrect_loss_per_token": 3.939697413974338, "correct_loss_uncond": -6.411199569702148, "incorrect_loss_uncond": -13.020143508911133}, "model_output": [{"sum_logits": -14.79214859008789, "num_tokens": 5, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -26.502323150634766, "logits_per_token": -2.958429718017578, "logits_per_char": -0.6163395245869955, "num_chars": 24}, {"sum_logits": -26.187698364257812, "num_tokens": 5, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -35.69255065917969, "logits_per_token": -5.237539672851563, "logits_per_char": -0.9030240815261315, "num_chars": 29}, {"sum_logits": -21.738737106323242, "num_tokens": 6, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -39.58414077758789, "logits_per_token": -3.6231228510538735, "logits_per_char": -0.7246245702107748, "num_chars": 30}, {"sum_logits": -21.43486976623535, "num_tokens": 6, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -27.8460693359375, "logits_per_token": -3.5724782943725586, "logits_per_char": -0.7938840654161241, "num_chars": 27}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1103, "native_id": "MCAS_2009_8_12", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.925769805908203, "incorrect_loss_raw": 14.978526751200357, "correct_loss_per_char": 0.9958761003282335, "incorrect_loss_per_char": 0.5546405654573533, "correct_loss_per_token": 4.481442451477051, "incorrect_loss_per_token": 2.6335087094988143, "correct_loss_uncond": -4.684072494506836, "incorrect_loss_uncond": -14.171126365661621}, "model_output": [{"sum_logits": -17.925769805908203, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -22.60984230041504, "logits_per_token": -4.481442451477051, "logits_per_char": -0.9958761003282335, "num_chars": 18}, {"sum_logits": -8.294095039367676, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -25.16518783569336, "logits_per_token": -1.6588190078735352, "logits_per_char": -0.377004319971258, "num_chars": 22}, {"sum_logits": -19.015323638916016, "num_tokens": 7, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -35.963539123535156, "logits_per_token": -2.716474805559431, "logits_per_char": -0.6791187013898577, "num_chars": 28}, {"sum_logits": -17.626161575317383, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -26.320232391357422, "logits_per_token": -3.5252323150634766, "logits_per_char": -0.6077986750109442, "num_chars": 29}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1104, "native_id": "MCAS_2005_8_12", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.701557159423828, "incorrect_loss_raw": 22.290685653686523, "correct_loss_per_char": 0.5857460644780373, "incorrect_loss_per_char": 0.4698646316778345, "correct_loss_per_token": 3.1890619066026478, "incorrect_loss_per_token": 2.7127047770868535, "correct_loss_uncond": -14.591899871826172, "incorrect_loss_uncond": -12.568903605143229}, "model_output": [{"sum_logits": -16.231916427612305, "num_tokens": 7, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -28.658241271972656, "logits_per_token": -2.318845203944615, "logits_per_char": -0.4271556954634817, "num_chars": 38}, {"sum_logits": -13.866252899169922, "num_tokens": 8, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -29.45857048034668, "logits_per_token": -1.7332816123962402, "logits_per_char": -0.3014402804167374, "num_chars": 46}, {"sum_logits": -28.701557159423828, "num_tokens": 9, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -43.29345703125, "logits_per_token": -3.1890619066026478, "logits_per_char": -0.5857460644780373, "num_chars": 49}, {"sum_logits": -36.773887634277344, "num_tokens": 9, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -46.46195602416992, "logits_per_token": -4.0859875149197045, "logits_per_char": -0.6809979191532841, "num_chars": 54}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1105, "native_id": "ACTAAP_2008_7_4", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.349621295928955, "incorrect_loss_raw": 6.219817161560059, "correct_loss_per_char": 0.4187026619911194, "incorrect_loss_per_char": 0.7489883656854982, "correct_loss_per_token": 1.6748106479644775, "incorrect_loss_per_token": 3.1099085807800293, "correct_loss_uncond": -10.895061016082764, "incorrect_loss_uncond": -9.003579139709473}, "model_output": [{"sum_logits": -6.448133945465088, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -14.735342025756836, "logits_per_token": -3.224066972732544, "logits_per_char": -0.806016743183136, "num_chars": 8}, {"sum_logits": -3.349621295928955, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -14.244682312011719, "logits_per_token": -1.6748106479644775, "logits_per_char": -0.4187026619911194, "num_chars": 8}, {"sum_logits": -6.057741165161133, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -14.920890808105469, "logits_per_token": -3.0288705825805664, "logits_per_char": -0.7572176456451416, "num_chars": 8}, {"sum_logits": -6.153576374053955, "num_tokens": 2, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -16.01395606994629, "logits_per_token": -3.0767881870269775, "logits_per_char": -0.6837307082282172, "num_chars": 9}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1106, "native_id": "NYSEDREGENTS_2008_4_3", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 2.3748250007629395, "incorrect_loss_raw": 5.620771487553914, "correct_loss_per_char": 0.21589318188753995, "incorrect_loss_per_char": 0.973820734024048, "correct_loss_per_token": 2.3748250007629395, "incorrect_loss_per_token": 5.620771487553914, "correct_loss_uncond": -10.679587841033936, "incorrect_loss_uncond": -6.321356376012166}, "model_output": [{"sum_logits": -5.429473876953125, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -11.89634895324707, "logits_per_token": -5.429473876953125, "logits_per_char": -0.9049123128255209, "num_chars": 6}, {"sum_logits": -3.332293748855591, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -11.723394393920898, "logits_per_token": -3.332293748855591, "logits_per_char": -0.6664587497711182, "num_chars": 5}, {"sum_logits": -8.100546836853027, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -12.206640243530273, "logits_per_token": -8.100546836853027, "logits_per_char": -1.3500911394755046, "num_chars": 6}, {"sum_logits": -2.3748250007629395, "num_tokens": 1, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -13.054412841796875, "logits_per_token": -2.3748250007629395, "logits_per_char": -0.21589318188753995, "num_chars": 11}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1107, "native_id": "Mercury_SC_416181", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.8810603618621826, "incorrect_loss_raw": 7.494637648264567, "correct_loss_per_char": 0.25873735745747883, "incorrect_loss_per_char": 0.5722724142902332, "correct_loss_per_token": 1.9405301809310913, "incorrect_loss_per_token": 3.7473188241322837, "correct_loss_uncond": -11.819438219070435, "incorrect_loss_uncond": -9.359314123789469}, "model_output": [{"sum_logits": -6.805877685546875, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -17.496082305908203, "logits_per_token": -3.4029388427734375, "logits_per_char": -0.4003457462086397, "num_chars": 17}, {"sum_logits": -3.8810603618621826, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -15.700498580932617, "logits_per_token": -1.9405301809310913, "logits_per_char": -0.25873735745747883, "num_chars": 15}, {"sum_logits": -7.898518085479736, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -17.906299591064453, "logits_per_token": -3.949259042739868, "logits_per_char": -0.7180470986799761, "num_chars": 11}, {"sum_logits": -7.77951717376709, "num_tokens": 2, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -15.159473419189453, "logits_per_token": -3.889758586883545, "logits_per_char": -0.5984243979820838, "num_chars": 13}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1108, "native_id": "NYSEDREGENTS_2010_4_30", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.058387756347656, "incorrect_loss_raw": 16.936147054036457, "correct_loss_per_char": 0.26971677454506476, "incorrect_loss_per_char": 0.41789311003952473, "correct_loss_per_token": 1.382298469543457, "incorrect_loss_per_token": 2.1710950710155346, "correct_loss_uncond": -23.940757751464844, "incorrect_loss_uncond": -20.989981333414715}, "model_output": [{"sum_logits": -17.45629119873047, "num_tokens": 8, "num_tokens_all": 240, "is_greedy": false, "sum_logits_uncond": -37.8137321472168, "logits_per_token": -2.1820363998413086, "logits_per_char": -0.49875117710658484, "num_chars": 35}, {"sum_logits": -11.058387756347656, "num_tokens": 8, "num_tokens_all": 240, "is_greedy": false, "sum_logits_uncond": -34.9991455078125, "logits_per_token": -1.382298469543457, "logits_per_char": -0.26971677454506476, "num_chars": 41}, {"sum_logits": -19.701812744140625, "num_tokens": 7, "num_tokens_all": 239, "is_greedy": false, "sum_logits_uncond": -39.953521728515625, "logits_per_token": -2.814544677734375, "logits_per_char": -0.4581816917242006, "num_chars": 43}, {"sum_logits": -13.650337219238281, "num_tokens": 9, "num_tokens_all": 241, "is_greedy": false, "sum_logits_uncond": -36.011131286621094, "logits_per_token": -1.51670413547092, "logits_per_char": -0.2967464612877887, "num_chars": 46}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1109, "native_id": "Mercury_7025060", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 9.910179138183594, "incorrect_loss_raw": 9.71630859375, "correct_loss_per_char": 0.7623214721679688, "incorrect_loss_per_char": 0.7474083533653846, "correct_loss_per_token": 3.3033930460611978, "incorrect_loss_per_token": 3.23876953125, "correct_loss_uncond": -9.238348007202148, "incorrect_loss_uncond": -10.592891693115234}, "model_output": [{"sum_logits": -10.138790130615234, "num_tokens": 3, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -21.29128074645996, "logits_per_token": -3.379596710205078, "logits_per_char": -0.7799069331242487, "num_chars": 13}, {"sum_logits": -7.662879943847656, "num_tokens": 3, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -18.349395751953125, "logits_per_token": -2.5542933146158853, "logits_per_char": -0.5894523033728967, "num_chars": 13}, {"sum_logits": -11.34725570678711, "num_tokens": 3, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -21.286924362182617, "logits_per_token": -3.7824185689290366, "logits_per_char": -0.8728658235990084, "num_chars": 13}, {"sum_logits": -9.910179138183594, "num_tokens": 3, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -19.148527145385742, "logits_per_token": -3.3033930460611978, "logits_per_char": -0.7623214721679688, "num_chars": 13}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1110, "native_id": "Mercury_SC_402103", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 26.778615951538086, "incorrect_loss_raw": 22.53765042622884, "correct_loss_per_char": 0.6375860940842402, "incorrect_loss_per_char": 0.6143343130048157, "correct_loss_per_token": 2.9754017723931208, "incorrect_loss_per_token": 3.215192499614897, "correct_loss_uncond": -12.091058731079102, "incorrect_loss_uncond": -5.603121439615886}, "model_output": [{"sum_logits": -27.980579376220703, "num_tokens": 8, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -37.24513244628906, "logits_per_token": -3.497572422027588, "logits_per_char": -0.6507111482842024, "num_chars": 43}, {"sum_logits": -19.210390090942383, "num_tokens": 7, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -23.23553466796875, "logits_per_token": -2.7443414415631975, "logits_per_char": -0.6403463363647461, "num_chars": 30}, {"sum_logits": -20.421981811523438, "num_tokens": 6, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -23.941648483276367, "logits_per_token": -3.4036636352539062, "logits_per_char": -0.5519454543654984, "num_chars": 37}, {"sum_logits": -26.778615951538086, "num_tokens": 9, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -38.86967468261719, "logits_per_token": -2.9754017723931208, "logits_per_char": -0.6375860940842402, "num_chars": 42}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1111, "native_id": "VASoL_2009_5_37", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 21.117755889892578, "incorrect_loss_raw": 22.684528350830078, "correct_loss_per_char": 0.502803711664109, "incorrect_loss_per_char": 0.4504891328735761, "correct_loss_per_token": 2.6397194862365723, "incorrect_loss_per_token": 2.4314210074288503, "correct_loss_uncond": -16.395092010498047, "incorrect_loss_uncond": -14.03271738688151}, "model_output": [{"sum_logits": -25.625154495239258, "num_tokens": 11, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -41.33087921142578, "logits_per_token": -2.329559499567205, "logits_per_char": -0.42008449992195507, "num_chars": 61}, {"sum_logits": -21.107641220092773, "num_tokens": 11, "num_tokens_all": 220, "is_greedy": false, "sum_logits_uncond": -36.39738082885742, "logits_per_token": -1.9188764745538884, "logits_per_char": -0.3703094950893469, "num_chars": 57}, {"sum_logits": -21.117755889892578, "num_tokens": 8, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -37.512847900390625, "logits_per_token": -2.6397194862365723, "logits_per_char": -0.502803711664109, "num_chars": 42}, {"sum_logits": -21.320789337158203, "num_tokens": 7, "num_tokens_all": 216, "is_greedy": false, "sum_logits_uncond": -32.42347717285156, "logits_per_token": -3.0458270481654575, "logits_per_char": -0.5610734036094264, "num_chars": 38}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1112, "native_id": "Mercury_SC_402981", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 14.893791198730469, "incorrect_loss_raw": 16.14980634053548, "correct_loss_per_char": 0.5135790068527748, "incorrect_loss_per_char": 0.6949004951044819, "correct_loss_per_token": 2.9787582397460937, "incorrect_loss_per_token": 4.092950280507406, "correct_loss_uncond": -8.628305435180664, "incorrect_loss_uncond": -10.760558446248373}, "model_output": [{"sum_logits": -18.534149169921875, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -29.421987533569336, "logits_per_token": -3.706829833984375, "logits_per_char": -0.8424613259055398, "num_chars": 22}, {"sum_logits": -13.11844253540039, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -26.365375518798828, "logits_per_token": -4.372814178466797, "logits_per_char": -0.5703670667565387, "num_chars": 23}, {"sum_logits": -16.79682731628418, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -24.9437313079834, "logits_per_token": -4.199206829071045, "logits_per_char": -0.6718730926513672, "num_chars": 25}, {"sum_logits": -14.893791198730469, "num_tokens": 5, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -23.522096633911133, "logits_per_token": -2.9787582397460937, "logits_per_char": -0.5135790068527748, "num_chars": 29}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1113, "native_id": "NYSEDREGENTS_2008_8_5", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.871030807495117, "incorrect_loss_raw": 6.223374525705974, "correct_loss_per_char": 1.6236769358317058, "incorrect_loss_per_char": 1.7738550768958197, "correct_loss_per_token": 4.871030807495117, "incorrect_loss_per_token": 6.223374525705974, "correct_loss_uncond": -4.406402587890625, "incorrect_loss_uncond": -4.113168875376384}, "model_output": [{"sum_logits": -3.7525744438171387, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -8.377655029296875, "logits_per_token": -3.7525744438171387, "logits_per_char": -1.2508581479390461, "num_chars": 3}, {"sum_logits": -4.871030807495117, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -9.277433395385742, "logits_per_token": -4.871030807495117, "logits_per_char": -1.6236769358317058, "num_chars": 3}, {"sum_logits": -4.095837593078613, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -10.523298263549805, "logits_per_token": -4.095837593078613, "logits_per_char": -1.365279197692871, "num_chars": 3}, {"sum_logits": -10.821711540222168, "num_tokens": 1, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -12.10867691040039, "logits_per_token": -10.821711540222168, "logits_per_char": -2.705427885055542, "num_chars": 4}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1114, "native_id": "MCAS_1998_4_13", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.877291202545166, "incorrect_loss_raw": 7.141428311665853, "correct_loss_per_char": 0.3251527468363444, "incorrect_loss_per_char": 0.44043826527065705, "correct_loss_per_token": 2.438645601272583, "incorrect_loss_per_token": 3.5707141558329263, "correct_loss_uncond": -13.662518978118896, "incorrect_loss_uncond": -8.433998425801596}, "model_output": [{"sum_logits": -7.826972961425781, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -15.093343734741211, "logits_per_token": -3.9134864807128906, "logits_per_char": -0.5217981974283854, "num_chars": 15}, {"sum_logits": -4.877291202545166, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -18.539810180664062, "logits_per_token": -2.438645601272583, "logits_per_char": -0.3251527468363444, "num_chars": 15}, {"sum_logits": -6.351894378662109, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -17.04043960571289, "logits_per_token": -3.1759471893310547, "logits_per_char": -0.39699339866638184, "num_chars": 16}, {"sum_logits": -7.245417594909668, "num_tokens": 2, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -14.592496871948242, "logits_per_token": -3.622708797454834, "logits_per_char": -0.4025231997172038, "num_chars": 18}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1115, "native_id": "MDSA_2008_8_20", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 15.680586814880371, "incorrect_loss_raw": 20.072971026102703, "correct_loss_per_char": 0.6272234725952148, "incorrect_loss_per_char": 0.8254363662849725, "correct_loss_per_token": 3.136117362976074, "incorrect_loss_per_token": 3.869089689708891, "correct_loss_uncond": -13.818970680236816, "incorrect_loss_uncond": -12.979872067769369}, "model_output": [{"sum_logits": -19.619983673095703, "num_tokens": 4, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -31.430879592895508, "logits_per_token": -4.904995918273926, "logits_per_char": -0.9342849368140811, "num_chars": 21}, {"sum_logits": -15.79245662689209, "num_tokens": 5, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -30.081623077392578, "logits_per_token": -3.158491325378418, "logits_per_char": -0.6866285489953082, "num_chars": 23}, {"sum_logits": -15.680586814880371, "num_tokens": 5, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -29.499557495117188, "logits_per_token": -3.136117362976074, "logits_per_char": -0.6272234725952148, "num_chars": 25}, {"sum_logits": -24.806472778320312, "num_tokens": 7, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -37.646026611328125, "logits_per_token": -3.5437818254743303, "logits_per_char": -0.855395613045528, "num_chars": 29}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1116, "native_id": "Mercury_SC_400134", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 27.962322235107422, "incorrect_loss_raw": 8.213852087656656, "correct_loss_per_char": 0.7358505851344058, "incorrect_loss_per_char": 0.4712784423071587, "correct_loss_per_token": 3.1069246927897134, "incorrect_loss_per_token": 2.1907747824986776, "correct_loss_uncond": -6.100505828857422, "incorrect_loss_uncond": -9.133140087127686}, "model_output": [{"sum_logits": -11.71827220916748, "num_tokens": 3, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -17.456817626953125, "logits_per_token": -3.90609073638916, "logits_per_char": -0.8370194435119629, "num_chars": 14}, {"sum_logits": -1.6315360069274902, "num_tokens": 4, "num_tokens_all": 190, "is_greedy": true, "sum_logits_uncond": -15.95824909210205, "logits_per_token": -0.40788400173187256, "logits_per_char": -0.08587031615407843, "num_chars": 19}, {"sum_logits": -11.291748046875, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -18.62590980529785, "logits_per_token": -2.258349609375, "logits_per_char": -0.49094556725543476, "num_chars": 23}, {"sum_logits": -27.962322235107422, "num_tokens": 9, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -34.062828063964844, "logits_per_token": -3.1069246927897134, "logits_per_char": -0.7358505851344058, "num_chars": 38}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1117, "native_id": "Mercury_SC_LBS10265", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.3412604331970215, "incorrect_loss_raw": 3.4977382024129233, "correct_loss_per_char": 0.3946600393815474, "incorrect_loss_per_char": 0.33597797480496494, "correct_loss_per_token": 2.1706302165985107, "incorrect_loss_per_token": 1.7488691012064617, "correct_loss_uncond": -9.517451763153076, "incorrect_loss_uncond": -9.987624486287435}, "model_output": [{"sum_logits": -3.366612434387207, "num_tokens": 2, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -13.573923110961914, "logits_per_token": -1.6833062171936035, "logits_per_char": -0.3366612434387207, "num_chars": 10}, {"sum_logits": -2.5739731788635254, "num_tokens": 2, "num_tokens_all": 225, "is_greedy": true, "sum_logits_uncond": -14.103738784790039, "logits_per_token": -1.2869865894317627, "logits_per_char": -0.25739731788635256, "num_chars": 10}, {"sum_logits": -4.3412604331970215, "num_tokens": 2, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -13.858712196350098, "logits_per_token": -2.1706302165985107, "logits_per_char": -0.3946600393815474, "num_chars": 11}, {"sum_logits": -4.552628993988037, "num_tokens": 2, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -12.778426170349121, "logits_per_token": -2.2763144969940186, "logits_per_char": -0.41387536308982154, "num_chars": 11}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1118, "native_id": "Mercury_7188580", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 28.02962303161621, "incorrect_loss_raw": 34.42314020792643, "correct_loss_per_char": 0.6228805118136935, "incorrect_loss_per_char": 0.6163854296193342, "correct_loss_per_token": 3.5037028789520264, "incorrect_loss_per_token": 3.1923166891541137, "correct_loss_uncond": -9.691305160522461, "incorrect_loss_uncond": -10.9766476949056}, "model_output": [{"sum_logits": -28.02962303161621, "num_tokens": 8, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -37.72092819213867, "logits_per_token": -3.5037028789520264, "logits_per_char": -0.6228805118136935, "num_chars": 45}, {"sum_logits": -24.564167022705078, "num_tokens": 12, "num_tokens_all": 208, "is_greedy": false, "sum_logits_uncond": -38.4522819519043, "logits_per_token": -2.0470139185587564, "logits_per_char": -0.44662121859463777, "num_chars": 55}, {"sum_logits": -41.240440368652344, "num_tokens": 10, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -45.993125915527344, "logits_per_token": -4.124044036865234, "logits_per_char": -0.7781215163896669, "num_chars": 53}, {"sum_logits": -37.464813232421875, "num_tokens": 11, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -51.75395584106445, "logits_per_token": -3.405892112038352, "logits_per_char": -0.6244135538736979, "num_chars": 60}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1119, "native_id": "Mercury_402348", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.872028350830078, "incorrect_loss_raw": 8.507163365681967, "correct_loss_per_char": 2.2180070877075195, "incorrect_loss_per_char": 1.8579125139448378, "correct_loss_per_token": 2.957342783610026, "incorrect_loss_per_token": 2.835721121893989, "correct_loss_uncond": -3.182215690612793, "incorrect_loss_uncond": -4.671471277872722}, "model_output": [{"sum_logits": -8.872028350830078, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -12.054244041442871, "logits_per_token": -2.957342783610026, "logits_per_char": -2.2180070877075195, "num_chars": 4}, {"sum_logits": -7.708705902099609, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -13.09591293334961, "logits_per_token": -2.569568634033203, "logits_per_char": -1.9271764755249023, "num_chars": 4}, {"sum_logits": -8.133164405822754, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -12.875614166259766, "logits_per_token": -2.711054801940918, "logits_per_char": -2.0332911014556885, "num_chars": 4}, {"sum_logits": -9.679619789123535, "num_tokens": 3, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -13.564376831054688, "logits_per_token": -3.226539929707845, "logits_per_char": -1.6132699648539226, "num_chars": 6}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1120, "native_id": "Mercury_7030555", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.532256603240967, "incorrect_loss_raw": 11.584138870239258, "correct_loss_per_char": 0.24193542974966545, "incorrect_loss_per_char": 0.4275345217797064, "correct_loss_per_token": 1.3064513206481934, "incorrect_loss_per_token": 1.9958459748162163, "correct_loss_uncond": -20.429068088531494, "incorrect_loss_uncond": -19.02743975321452}, "model_output": [{"sum_logits": -15.003836631774902, "num_tokens": 6, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -33.50176239013672, "logits_per_token": -2.5006394386291504, "logits_per_char": -0.6001534652709961, "num_chars": 25}, {"sum_logits": -5.8640546798706055, "num_tokens": 5, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -26.395112991333008, "logits_per_token": -1.172810935974121, "logits_per_char": -0.23456218719482422, "num_chars": 25}, {"sum_logits": -6.532256603240967, "num_tokens": 5, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -26.96132469177246, "logits_per_token": -1.3064513206481934, "logits_per_char": -0.24193542974966545, "num_chars": 27}, {"sum_logits": -13.884525299072266, "num_tokens": 6, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -31.9378604888916, "logits_per_token": -2.3140875498453775, "logits_per_char": -0.4478879128732989, "num_chars": 31}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1121, "native_id": "Mercury_SC_415453", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 7.990859031677246, "incorrect_loss_raw": 7.678387641906738, "correct_loss_per_char": 0.28538782255990164, "incorrect_loss_per_char": 0.24230160156705038, "correct_loss_per_token": 1.3318098386128743, "incorrect_loss_per_token": 1.3168160975925505, "correct_loss_uncond": -18.50126552581787, "incorrect_loss_uncond": -17.57549063364665}, "model_output": [{"sum_logits": -7.990859031677246, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -26.492124557495117, "logits_per_token": -1.3318098386128743, "logits_per_char": -0.28538782255990164, "num_chars": 28}, {"sum_logits": -7.195209980010986, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -25.273120880126953, "logits_per_token": -1.0278871400015694, "logits_per_char": -0.21803666606093897, "num_chars": 33}, {"sum_logits": -8.477069854736328, "num_tokens": 5, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -25.455184936523438, "logits_per_token": -1.6954139709472655, "logits_per_char": -0.29231275361159753, "num_chars": 29}, {"sum_logits": -7.3628830909729, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -25.033329010009766, "logits_per_token": -1.2271471818288167, "logits_per_char": -0.2165553850286147, "num_chars": 34}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1122, "native_id": "Mercury_7074848", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.948795318603516, "incorrect_loss_raw": 12.039951006571451, "correct_loss_per_char": 0.734147122031764, "incorrect_loss_per_char": 0.6317746994987367, "correct_loss_per_token": 3.487198829650879, "incorrect_loss_per_token": 2.5384352525075276, "correct_loss_uncond": -11.239898681640625, "incorrect_loss_uncond": -10.155194918314615}, "model_output": [{"sum_logits": -13.948795318603516, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -25.18869400024414, "logits_per_token": -3.487198829650879, "logits_per_char": -0.734147122031764, "num_chars": 19}, {"sum_logits": -11.025436401367188, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -19.709564208984375, "logits_per_token": -2.2050872802734376, "logits_per_char": -0.5512718200683594, "num_chars": 20}, {"sum_logits": -7.826703071594238, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -22.714111328125, "logits_per_token": -1.9566757678985596, "logits_per_char": -0.5217802047729492, "num_chars": 15}, {"sum_logits": -17.26771354675293, "num_tokens": 5, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -24.161762237548828, "logits_per_token": -3.453542709350586, "logits_per_char": -0.8222720736549014, "num_chars": 21}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1123, "native_id": "Mercury_SC_400582", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 11.048057556152344, "incorrect_loss_raw": 9.60175641377767, "correct_loss_per_char": 0.9206714630126953, "incorrect_loss_per_char": 0.7281611684768919, "correct_loss_per_token": 3.6826858520507812, "incorrect_loss_per_token": 3.6542920006646056, "correct_loss_uncond": -9.076070785522461, "incorrect_loss_uncond": -7.959646860758464}, "model_output": [{"sum_logits": -11.048057556152344, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -20.124128341674805, "logits_per_token": -3.6826858520507812, "logits_per_char": -0.9206714630126953, "num_chars": 12}, {"sum_logits": -8.166717529296875, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -14.986677169799805, "logits_per_token": -4.0833587646484375, "logits_per_char": -0.5833369663783482, "num_chars": 14}, {"sum_logits": -10.664999008178711, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -18.672008514404297, "logits_per_token": -3.554999669392904, "logits_per_char": -0.888749917348226, "num_chars": 12}, {"sum_logits": -9.973552703857422, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -19.025524139404297, "logits_per_token": -3.324517567952474, "logits_per_char": -0.7123966217041016, "num_chars": 14}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1124, "native_id": "Mercury_SC_401168", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 3.9924848079681396, "incorrect_loss_raw": 7.217017491658528, "correct_loss_per_char": 0.49906060099601746, "incorrect_loss_per_char": 1.0781963931189644, "correct_loss_per_token": 1.9962424039840698, "incorrect_loss_per_token": 3.608508745829264, "correct_loss_uncond": -11.473034143447876, "incorrect_loss_uncond": -8.93500804901123}, "model_output": [{"sum_logits": -6.919950008392334, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -16.40010643005371, "logits_per_token": -3.459975004196167, "logits_per_char": -1.1533250013987224, "num_chars": 6}, {"sum_logits": -6.7305521965026855, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.877474784851074, "logits_per_token": -3.3652760982513428, "logits_per_char": -0.7478391329447428, "num_chars": 9}, {"sum_logits": -3.9924848079681396, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.465518951416016, "logits_per_token": -1.9962424039840698, "logits_per_char": -0.49906060099601746, "num_chars": 8}, {"sum_logits": -8.000550270080566, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -16.178495407104492, "logits_per_token": -4.000275135040283, "logits_per_char": -1.3334250450134277, "num_chars": 6}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1125, "native_id": "Mercury_180828", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 9.45998764038086, "incorrect_loss_raw": 12.99808152516683, "correct_loss_per_char": 0.27028536115373886, "incorrect_loss_per_char": 0.4486819919357952, "correct_loss_per_token": 1.5766646067301433, "incorrect_loss_per_token": 2.3460080888536243, "correct_loss_uncond": -24.36960220336914, "incorrect_loss_uncond": -17.155917167663574}, "model_output": [{"sum_logits": -6.949915885925293, "num_tokens": 5, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -25.510650634765625, "logits_per_token": -1.3899831771850586, "logits_per_char": -0.2673044571509728, "num_chars": 26}, {"sum_logits": -9.219589233398438, "num_tokens": 5, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -30.90704917907715, "logits_per_token": -1.8439178466796875, "logits_per_char": -0.3179168701171875, "num_chars": 29}, {"sum_logits": -9.45998764038086, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -33.82958984375, "logits_per_token": -1.5766646067301433, "logits_per_char": -0.27028536115373886, "num_chars": 35}, {"sum_logits": -22.824739456176758, "num_tokens": 6, "num_tokens_all": 197, "is_greedy": false, "sum_logits_uncond": -34.04429626464844, "logits_per_token": -3.8041232426961265, "logits_per_char": -0.7608246485392253, "num_chars": 30}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1126, "native_id": "FCAT_2008_5_1", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 10.543874740600586, "incorrect_loss_raw": 12.043609937032064, "correct_loss_per_char": 0.3514624913533529, "incorrect_loss_per_char": 0.3339848941563437, "correct_loss_per_token": 1.7573124567667644, "incorrect_loss_per_token": 1.7150832168639651, "correct_loss_uncond": -20.896921157836914, "incorrect_loss_uncond": -23.086052894592285}, "model_output": [{"sum_logits": -10.543874740600586, "num_tokens": 6, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -31.4407958984375, "logits_per_token": -1.7573124567667644, "logits_per_char": -0.3514624913533529, "num_chars": 30}, {"sum_logits": -12.651522636413574, "num_tokens": 7, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -35.67729949951172, "logits_per_token": -1.8073603766305106, "logits_per_char": -0.37210360695334044, "num_chars": 34}, {"sum_logits": -9.67142105102539, "num_tokens": 6, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -30.648677825927734, "logits_per_token": -1.6119035085042317, "logits_per_char": -0.2930733651825876, "num_chars": 33}, {"sum_logits": -13.807886123657227, "num_tokens": 8, "num_tokens_all": 205, "is_greedy": false, "sum_logits_uncond": -39.063011169433594, "logits_per_token": -1.7259857654571533, "logits_per_char": -0.3367777103331031, "num_chars": 41}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1127, "native_id": "TAKS_2009_5_25", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 25.849985122680664, "incorrect_loss_raw": 17.071590741475422, "correct_loss_per_char": 0.6304874420166016, "incorrect_loss_per_char": 0.6000589014234996, "correct_loss_per_token": 3.692855017525809, "incorrect_loss_per_token": 2.850981032659137, "correct_loss_uncond": -14.900152206420898, "incorrect_loss_uncond": -13.606913566589355}, "model_output": [{"sum_logits": -12.668791770935059, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -30.157655715942383, "logits_per_token": -2.1114652951558432, "logits_per_char": -0.45245684896196636, "num_chars": 28}, {"sum_logits": -22.185070037841797, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -35.81839370727539, "logits_per_token": -3.1692957196916853, "logits_per_char": -0.6932834386825562, "num_chars": 32}, {"sum_logits": -25.849985122680664, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -40.75013732910156, "logits_per_token": -3.692855017525809, "logits_per_char": -0.6304874420166016, "num_chars": 41}, {"sum_logits": -16.360910415649414, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -26.059463500976562, "logits_per_token": -3.272182083129883, "logits_per_char": -0.6544364166259765, "num_chars": 25}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1128, "native_id": "Mercury_SC_LBS10392", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 26.898719787597656, "incorrect_loss_raw": 20.307287216186523, "correct_loss_per_char": 0.6404457092285156, "incorrect_loss_per_char": 0.6274366589616572, "correct_loss_per_token": 2.9887466430664062, "incorrect_loss_per_token": 3.3568783159609197, "correct_loss_uncond": -12.032463073730469, "incorrect_loss_uncond": -9.29306411743164}, "model_output": [{"sum_logits": -26.898719787597656, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -38.931182861328125, "logits_per_token": -2.9887466430664062, "logits_per_char": -0.6404457092285156, "num_chars": 42}, {"sum_logits": -27.142385482788086, "num_tokens": 9, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -39.568626403808594, "logits_per_token": -3.015820609198676, "logits_per_char": -0.6312182670415833, "num_chars": 43}, {"sum_logits": -16.68065643310547, "num_tokens": 6, "num_tokens_all": 188, "is_greedy": false, "sum_logits_uncond": -26.461257934570312, "logits_per_token": -2.780109405517578, "logits_per_char": -0.6178020901150174, "num_chars": 27}, {"sum_logits": -17.098819732666016, "num_tokens": 4, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -22.771169662475586, "logits_per_token": -4.274704933166504, "logits_per_char": -0.633289619728371, "num_chars": 27}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1129, "native_id": "Mercury_7212905", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.95964241027832, "incorrect_loss_raw": 23.238983154296875, "correct_loss_per_char": 0.48940475314271215, "incorrect_loss_per_char": 0.4653942758065683, "correct_loss_per_token": 3.11995530128479, "incorrect_loss_per_token": 2.4845104458356144, "correct_loss_uncond": -9.288286209106445, "incorrect_loss_uncond": -12.40792465209961}, "model_output": [{"sum_logits": -25.328641891479492, "num_tokens": 9, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -34.86460494995117, "logits_per_token": -2.814293543497721, "logits_per_char": -0.4690489239162869, "num_chars": 54}, {"sum_logits": -24.95964241027832, "num_tokens": 8, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -34.247928619384766, "logits_per_token": -3.11995530128479, "logits_per_char": -0.48940475314271215, "num_chars": 51}, {"sum_logits": -17.71548843383789, "num_tokens": 8, "num_tokens_all": 222, "is_greedy": false, "sum_logits_uncond": -30.624523162841797, "logits_per_token": -2.2144360542297363, "logits_per_char": -0.3936775207519531, "num_chars": 45}, {"sum_logits": -26.672819137573242, "num_tokens": 11, "num_tokens_all": 225, "is_greedy": false, "sum_logits_uncond": -41.451595306396484, "logits_per_token": -2.4248017397793857, "logits_per_char": -0.5334563827514649, "num_chars": 50}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1130, "native_id": "Mercury_7212888", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.694290161132812, "incorrect_loss_raw": 12.099784851074219, "correct_loss_per_char": 0.3966965675354004, "incorrect_loss_per_char": 0.40622258434548847, "correct_loss_per_token": 3.173572540283203, "incorrect_loss_per_token": 2.7798521041870115, "correct_loss_uncond": -13.504131317138672, "incorrect_loss_uncond": -14.484435399373373}, "model_output": [{"sum_logits": -12.694290161132812, "num_tokens": 4, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -26.198421478271484, "logits_per_token": -3.173572540283203, "logits_per_char": -0.3966965675354004, "num_chars": 32}, {"sum_logits": -8.49742317199707, "num_tokens": 4, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -24.70763397216797, "logits_per_token": -2.1243557929992676, "logits_per_char": -0.3147193767406322, "num_chars": 27}, {"sum_logits": -14.705646514892578, "num_tokens": 5, "num_tokens_all": 211, "is_greedy": false, "sum_logits_uncond": -27.787763595581055, "logits_per_token": -2.9411293029785157, "logits_per_char": -0.5070912591342268, "num_chars": 29}, {"sum_logits": -13.096284866333008, "num_tokens": 4, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -27.25726318359375, "logits_per_token": -3.274071216583252, "logits_per_char": -0.3968571171616063, "num_chars": 33}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1131, "native_id": "MDSA_2007_8_42", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 32.151580810546875, "incorrect_loss_raw": 28.20349629720052, "correct_loss_per_char": 0.4528391663457306, "incorrect_loss_per_char": 0.599471222012303, "correct_loss_per_token": 2.4731985238882213, "incorrect_loss_per_token": 3.0082890767899766, "correct_loss_uncond": -18.381126403808594, "incorrect_loss_uncond": -13.070700327555338}, "model_output": [{"sum_logits": -26.10202407836914, "num_tokens": 7, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -33.33995819091797, "logits_per_token": -3.7288605826241628, "logits_per_char": -0.7677065905402688, "num_chars": 34}, {"sum_logits": -25.218074798583984, "num_tokens": 10, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -42.195465087890625, "logits_per_token": -2.5218074798583983, "logits_per_char": -0.48496297689584583, "num_chars": 52}, {"sum_logits": -33.29039001464844, "num_tokens": 12, "num_tokens_all": 209, "is_greedy": false, "sum_logits_uncond": -48.287166595458984, "logits_per_token": -2.7741991678873696, "logits_per_char": -0.5457440986007941, "num_chars": 61}, {"sum_logits": -32.151580810546875, "num_tokens": 13, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -50.53270721435547, "logits_per_token": -2.4731985238882213, "logits_per_char": -0.4528391663457306, "num_chars": 71}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1132, "native_id": "Mercury_SC_415534", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 4.972237586975098, "incorrect_loss_raw": 7.805889129638672, "correct_loss_per_char": 1.2430593967437744, "incorrect_loss_per_char": 1.366812409294976, "correct_loss_per_token": 1.6574125289916992, "incorrect_loss_per_token": 2.4090996053483753, "correct_loss_uncond": -9.033459663391113, "incorrect_loss_uncond": -8.682095845540365}, "model_output": [{"sum_logits": -6.943083763122559, "num_tokens": 4, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -17.035608291625977, "logits_per_token": -1.7357709407806396, "logits_per_char": -1.157180627187093, "num_chars": 6}, {"sum_logits": -4.972237586975098, "num_tokens": 3, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -14.005697250366211, "logits_per_token": -1.6574125289916992, "logits_per_char": -1.2430593967437744, "num_chars": 4}, {"sum_logits": -5.924779891967773, "num_tokens": 3, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -16.810949325561523, "logits_per_token": -1.9749266306559246, "logits_per_char": -1.1849559783935546, "num_chars": 5}, {"sum_logits": -10.549803733825684, "num_tokens": 3, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -15.61739730834961, "logits_per_token": -3.516601244608561, "logits_per_char": -1.7583006223042805, "num_chars": 6}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1133, "native_id": "Mercury_7213413", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 19.170761108398438, "incorrect_loss_raw": 14.412439982096354, "correct_loss_per_char": 0.39124002262037627, "incorrect_loss_per_char": 0.32082631797845573, "correct_loss_per_token": 2.1300845675998263, "incorrect_loss_per_token": 1.6651022444555768, "correct_loss_uncond": -17.546043395996094, "incorrect_loss_uncond": -17.470354080200195}, "model_output": [{"sum_logits": -22.74297523498535, "num_tokens": 11, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -38.63800048828125, "logits_per_token": -2.0675432031804863, "logits_per_char": -0.4135086406360973, "num_chars": 55}, {"sum_logits": -9.604108810424805, "num_tokens": 7, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -26.149585723876953, "logits_per_token": -1.3720155443464006, "logits_per_char": -0.22866925739106678, "num_chars": 42}, {"sum_logits": -10.890235900878906, "num_tokens": 7, "num_tokens_all": 210, "is_greedy": false, "sum_logits_uncond": -30.860795974731445, "logits_per_token": -1.5557479858398438, "logits_per_char": -0.3203010559082031, "num_chars": 34}, {"sum_logits": -19.170761108398438, "num_tokens": 9, "num_tokens_all": 212, "is_greedy": false, "sum_logits_uncond": -36.71680450439453, "logits_per_token": -2.1300845675998263, "logits_per_char": -0.39124002262037627, "num_chars": 49}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1134, "native_id": "Mercury_7068635", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 5.080506324768066, "incorrect_loss_raw": 5.3518721262613935, "correct_loss_per_char": 1.0161012649536132, "incorrect_loss_per_char": 1.2686173677444457, "correct_loss_per_token": 5.080506324768066, "incorrect_loss_per_token": 5.3518721262613935, "correct_loss_uncond": -6.633879661560059, "incorrect_loss_uncond": -6.449424743652344}, "model_output": [{"sum_logits": -2.865172863006592, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -11.566265106201172, "logits_per_token": -2.865172863006592, "logits_per_char": -0.716293215751648, "num_chars": 4}, {"sum_logits": -9.029403686523438, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -12.094873428344727, "logits_per_token": -9.029403686523438, "logits_per_char": -2.2573509216308594, "num_chars": 4}, {"sum_logits": -5.080506324768066, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -11.714385986328125, "logits_per_token": -5.080506324768066, "logits_per_char": -1.0161012649536132, "num_chars": 5}, {"sum_logits": -4.16103982925415, "num_tokens": 1, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -11.742752075195312, "logits_per_token": -4.16103982925415, "logits_per_char": -0.8322079658508301, "num_chars": 5}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1135, "native_id": "Mercury_417137", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 13.82275104522705, "incorrect_loss_raw": 10.100007057189941, "correct_loss_per_char": 1.382275104522705, "incorrect_loss_per_char": 1.2844043093383626, "correct_loss_per_token": 4.607583681742351, "incorrect_loss_per_token": 8.767249902089437, "correct_loss_uncond": -3.314425468444824, "incorrect_loss_uncond": -4.299421310424805}, "model_output": [{"sum_logits": -9.899309158325195, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -13.479253768920898, "logits_per_token": -9.899309158325195, "logits_per_char": -1.414187022617885, "num_chars": 7}, {"sum_logits": -12.404169082641602, "num_tokens": 1, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -15.226165771484375, "logits_per_token": -12.404169082641602, "logits_per_char": -1.5505211353302002, "num_chars": 8}, {"sum_logits": -13.82275104522705, "num_tokens": 3, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -17.137176513671875, "logits_per_token": -4.607583681742351, "logits_per_char": -1.382275104522705, "num_chars": 10}, {"sum_logits": -7.996542930603027, "num_tokens": 2, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -14.492865562438965, "logits_per_token": -3.9982714653015137, "logits_per_char": -0.888504770067003, "num_chars": 9}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1136, "native_id": "Mercury_7268258", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 3.4668116569519043, "incorrect_loss_raw": 2.5502691666285195, "correct_loss_per_char": 0.49525880813598633, "incorrect_loss_per_char": 0.41024677147940986, "correct_loss_per_token": 3.4668116569519043, "incorrect_loss_per_token": 2.5502691666285195, "correct_loss_uncond": -8.97847032546997, "incorrect_loss_uncond": -8.893911004066467}, "model_output": [{"sum_logits": -3.1655635833740234, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -12.06695556640625, "logits_per_token": -3.1655635833740234, "logits_per_char": -0.5275939305623373, "num_chars": 6}, {"sum_logits": -2.6206846237182617, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -9.888994216918945, "logits_per_token": -2.6206846237182617, "logits_per_char": -0.43678077061971027, "num_chars": 6}, {"sum_logits": -3.4668116569519043, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -12.445281982421875, "logits_per_token": -3.4668116569519043, "logits_per_char": -0.49525880813598633, "num_chars": 7}, {"sum_logits": -1.864559292793274, "num_tokens": 1, "num_tokens_all": 176, "is_greedy": true, "sum_logits_uncond": -12.376590728759766, "logits_per_token": -1.864559292793274, "logits_per_char": -0.26636561325618197, "num_chars": 7}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1137, "native_id": "NAEP_2005_4_S13+14", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 24.42173194885254, "incorrect_loss_raw": 21.399930000305176, "correct_loss_per_char": 0.38764653887067524, "incorrect_loss_per_char": 0.39432872872897956, "correct_loss_per_token": 1.628115463256836, "incorrect_loss_per_token": 1.839407270604914, "correct_loss_uncond": -19.4333553314209, "incorrect_loss_uncond": -16.414659182230633}, "model_output": [{"sum_logits": -15.974335670471191, "num_tokens": 10, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -32.75582504272461, "logits_per_token": -1.5974335670471191, "logits_per_char": -0.35498523712158203, "num_chars": 45}, {"sum_logits": -24.44046401977539, "num_tokens": 11, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -40.72406768798828, "logits_per_token": -2.2218603654341265, "logits_per_char": -0.4443720730868253, "num_chars": 55}, {"sum_logits": -24.42173194885254, "num_tokens": 15, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -43.85508728027344, "logits_per_token": -1.628115463256836, "logits_per_char": -0.38764653887067524, "num_chars": 63}, {"sum_logits": -23.784990310668945, "num_tokens": 14, "num_tokens_all": 218, "is_greedy": false, "sum_logits_uncond": -39.96387481689453, "logits_per_token": -1.698927879333496, "logits_per_char": -0.3836288759785314, "num_chars": 62}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1138, "native_id": "Mercury_SC_406089", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 27.22121238708496, "incorrect_loss_raw": 33.05179087320963, "correct_loss_per_char": 0.4536868731180827, "incorrect_loss_per_char": 0.6654783722417575, "correct_loss_per_token": 2.4746556715531782, "incorrect_loss_per_token": 4.045456863584973, "correct_loss_uncond": -21.408777236938477, "incorrect_loss_uncond": -10.73489507039388}, "model_output": [{"sum_logits": -27.57977294921875, "num_tokens": 7, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -39.0371208190918, "logits_per_token": -3.939967564174107, "logits_per_char": -0.6894943237304687, "num_chars": 40}, {"sum_logits": -41.5537223815918, "num_tokens": 8, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -49.9385986328125, "logits_per_token": -5.194215297698975, "logits_per_char": -0.8147788702272901, "num_chars": 51}, {"sum_logits": -30.02187728881836, "num_tokens": 10, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -42.38433837890625, "logits_per_token": -3.002187728881836, "logits_per_char": -0.4921619227675141, "num_chars": 61}, {"sum_logits": -27.22121238708496, "num_tokens": 11, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -48.62998962402344, "logits_per_token": -2.4746556715531782, "logits_per_char": -0.4536868731180827, "num_chars": 60}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1139, "native_id": "Mercury_SC_400700", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 15.468183517456055, "incorrect_loss_raw": 11.775789260864258, "correct_loss_per_char": 0.4070574609856856, "incorrect_loss_per_char": 0.3510398555446315, "correct_loss_per_token": 1.9335229396820068, "incorrect_loss_per_token": 1.4908616057148685, "correct_loss_uncond": -16.352670669555664, "incorrect_loss_uncond": -11.438717524210611}, "model_output": [{"sum_logits": -7.579283714294434, "num_tokens": 6, "num_tokens_all": 189, "is_greedy": false, "sum_logits_uncond": -20.999343872070312, "logits_per_token": -1.2632139523824055, "logits_per_char": -0.30317134857177735, "num_chars": 25}, {"sum_logits": -18.65805435180664, "num_tokens": 9, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -26.11164093017578, "logits_per_token": -2.073117150200738, "logits_per_char": -0.5042717392380173, "num_chars": 37}, {"sum_logits": -15.468183517456055, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -31.82085418701172, "logits_per_token": -1.9335229396820068, "logits_per_char": -0.4070574609856856, "num_chars": 38}, {"sum_logits": -9.0900297164917, "num_tokens": 8, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -22.532535552978516, "logits_per_token": -1.1362537145614624, "logits_per_char": -0.2456764788241, "num_chars": 37}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1140, "native_id": "Mercury_7223493", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 16.535234451293945, "incorrect_loss_raw": 17.738970438639324, "correct_loss_per_char": 0.5511744817097982, "incorrect_loss_per_char": 0.6430451437808036, "correct_loss_per_token": 2.7558724085489907, "incorrect_loss_per_token": 3.020411072836982, "correct_loss_uncond": -7.072229385375977, "incorrect_loss_uncond": -12.251101811726889}, "model_output": [{"sum_logits": -16.535234451293945, "num_tokens": 6, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -23.607463836669922, "logits_per_token": -2.7558724085489907, "logits_per_char": -0.5511744817097982, "num_chars": 30}, {"sum_logits": -10.390691757202148, "num_tokens": 8, "num_tokens_all": 217, "is_greedy": false, "sum_logits_uncond": -34.1429443359375, "logits_per_token": -1.2988364696502686, "logits_per_char": -0.2597672939300537, "num_chars": 40}, {"sum_logits": -24.08541488647461, "num_tokens": 6, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -29.085195541381836, "logits_per_token": -4.014235814412435, "logits_per_char": -0.7769488673056325, "num_chars": 31}, {"sum_logits": -18.74080467224121, "num_tokens": 5, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -26.742076873779297, "logits_per_token": -3.7481609344482423, "logits_per_char": -0.8924192701067243, "num_chars": 21}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1141, "native_id": "Mercury_SC_405928", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 12.63876724243164, "incorrect_loss_raw": 23.486679077148438, "correct_loss_per_char": 0.3415883038495038, "incorrect_loss_per_char": 0.5971529124610316, "correct_loss_per_token": 1.579845905303955, "incorrect_loss_per_token": 3.362912026662675, "correct_loss_uncond": -19.19140625, "incorrect_loss_uncond": -13.331343332926432}, "model_output": [{"sum_logits": -23.210182189941406, "num_tokens": 8, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -35.6402702331543, "logits_per_token": -2.901272773742676, "logits_per_char": -0.5275041406804865, "num_chars": 44}, {"sum_logits": -18.37432861328125, "num_tokens": 6, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -33.085853576660156, "logits_per_token": -3.0623881022135415, "logits_per_char": -0.4835349635074013, "num_chars": 38}, {"sum_logits": -28.875526428222656, "num_tokens": 7, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -41.727943420410156, "logits_per_token": -4.125075204031808, "logits_per_char": -0.7804196331952069, "num_chars": 37}, {"sum_logits": -12.63876724243164, "num_tokens": 8, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -31.83017349243164, "logits_per_token": -1.579845905303955, "logits_per_char": -0.3415883038495038, "num_chars": 37}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1142, "native_id": "MCAS_2009_5_6518", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 16.082717895507812, "incorrect_loss_raw": 17.086223284403484, "correct_loss_per_char": 0.5545764791554418, "incorrect_loss_per_char": 0.626827239147021, "correct_loss_per_token": 2.2975311279296875, "incorrect_loss_per_token": 2.440889040629069, "correct_loss_uncond": -12.876466751098633, "incorrect_loss_uncond": -12.14548397064209}, "model_output": [{"sum_logits": -18.136302947998047, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -31.02060317993164, "logits_per_token": -2.590900421142578, "logits_per_char": -0.6975501133845403, "num_chars": 26}, {"sum_logits": -15.96576976776123, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.451751708984375, "logits_per_token": -2.2808242525373186, "logits_per_char": -0.5913248062133789, "num_chars": 27}, {"sum_logits": -17.156597137451172, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.222766876220703, "logits_per_token": -2.4509424482073103, "logits_per_char": -0.5916067978431439, "num_chars": 29}, {"sum_logits": -16.082717895507812, "num_tokens": 7, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -28.959184646606445, "logits_per_token": -2.2975311279296875, "logits_per_char": -0.5545764791554418, "num_chars": 29}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1143, "native_id": "MCAS_2006_9_1", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 7.95509147644043, "incorrect_loss_raw": 22.85133934020996, "correct_loss_per_char": 0.14205520493643625, "incorrect_loss_per_char": 0.40356356041078517, "correct_loss_per_token": 0.6629242897033691, "incorrect_loss_per_token": 1.6930588255712997, "correct_loss_uncond": -31.797300338745117, "incorrect_loss_uncond": -26.148940404256184}, "model_output": [{"sum_logits": -17.092899322509766, "num_tokens": 11, "num_tokens_all": 200, "is_greedy": false, "sum_logits_uncond": -39.59830093383789, "logits_per_token": -1.5538999384099788, "logits_per_char": -0.3636787089895695, "num_chars": 47}, {"sum_logits": -7.95509147644043, "num_tokens": 12, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -39.75239181518555, "logits_per_token": -0.6629242897033691, "logits_per_char": -0.14205520493643625, "num_chars": 56}, {"sum_logits": -31.608707427978516, "num_tokens": 15, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -56.711082458496094, "logits_per_token": -2.107247161865234, "logits_per_char": -0.5268117904663085, "num_chars": 60}, {"sum_logits": -19.8524112701416, "num_tokens": 14, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -50.69145584106445, "logits_per_token": -1.4180293764386858, "logits_per_char": -0.32020018177647747, "num_chars": 62}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1144, "native_id": "Mercury_7239383", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 32.41949462890625, "incorrect_loss_raw": 22.19468053181966, "correct_loss_per_char": 0.648389892578125, "incorrect_loss_per_char": 0.5067750594153597, "correct_loss_per_token": 3.6021660698784723, "incorrect_loss_per_token": 2.9388021892971463, "correct_loss_uncond": -11.564613342285156, "incorrect_loss_uncond": -13.876021067301432}, "model_output": [{"sum_logits": -19.533815383911133, "num_tokens": 5, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -31.322574615478516, "logits_per_token": -3.9067630767822266, "logits_per_char": -0.6301230769003591, "num_chars": 31}, {"sum_logits": -32.41949462890625, "num_tokens": 9, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -43.984107971191406, "logits_per_token": -3.6021660698784723, "logits_per_char": -0.648389892578125, "num_chars": 50}, {"sum_logits": -18.415878295898438, "num_tokens": 9, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -34.6004638671875, "logits_per_token": -2.046208699544271, "logits_per_char": -0.3175151430327317, "num_chars": 58}, {"sum_logits": -28.634347915649414, "num_tokens": 10, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -42.289066314697266, "logits_per_token": -2.8634347915649414, "logits_per_char": -0.5726869583129883, "num_chars": 50}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1145, "native_id": "Mercury_SC_400130", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 14.532594680786133, "incorrect_loss_raw": 7.854112545649211, "correct_loss_per_char": 0.9688396453857422, "incorrect_loss_per_char": 0.6637219835210729, "correct_loss_per_token": 4.844198226928711, "incorrect_loss_per_token": 3.0951091580920753, "correct_loss_uncond": -7.425678253173828, "incorrect_loss_uncond": -10.265467087427774}, "model_output": [{"sum_logits": -14.975048065185547, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -25.118331909179688, "logits_per_token": -4.991682688395183, "logits_per_char": -0.7487524032592774, "num_chars": 20}, {"sum_logits": -14.532594680786133, "num_tokens": 3, "num_tokens_all": 177, "is_greedy": false, "sum_logits_uncond": -21.95827293395996, "logits_per_token": -4.844198226928711, "logits_per_char": -0.9688396453857422, "num_chars": 15}, {"sum_logits": -3.3984248638153076, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -14.055558204650879, "logits_per_token": -1.6992124319076538, "logits_per_char": -0.3776027626461453, "num_chars": 9}, {"sum_logits": -5.188864707946777, "num_tokens": 2, "num_tokens_all": 176, "is_greedy": false, "sum_logits_uncond": -15.18484878540039, "logits_per_token": -2.5944323539733887, "logits_per_char": -0.8648107846577963, "num_chars": 6}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1146, "native_id": "Mercury_401426", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 2, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 24.512161254882812, "incorrect_loss_raw": 20.965735117594402, "correct_loss_per_char": 0.532873070758322, "incorrect_loss_per_char": 0.5622985370616852, "correct_loss_per_token": 2.723573472764757, "incorrect_loss_per_token": 3.1524371646699447, "correct_loss_uncond": -10.79903793334961, "incorrect_loss_uncond": -11.38252321879069}, "model_output": [{"sum_logits": -21.853191375732422, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -35.66796875, "logits_per_token": -3.1218844822474887, "logits_per_char": -0.6427409228156594, "num_chars": 34}, {"sum_logits": -19.823850631713867, "num_tokens": 6, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -27.881803512573242, "logits_per_token": -3.3039751052856445, "logits_per_char": -0.5506625175476074, "num_chars": 36}, {"sum_logits": -21.220163345336914, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -33.49500274658203, "logits_per_token": -3.031451906476702, "logits_per_char": -0.4934921708217887, "num_chars": 43}, {"sum_logits": -24.512161254882812, "num_tokens": 9, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -35.31119918823242, "logits_per_token": -2.723573472764757, "logits_per_char": -0.532873070758322, "num_chars": 46}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1147, "native_id": "MCAS_2010_8_12016", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 41.019813537597656, "incorrect_loss_raw": 28.25287119547526, "correct_loss_per_char": 0.8371390517877073, "incorrect_loss_per_char": 0.6672671021580978, "correct_loss_per_token": 4.101981353759766, "incorrect_loss_per_token": 3.0031269709269206, "correct_loss_uncond": -6.150596618652344, "incorrect_loss_uncond": -10.015881856282553}, "model_output": [{"sum_logits": -21.340782165527344, "num_tokens": 8, "num_tokens_all": 221, "is_greedy": false, "sum_logits_uncond": -33.33760070800781, "logits_per_token": -2.667597770690918, "logits_per_char": -0.6668994426727295, "num_chars": 32}, {"sum_logits": -30.900527954101562, "num_tokens": 10, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -42.53913116455078, "logits_per_token": -3.0900527954101564, "logits_per_char": -0.657458041576629, "num_chars": 47}, {"sum_logits": -41.019813537597656, "num_tokens": 10, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -47.17041015625, "logits_per_token": -4.101981353759766, "logits_per_char": -0.8371390517877073, "num_chars": 49}, {"sum_logits": -32.517303466796875, "num_tokens": 10, "num_tokens_all": 223, "is_greedy": false, "sum_logits_uncond": -38.929527282714844, "logits_per_token": -3.2517303466796874, "logits_per_char": -0.6774438222249349, "num_chars": 48}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1148, "native_id": "Mercury_SC_400324", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 17.4812068939209, "incorrect_loss_raw": 23.689457575480144, "correct_loss_per_char": 0.48558908038669163, "incorrect_loss_per_char": 0.6738581889565172, "correct_loss_per_token": 1.9423563215467665, "incorrect_loss_per_token": 3.1946802088823265, "correct_loss_uncond": -14.990434646606445, "incorrect_loss_uncond": -11.249856313069662}, "model_output": [{"sum_logits": -15.787687301635742, "num_tokens": 7, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -31.882762908935547, "logits_per_token": -2.2553839002336775, "logits_per_char": -0.4784147667162346, "num_chars": 33}, {"sum_logits": -17.4812068939209, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -32.471641540527344, "logits_per_token": -1.9423563215467665, "logits_per_char": -0.48558908038669163, "num_chars": 36}, {"sum_logits": -33.92623519897461, "num_tokens": 9, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -47.929527282714844, "logits_per_token": -3.7695816887749567, "logits_per_char": -0.7218347914675449, "num_chars": 47}, {"sum_logits": -21.354450225830078, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -25.005651473999023, "logits_per_token": -3.559075037638346, "logits_per_char": -0.8213250086857722, "num_chars": 26}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1149, "native_id": "Mercury_SC_LBS10662", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 1.634781837463379, "incorrect_loss_raw": 5.691454569498698, "correct_loss_per_char": 0.1362318197886149, "incorrect_loss_per_char": 0.5468351516782913, "correct_loss_per_token": 0.8173909187316895, "incorrect_loss_per_token": 2.845727284749349, "correct_loss_uncond": -17.161288261413574, "incorrect_loss_uncond": -10.17986265818278}, "model_output": [{"sum_logits": -8.083614349365234, "num_tokens": 2, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -16.07807731628418, "logits_per_token": -4.041807174682617, "logits_per_char": -0.8981793721516927, "num_chars": 9}, {"sum_logits": -5.363555908203125, "num_tokens": 2, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -16.3914794921875, "logits_per_token": -2.6817779541015625, "logits_per_char": -0.4125812237079327, "num_chars": 13}, {"sum_logits": -1.634781837463379, "num_tokens": 2, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -18.796070098876953, "logits_per_token": -0.8173909187316895, "logits_per_char": -0.1362318197886149, "num_chars": 12}, {"sum_logits": -3.6271934509277344, "num_tokens": 2, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -15.144394874572754, "logits_per_token": -1.8135967254638672, "logits_per_char": -0.3297448591752486, "num_chars": 11}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1150, "native_id": "VASoL_2009_3_8", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.873221397399902, "incorrect_loss_raw": 10.395643552144369, "correct_loss_per_char": 2.2183053493499756, "incorrect_loss_per_char": 1.9897374259101017, "correct_loss_per_token": 8.873221397399902, "incorrect_loss_per_token": 8.37135648727417, "correct_loss_uncond": 0.12125396728515625, "incorrect_loss_uncond": -1.2802775700887044}, "model_output": [{"sum_logits": -8.045215606689453, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -11.912313461303711, "logits_per_token": -8.045215606689453, "logits_per_char": -1.3408692677815754, "num_chars": 6}, {"sum_logits": -10.995992660522461, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -11.24551010131836, "logits_per_token": -10.995992660522461, "logits_per_char": -2.199198532104492, "num_chars": 5}, {"sum_logits": -12.145722389221191, "num_tokens": 2, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -11.869939804077148, "logits_per_token": -6.072861194610596, "logits_per_char": -2.4291444778442384, "num_chars": 5}, {"sum_logits": -8.873221397399902, "num_tokens": 1, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -8.751967430114746, "logits_per_token": -8.873221397399902, "logits_per_char": -2.2183053493499756, "num_chars": 4}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1151, "native_id": "Mercury_SC_401185", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 2, "predicted_index_uncond": 1, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 23.336849212646484, "incorrect_loss_raw": 24.340927124023438, "correct_loss_per_char": 0.4667369842529297, "incorrect_loss_per_char": 0.5448069937435197, "correct_loss_per_token": 3.3338356018066406, "incorrect_loss_per_token": 3.0004952789901136, "correct_loss_uncond": -10.484779357910156, "incorrect_loss_uncond": -11.584063847859701}, "model_output": [{"sum_logits": -17.744367599487305, "num_tokens": 6, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -29.7958927154541, "logits_per_token": -2.957394599914551, "logits_per_char": -0.506981931413923, "num_chars": 35}, {"sum_logits": -19.61153221130371, "num_tokens": 7, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -36.601417541503906, "logits_per_token": -2.801647458757673, "logits_per_char": -0.4669412431262788, "num_chars": 42}, {"sum_logits": -23.336849212646484, "num_tokens": 7, "num_tokens_all": 215, "is_greedy": false, "sum_logits_uncond": -33.82162857055664, "logits_per_token": -3.3338356018066406, "logits_per_char": -0.4667369842529297, "num_chars": 50}, {"sum_logits": -35.6668815612793, "num_tokens": 11, "num_tokens_all": 219, "is_greedy": false, "sum_logits_uncond": -41.377662658691406, "logits_per_token": -3.2424437782981177, "logits_per_char": -0.6604978066903574, "num_chars": 54}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1152, "native_id": "NYSEDREGENTS_2015_8_29", "metrics": {"predicted_index_raw": 2, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 11.591854095458984, "incorrect_loss_raw": 16.880716959635418, "correct_loss_per_char": 0.6439918941921658, "incorrect_loss_per_char": 1.1333455089538818, "correct_loss_per_token": 3.8639513651529946, "incorrect_loss_per_token": 6.032649093204075, "correct_loss_uncond": -9.660097122192383, "incorrect_loss_uncond": -7.510458628336589}, "model_output": [{"sum_logits": -13.847238540649414, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -22.752574920654297, "logits_per_token": -4.615746180216472, "logits_per_char": -0.8654524087905884, "num_chars": 16}, {"sum_logits": -17.133892059326172, "num_tokens": 2, "num_tokens_all": 180, "is_greedy": false, "sum_logits_uncond": -21.489925384521484, "logits_per_token": -8.566946029663086, "logits_per_char": -1.2238494328090124, "num_chars": 14}, {"sum_logits": -11.591854095458984, "num_tokens": 3, "num_tokens_all": 181, "is_greedy": false, "sum_logits_uncond": -21.251951217651367, "logits_per_token": -3.8639513651529946, "logits_per_char": -0.6439918941921658, "num_chars": 18}, {"sum_logits": -19.661020278930664, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -28.931026458740234, "logits_per_token": -4.915255069732666, "logits_per_char": -1.3107346852620443, "num_chars": 15}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1153, "native_id": "Mercury_7234378", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 16.908164978027344, "incorrect_loss_raw": 21.577816009521484, "correct_loss_per_char": 0.33153264662798715, "incorrect_loss_per_char": 0.3666261474867374, "correct_loss_per_token": 2.8180274963378906, "incorrect_loss_per_token": 2.3366775017280084, "correct_loss_uncond": -18.829238891601562, "incorrect_loss_uncond": -17.292540232340496}, "model_output": [{"sum_logits": -21.659591674804688, "num_tokens": 7, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -37.94856643676758, "logits_per_token": -3.0942273821149553, "logits_per_char": -0.4708606885827106, "num_chars": 46}, {"sum_logits": -22.329069137573242, "num_tokens": 11, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -39.868011474609375, "logits_per_token": -2.029915376143022, "logits_per_char": -0.3283686637878418, "num_chars": 68}, {"sum_logits": -20.744787216186523, "num_tokens": 11, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -38.794490814208984, "logits_per_token": -1.8858897469260476, "logits_per_char": -0.3006490900896598, "num_chars": 69}, {"sum_logits": -16.908164978027344, "num_tokens": 6, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -35.737403869628906, "logits_per_token": -2.8180274963378906, "logits_per_char": -0.33153264662798715, "num_chars": 51}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1154, "native_id": "ACTAAP_2014_7_3", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 22.16341781616211, "incorrect_loss_raw": 22.118831952412922, "correct_loss_per_char": 0.307825247446696, "incorrect_loss_per_char": 0.342816976609239, "correct_loss_per_token": 1.7048782935509315, "incorrect_loss_per_token": 1.6323155256418083, "correct_loss_uncond": -12.099994659423828, "incorrect_loss_uncond": -11.705276171366373}, "model_output": [{"sum_logits": -13.232794761657715, "num_tokens": 14, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -27.480022430419922, "logits_per_token": -0.9451996258326939, "logits_per_char": -0.2169310616665199, "num_chars": 61}, {"sum_logits": -24.513870239257812, "num_tokens": 14, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -35.27825164794922, "logits_per_token": -1.750990731375558, "logits_per_char": -0.4085645039876302, "num_chars": 60}, {"sum_logits": -28.609830856323242, "num_tokens": 13, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -38.71405029296875, "logits_per_token": -2.2007562197171726, "logits_per_char": -0.4029553641735668, "num_chars": 71}, {"sum_logits": -22.16341781616211, "num_tokens": 13, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -34.26341247558594, "logits_per_token": -1.7048782935509315, "logits_per_char": -0.307825247446696, "num_chars": 72}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1155, "native_id": "MDSA_2008_8_27", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 12.358762741088867, "incorrect_loss_raw": 10.663518269856771, "correct_loss_per_char": 0.32523059844970703, "incorrect_loss_per_char": 0.3424471135725055, "correct_loss_per_token": 1.7655375344412667, "incorrect_loss_per_token": 1.8992752801804318, "correct_loss_uncond": -17.850404739379883, "incorrect_loss_uncond": -17.857722600301106}, "model_output": [{"sum_logits": -9.924104690551758, "num_tokens": 5, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -27.95193099975586, "logits_per_token": -1.9848209381103517, "logits_per_char": -0.35443231037684847, "num_chars": 28}, {"sum_logits": -9.811460494995117, "num_tokens": 5, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -26.49016571044922, "logits_per_token": -1.9622920989990233, "logits_per_char": -0.3504093033926828, "num_chars": 28}, {"sum_logits": -12.254989624023438, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -31.121625900268555, "logits_per_token": -1.7507128034319197, "logits_per_char": -0.3224997269479852, "num_chars": 38}, {"sum_logits": -12.358762741088867, "num_tokens": 7, "num_tokens_all": 201, "is_greedy": false, "sum_logits_uncond": -30.20916748046875, "logits_per_token": -1.7655375344412667, "logits_per_char": -0.32523059844970703, "num_chars": 38}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1156, "native_id": "Mercury_7004725", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 13.356346130371094, "incorrect_loss_raw": 12.727697054545084, "correct_loss_per_char": 0.43084987517326107, "incorrect_loss_per_char": 0.4391464203414988, "correct_loss_per_token": 2.6712692260742186, "incorrect_loss_per_token": 2.2292816797892256, "correct_loss_uncond": -16.857192993164062, "incorrect_loss_uncond": -13.085155804951986}, "model_output": [{"sum_logits": -13.356346130371094, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -30.213539123535156, "logits_per_token": -2.6712692260742186, "logits_per_char": -0.43084987517326107, "num_chars": 31}, {"sum_logits": -16.603530883789062, "num_tokens": 7, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -28.134607315063477, "logits_per_token": -2.3719329833984375, "logits_per_char": -0.5355977704448085, "num_chars": 31}, {"sum_logits": -12.79006576538086, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -25.132259368896484, "logits_per_token": -2.558013153076172, "logits_per_char": -0.39968955516815186, "num_chars": 32}, {"sum_logits": -8.789494514465332, "num_tokens": 5, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -24.17169189453125, "logits_per_token": -1.7578989028930665, "logits_per_char": -0.3821519354115362, "num_chars": 23}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1157, "native_id": "Mercury_405143", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 3, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 14.151432037353516, "incorrect_loss_raw": 16.644865036010742, "correct_loss_per_char": 0.6432469107887961, "incorrect_loss_per_char": 0.815085395475372, "correct_loss_per_token": 2.830286407470703, "incorrect_loss_per_token": 3.8581408500671386, "correct_loss_uncond": -8.302392959594727, "incorrect_loss_uncond": -10.01402473449707}, "model_output": [{"sum_logits": -13.47800064086914, "num_tokens": 4, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -24.449859619140625, "logits_per_token": -3.369500160217285, "logits_per_char": -0.748777813381619, "num_chars": 18}, {"sum_logits": -18.184524536132812, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -25.723350524902344, "logits_per_token": -3.6369049072265627, "logits_per_char": -0.8659297398158482, "num_chars": 21}, {"sum_logits": -14.151432037353516, "num_tokens": 5, "num_tokens_all": 191, "is_greedy": false, "sum_logits_uncond": -22.453824996948242, "logits_per_token": -2.830286407470703, "logits_per_char": -0.6432469107887961, "num_chars": 22}, {"sum_logits": -18.272069931030273, "num_tokens": 4, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -29.80345916748047, "logits_per_token": -4.568017482757568, "logits_per_char": -0.8305486332286488, "num_chars": 22}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1158, "native_id": "MCAS_2003_8_7", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.5941972732543945, "incorrect_loss_raw": 11.42755921681722, "correct_loss_per_char": 0.3828497727711995, "incorrect_loss_per_char": 1.1572242842780218, "correct_loss_per_token": 4.5941972732543945, "incorrect_loss_per_token": 9.662473996480307, "correct_loss_uncond": -8.249516487121582, "incorrect_loss_uncond": -2.941653251647949}, "model_output": [{"sum_logits": -4.5941972732543945, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -12.843713760375977, "logits_per_token": -4.5941972732543945, "logits_per_char": -0.3828497727711995, "num_chars": 12}, {"sum_logits": -10.926352500915527, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -13.320901870727539, "logits_per_token": -10.926352500915527, "logits_per_char": -1.2140391667683919, "num_chars": 9}, {"sum_logits": -12.765813827514648, "num_tokens": 1, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -13.749855041503906, "logits_per_token": -12.765813827514648, "logits_per_char": -1.595726728439331, "num_chars": 8}, {"sum_logits": -10.590511322021484, "num_tokens": 2, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -16.036880493164062, "logits_per_token": -5.295255661010742, "logits_per_char": -0.6619069576263428, "num_chars": 16}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1159, "native_id": "Mercury_SC_405341", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 2, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 33.78582000732422, "incorrect_loss_raw": 24.52864138285319, "correct_loss_per_char": 0.649727307833158, "incorrect_loss_per_char": 0.5315996480604555, "correct_loss_per_token": 3.378582000732422, "incorrect_loss_per_token": 2.7851972724452163, "correct_loss_uncond": -16.81568145751953, "incorrect_loss_uncond": -14.310262680053711}, "model_output": [{"sum_logits": -33.78582000732422, "num_tokens": 10, "num_tokens_all": 192, "is_greedy": false, "sum_logits_uncond": -50.60150146484375, "logits_per_token": -3.378582000732422, "logits_per_char": -0.649727307833158, "num_chars": 52}, {"sum_logits": -22.114212036132812, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -35.34545135498047, "logits_per_token": -2.7642765045166016, "logits_per_char": -0.49142693413628474, "num_chars": 45}, {"sum_logits": -24.717695236206055, "num_tokens": 11, "num_tokens_all": 193, "is_greedy": false, "sum_logits_uncond": -42.92813491821289, "logits_per_token": -2.2470632032914595, "logits_per_char": -0.46637160823030294, "num_chars": 53}, {"sum_logits": -26.754016876220703, "num_tokens": 8, "num_tokens_all": 190, "is_greedy": false, "sum_logits_uncond": -38.243125915527344, "logits_per_token": -3.344252109527588, "logits_per_char": -0.6370004018147787, "num_chars": 42}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1160, "native_id": "Mercury_7283833", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 4.053396701812744, "incorrect_loss_raw": 6.303445657094319, "correct_loss_per_char": 0.2702264467875163, "incorrect_loss_per_char": 0.5214653470987538, "correct_loss_per_token": 4.053396701812744, "incorrect_loss_per_token": 6.303445657094319, "correct_loss_uncond": -9.93309736251831, "incorrect_loss_uncond": -7.590024471282959}, "model_output": [{"sum_logits": -7.936866283416748, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -14.378801345825195, "logits_per_token": -7.936866283416748, "logits_per_char": -0.6614055236180624, "num_chars": 12}, {"sum_logits": -4.053396701812744, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -13.986494064331055, "logits_per_token": -4.053396701812744, "logits_per_char": -0.2702264467875163, "num_chars": 15}, {"sum_logits": -4.856016635894775, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -12.83511734008789, "logits_per_token": -4.856016635894775, "logits_per_char": -0.3468583311353411, "num_chars": 14}, {"sum_logits": -6.1174540519714355, "num_tokens": 1, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -14.46649169921875, "logits_per_token": -6.1174540519714355, "logits_per_char": -0.5561321865428578, "num_chars": 11}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1161, "native_id": "Mercury_7159303", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 5.912652969360352, "incorrect_loss_raw": 11.78082275390625, "correct_loss_per_char": 0.3111922615452817, "incorrect_loss_per_char": 0.6299781656609723, "correct_loss_per_token": 2.956326484680176, "incorrect_loss_per_token": 5.890411376953125, "correct_loss_uncond": -12.248495101928711, "incorrect_loss_uncond": -5.998654683430989}, "model_output": [{"sum_logits": -5.912652969360352, "num_tokens": 2, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -18.161148071289062, "logits_per_token": -2.956326484680176, "logits_per_char": -0.3111922615452817, "num_chars": 19}, {"sum_logits": -11.114168167114258, "num_tokens": 2, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -17.42620277404785, "logits_per_token": -5.557084083557129, "logits_per_char": -0.5557084083557129, "num_chars": 20}, {"sum_logits": -14.691337585449219, "num_tokens": 2, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -19.333871841430664, "logits_per_token": -7.345668792724609, "logits_per_char": -0.7732282939710116, "num_chars": 19}, {"sum_logits": -9.536962509155273, "num_tokens": 2, "num_tokens_all": 238, "is_greedy": false, "sum_logits_uncond": -16.578357696533203, "logits_per_token": -4.768481254577637, "logits_per_char": -0.5609977946561926, "num_chars": 17}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1162, "native_id": "Mercury_406427", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 2, "predicted_index_uncond": 2, "correct_choice": 2, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 12.831721305847168, "incorrect_loss_raw": 17.481128692626953, "correct_loss_per_char": 0.35643670294019913, "incorrect_loss_per_char": 0.5148370929658769, "correct_loss_per_token": 1.8331030436924525, "incorrect_loss_per_token": 2.377628212883359, "correct_loss_uncond": -14.427908897399902, "incorrect_loss_uncond": -10.781606038411459}, "model_output": [{"sum_logits": -12.265380859375, "num_tokens": 7, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -23.90509796142578, "logits_per_token": -1.752197265625, "logits_per_char": -0.3956574470766129, "num_chars": 31}, {"sum_logits": -20.07245635986328, "num_tokens": 7, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -29.131389617919922, "logits_per_token": -2.8674937656947543, "logits_per_char": -0.5903663635253906, "num_chars": 34}, {"sum_logits": -12.831721305847168, "num_tokens": 7, "num_tokens_all": 213, "is_greedy": false, "sum_logits_uncond": -27.25963020324707, "logits_per_token": -1.8331030436924525, "logits_per_char": -0.35643670294019913, "num_chars": 36}, {"sum_logits": -20.105548858642578, "num_tokens": 8, "num_tokens_all": 214, "is_greedy": false, "sum_logits_uncond": -31.75171661376953, "logits_per_token": -2.5131936073303223, "logits_per_char": -0.5584874682956271, "num_chars": 36}], "label": 2, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1163, "native_id": "Mercury_SC_414129", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 10.494051933288574, "incorrect_loss_raw": 14.150089899698893, "correct_loss_per_char": 1.0494051933288575, "incorrect_loss_per_char": 1.0397953745368478, "correct_loss_per_token": 5.247025966644287, "incorrect_loss_per_token": 4.699601491292317, "correct_loss_uncond": -9.041974067687988, "incorrect_loss_uncond": -6.448894500732422}, "model_output": [{"sum_logits": -9.228911399841309, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -16.327280044555664, "logits_per_token": -4.614455699920654, "logits_per_char": -0.838991945440119, "num_chars": 11}, {"sum_logits": -10.494051933288574, "num_tokens": 2, "num_tokens_all": 183, "is_greedy": false, "sum_logits_uncond": -19.536026000976562, "logits_per_token": -5.247025966644287, "logits_per_char": -1.0494051933288575, "num_chars": 10}, {"sum_logits": -14.148110389709473, "num_tokens": 3, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -19.962936401367188, "logits_per_token": -4.716036796569824, "logits_per_char": -1.0883161838238056, "num_chars": 13}, {"sum_logits": -19.0732479095459, "num_tokens": 4, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -25.506736755371094, "logits_per_token": -4.768311977386475, "logits_per_char": -1.1920779943466187, "num_chars": 16}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1164, "native_id": "Mercury_7108990", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.627560615539551, "incorrect_loss_raw": 6.776346842447917, "correct_loss_per_char": 0.736395623948839, "incorrect_loss_per_char": 0.46864968976403915, "correct_loss_per_token": 3.3137803077697754, "incorrect_loss_per_token": 2.2587822808159723, "correct_loss_uncond": -8.483539581298828, "incorrect_loss_uncond": -8.957546552022299}, "model_output": [{"sum_logits": -6.627560615539551, "num_tokens": 2, "num_tokens_all": 178, "is_greedy": false, "sum_logits_uncond": -15.111100196838379, "logits_per_token": -3.3137803077697754, "logits_per_char": -0.736395623948839, "num_chars": 9}, {"sum_logits": -4.94127082824707, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -15.134930610656738, "logits_per_token": -1.6470902760823567, "logits_per_char": -0.3800977560190054, "num_chars": 13}, {"sum_logits": -7.499720573425293, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -15.69559097290039, "logits_per_token": -2.499906857808431, "logits_per_char": -0.4999813715616862, "num_chars": 15}, {"sum_logits": -7.888049125671387, "num_tokens": 3, "num_tokens_all": 179, "is_greedy": false, "sum_logits_uncond": -16.371158599853516, "logits_per_token": -2.629349708557129, "logits_per_char": -0.5258699417114258, "num_chars": 15}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1165, "native_id": "Mercury_SC_407315", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 0, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 31.462783813476562, "incorrect_loss_raw": 35.87076187133789, "correct_loss_per_char": 0.5157833412045338, "incorrect_loss_per_char": 0.6292615274650549, "correct_loss_per_token": 3.1462783813476562, "incorrect_loss_per_token": 3.8580388387044273, "correct_loss_uncond": -17.460315704345703, "incorrect_loss_uncond": -13.092330932617188}, "model_output": [{"sum_logits": -31.462783813476562, "num_tokens": 10, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -48.923099517822266, "logits_per_token": -3.1462783813476562, "logits_per_char": -0.5157833412045338, "num_chars": 61}, {"sum_logits": -35.6833610534668, "num_tokens": 9, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -46.61229705810547, "logits_per_token": -3.964817894829644, "logits_per_char": -0.6152303629908068, "num_chars": 58}, {"sum_logits": -34.452369689941406, "num_tokens": 10, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -50.172950744628906, "logits_per_token": -3.4452369689941404, "logits_per_char": -0.6264067216352983, "num_chars": 55}, {"sum_logits": -37.47655487060547, "num_tokens": 9, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -50.10403060913086, "logits_per_token": -4.164061652289496, "logits_per_char": -0.6461474977690598, "num_chars": 58}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1166, "native_id": "Mercury_SC_408663", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 1, "correct_choice": 1, "acc_raw": 1, "acc_per_token": 1, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 9.284074783325195, "incorrect_loss_raw": 17.3761994043986, "correct_loss_per_char": 0.18204068202598422, "incorrect_loss_per_char": 0.3417204760546178, "correct_loss_per_token": 1.0315638648139105, "incorrect_loss_per_token": 1.9306888227109555, "correct_loss_uncond": -17.793901443481445, "incorrect_loss_uncond": -14.419610023498535}, "model_output": [{"sum_logits": -22.529197692871094, "num_tokens": 9, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -36.24275207519531, "logits_per_token": -2.5032441880967884, "logits_per_char": -0.4505839538574219, "num_chars": 50}, {"sum_logits": -9.284074783325195, "num_tokens": 9, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -27.07797622680664, "logits_per_token": -1.0315638648139105, "logits_per_char": -0.18204068202598422, "num_chars": 51}, {"sum_logits": -14.21003532409668, "num_tokens": 9, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -30.0019588470459, "logits_per_token": -1.5788928137885199, "logits_per_char": -0.2786281436097388, "num_chars": 51}, {"sum_logits": -15.389365196228027, "num_tokens": 9, "num_tokens_all": 204, "is_greedy": false, "sum_logits_uncond": -29.142717361450195, "logits_per_token": -1.7099294662475586, "logits_per_char": -0.2959493306966928, "num_chars": 52}], "label": 1, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1167, "native_id": "MEA_2013_8_18", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 3, "predicted_index_per_char": 3, "predicted_index_uncond": 1, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 8.67662525177002, "incorrect_loss_raw": 12.343549092610678, "correct_loss_per_char": 0.4566644869352642, "incorrect_loss_per_char": 0.4144512573922947, "correct_loss_per_token": 2.169156312942505, "incorrect_loss_per_token": 1.895473559697469, "correct_loss_uncond": -16.429991722106934, "incorrect_loss_uncond": -15.524269739786783}, "model_output": [{"sum_logits": -8.67662525177002, "num_tokens": 4, "num_tokens_all": 182, "is_greedy": false, "sum_logits_uncond": -25.106616973876953, "logits_per_token": -2.169156312942505, "logits_per_char": -0.4566644869352642, "num_chars": 19}, {"sum_logits": -12.545467376708984, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -32.286991119384766, "logits_per_token": -2.0909112294514975, "logits_per_char": -0.5227278073628744, "num_chars": 24}, {"sum_logits": -12.836687088012695, "num_tokens": 6, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -27.23873519897461, "logits_per_token": -2.1394478480021157, "logits_per_char": -0.41408668025847406, "num_chars": 31}, {"sum_logits": -11.648492813110352, "num_tokens": 8, "num_tokens_all": 186, "is_greedy": false, "sum_logits_uncond": -24.077730178833008, "logits_per_token": -1.456061601638794, "logits_per_char": -0.3065392845555356, "num_chars": 38}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1168, "native_id": "Mercury_7111125", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 0, "predicted_index_per_char": 1, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 1, "acc_per_char": 0, "acc_uncond": 1, "correct_loss_raw": 25.917804718017578, "incorrect_loss_raw": 26.04258092244466, "correct_loss_per_char": 0.6645590953337841, "incorrect_loss_per_char": 0.6950981200687468, "correct_loss_per_token": 3.2397255897521973, "incorrect_loss_per_token": 4.91692394680447, "correct_loss_uncond": -18.287105560302734, "incorrect_loss_uncond": -6.480980555216472}, "model_output": [{"sum_logits": -25.917804718017578, "num_tokens": 8, "num_tokens_all": 187, "is_greedy": false, "sum_logits_uncond": -44.20491027832031, "logits_per_token": -3.2397255897521973, "logits_per_char": -0.6645590953337841, "num_chars": 39}, {"sum_logits": -21.029800415039062, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -30.693262100219727, "logits_per_token": -4.205960083007812, "logits_per_char": -0.6008514404296875, "num_chars": 35}, {"sum_logits": -26.243301391601562, "num_tokens": 6, "num_tokens_all": 185, "is_greedy": false, "sum_logits_uncond": -32.02458953857422, "logits_per_token": -4.373883565266927, "logits_per_char": -0.7498086111886161, "num_chars": 35}, {"sum_logits": -30.85464096069336, "num_tokens": 5, "num_tokens_all": 184, "is_greedy": false, "sum_logits_uncond": -34.85283279418945, "logits_per_token": -6.170928192138672, "logits_per_char": -0.7346343085879371, "num_chars": 42}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1169, "native_id": "LEAP_2009_8_10430", "metrics": {"predicted_index_raw": 1, "predicted_index_per_token": 3, "predicted_index_per_char": 0, "predicted_index_uncond": 0, "correct_choice": 0, "acc_raw": 0, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 1, "correct_loss_raw": 37.183998107910156, "incorrect_loss_raw": 38.63021596272787, "correct_loss_per_char": 0.5164444181654189, "incorrect_loss_per_char": 0.6609967965928335, "correct_loss_per_token": 2.8603075467623196, "incorrect_loss_per_token": 3.0970332032375167, "correct_loss_uncond": -21.713966369628906, "incorrect_loss_uncond": -17.452640533447266}, "model_output": [{"sum_logits": -37.183998107910156, "num_tokens": 13, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -58.89796447753906, "logits_per_token": -2.8603075467623196, "logits_per_char": -0.5164444181654189, "num_chars": 72}, {"sum_logits": -34.007667541503906, "num_tokens": 9, "num_tokens_all": 195, "is_greedy": false, "sum_logits_uncond": -52.50194549560547, "logits_per_token": -3.7786297268337674, "logits_per_char": -0.6940340314592633, "num_chars": 49}, {"sum_logits": -38.44427490234375, "num_tokens": 13, "num_tokens_all": 199, "is_greedy": false, "sum_logits_uncond": -55.07111358642578, "logits_per_token": -2.9572519155649037, "logits_per_char": -0.6102265857514881, "num_chars": 63}, {"sum_logits": -43.43870544433594, "num_tokens": 17, "num_tokens_all": 203, "is_greedy": false, "sum_logits_uncond": -60.67551040649414, "logits_per_token": -2.5552179673138786, "logits_per_char": -0.678729772567749, "num_chars": 64}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1170, "native_id": "Mercury_7165218", "metrics": {"predicted_index_raw": 3, "predicted_index_per_token": 2, "predicted_index_per_char": 3, "predicted_index_uncond": 2, "correct_choice": 3, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 1, "acc_uncond": 0, "correct_loss_raw": 3.016491413116455, "incorrect_loss_raw": 6.297064622243245, "correct_loss_per_char": 0.43092734473092215, "incorrect_loss_per_char": 0.9311487742832728, "correct_loss_per_token": 3.016491413116455, "incorrect_loss_per_token": 5.5554845333099365, "correct_loss_uncond": -11.5770583152771, "incorrect_loss_uncond": -8.71239455540975}, "model_output": [{"sum_logits": -5.506818771362305, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -13.811325073242188, "logits_per_token": -5.506818771362305, "logits_per_char": -0.7866883959089007, "num_chars": 7}, {"sum_logits": -8.934894561767578, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -14.00579833984375, "logits_per_token": -8.934894561767578, "logits_per_char": -1.1168618202209473, "num_chars": 8}, {"sum_logits": -4.4494805335998535, "num_tokens": 2, "num_tokens_all": 207, "is_greedy": false, "sum_logits_uncond": -17.211254119873047, "logits_per_token": -2.2247402667999268, "logits_per_char": -0.8898961067199707, "num_chars": 5}, {"sum_logits": -3.016491413116455, "num_tokens": 1, "num_tokens_all": 206, "is_greedy": false, "sum_logits_uncond": -14.593549728393555, "logits_per_token": -3.016491413116455, "logits_per_char": -0.43092734473092215, "num_chars": 7}], "label": 3, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"} +{"doc_id": 1171, "native_id": "MEA_2013_8_15", "metrics": {"predicted_index_raw": 0, "predicted_index_per_token": 1, "predicted_index_per_char": 1, "predicted_index_uncond": 2, "correct_choice": 0, "acc_raw": 1, "acc_per_token": 0, "acc_per_char": 0, "acc_uncond": 0, "correct_loss_raw": 6.2745232582092285, "incorrect_loss_raw": 13.115719318389893, "correct_loss_per_char": 0.6274523258209228, "incorrect_loss_per_char": 0.471288079639961, "correct_loss_per_token": 3.1372616291046143, "incorrect_loss_per_token": 2.3930676645702786, "correct_loss_uncond": -7.44691801071167, "incorrect_loss_uncond": -10.935168743133545}, "model_output": [{"sum_logits": -6.2745232582092285, "num_tokens": 2, "num_tokens_all": 194, "is_greedy": false, "sum_logits_uncond": -13.721441268920898, "logits_per_token": -3.1372616291046143, "logits_per_char": -0.6274523258209228, "num_chars": 10}, {"sum_logits": -7.456120014190674, "num_tokens": 4, "num_tokens_all": 196, "is_greedy": false, "sum_logits_uncond": -18.95738983154297, "logits_per_token": -1.8640300035476685, "logits_per_char": -0.3728060007095337, "num_chars": 20}, {"sum_logits": -13.753981590270996, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -28.338302612304688, "logits_per_token": -2.292330265045166, "logits_per_char": -0.474275227250724, "num_chars": 29}, {"sum_logits": -18.137056350708008, "num_tokens": 6, "num_tokens_all": 198, "is_greedy": false, "sum_logits_uncond": -24.856971740722656, "logits_per_token": -3.0228427251180015, "logits_per_char": -0.5667830109596252, "num_chars": 32}], "label": 0, "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5"}