jerpint committed on
Commit
62f8b70
·
1 Parent(s): 682ff2e

evaluate python functions on inputs

Browse files
Files changed (1) hide show
  1. evaluate.py +143 -0
evaluate.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import subprocess
4
+ import pandas as pd
5
+ # from sklearn.manifold import TSNE
6
+
7
+ from generate import get_solution_file_path, all_models
8
+ from openai import OpenAI
9
+ import time
10
+
11
+ import os
12
+ import subprocess
13
+
14
+
15
+
16
+ client = OpenAI()  # module-level OpenAI client, created at import time; no function visible in this section of the file references it
17
+
18
+
19
def evaluate_submission(day: int, model: str):
    """Evaluates the submission for the given day and model.

    Changes into the ``dayXX`` directory, runs the model's solution file with
    ``python`` and captures its output. The working directory that was active
    on entry is always restored before returning, including on the
    missing-file early return and when an exception is raised.

    Args:
        day: Day number; the solution is looked up inside ``day{day:02d}``.
        model: Model name, forwarded to ``get_solution_file_path``.

    Returns:
        ``None`` when the solution file does not exist. Otherwise a dict with:
          - ``"result"``: captured stdout when the process exits with code 0,
            else ``"Error: <stderr>"`` (``"Error: Timeout"`` on timeout).
          - ``"total_time"``: wall-clock seconds spent running the solution.

    Raises:
        OSError: If the ``dayXX`` directory cannot be entered.
    """
    previous_dir = os.getcwd()

    # cd to the day directory
    os.chdir(f"day{day:02d}")
    try:
        # get the solution file path, check if it exists
        # (resolved relative to the day directory we just entered)
        file_path = get_solution_file_path(model=model)
        if not os.path.exists(file_path):
            print(f"File {file_path} does not exist, skipping")
            return None
        print(f"Evaluating {file_path} for day {day} with model {model}")

        # run the solution, and capture the output
        timeout = 60 * 5
        start_time = time.time()
        try:
            result = subprocess.run(
                ["python", file_path],
                capture_output=True,
                text=True,
                timeout=timeout,
            )
            print(f"Result: {result.stdout}")
        except subprocess.TimeoutExpired:
            # Represent a timeout as a failed process so it is reported
            # through the same "Error: ..." path as any other failure.
            result = subprocess.CompletedProcess(
                args=["python", file_path],
                returncode=1,
                stdout="",
                stderr="Timeout",
            )
            print(f"Timeout after {timeout} seconds")
        end_time = time.time()
    finally:
        # Always undo the chdir; otherwise a skipped or crashing evaluation
        # leaves the process inside the day directory and every later
        # relative path is resolved from the wrong place.
        os.chdir(previous_dir)

    total_time = end_time - start_time

    output = result.stdout if result.returncode == 0 else f"Error: {result.stderr}"

    return {
        "result": output,
        "total_time": total_time,
    }
54
+
55
+
56
def get_solution_code(day: int, model: str) -> str:
    """Returns the solution code (as a string) for the given day and model.

    The path is obtained from ``get_solution_file_path(day=day, model=model)``.
    The file is decoded as UTF-8, the default encoding of Python source files,
    rather than with the platform's locale encoding.

    Raises:
        OSError: If the solution file cannot be opened.
    """
    file_path = get_solution_file_path(day=day, model=model)
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()
61
+
62
+
63
def extract_solutions(df, output_file="solutions.json"):
    """Builds the reference answers from the ``jerpint`` rows of ``df``.

    For each day in 1..24 the first ``jerpint`` result is split into its two
    lines (part 1 and part 2). Days with no ``jerpint`` row are skipped
    instead of aborting the whole extraction.

    Args:
        df: DataFrame with at least ``model``, ``day`` and ``result`` columns.
        output_file: Path of the JSON file the solutions are written to.

    Returns:
        Dict mapping day (int) to ``[part1, part2]``. In the JSON file the
        keys are written as strings.

    Raises:
        ValueError: If a reference result does not consist of exactly two lines.
    """
    # TODO: better way of getting this?
    solutions = {}
    reference_rows = df[df["model"] == "jerpint"]
    for day in range(1, 25):
        day_results = reference_rows.loc[reference_rows["day"] == day, "result"].to_list()
        if not day_results:
            # Not every day has been evaluated yet; skip rather than crash.
            print(f"No reference result for day {day}, skipping")
            continue

        answers = day_results[0].strip("\n").split("\n")
        if len(answers) != 2:
            raise ValueError(
                f"Expected 2 answer lines for day {day}, got {len(answers)}"
            )
        solutions[day] = answers

    with open(output_file, "w") as f:
        json.dump(solutions, f, indent=2)

    return solutions
75
+
76
+
77
def evaluate_submissions(all_models, results_file="results.csv", skip=True, days=range(1, 11)):
    """Runs the python code and collects their results.

    Args:
        all_models: Dict mapping a provider name to a list of model names.
        results_file: CSV file used both to load earlier results and to save
            the updated results after each evaluation.
        skip: When true, (day, model) pairs already present in the results
            are not evaluated again.
        days: Iterable of day numbers to evaluate. Defaults to days 1-10.

    Returns:
        DataFrame with columns ``day``, ``model``, ``result`` and
        ``total_time``, containing the previous and the new results.
    """

    if os.path.exists(results_file):
        df = pd.read_csv(results_file)
    else:
        df = pd.DataFrame(columns=["day", "model", "result", "total_time"])

    for day in days:
        print("*" * 80)
        print(f"Evaluating day {day}")
        for provider in all_models:
            for model in all_models[provider]:
                print("-" * 80)

                already_done = ((df["day"] == day) & (df["model"] == model)).any()
                if already_done and skip:
                    print(f"Skipping {provider} {model} for day {day} because it already exists")
                    continue

                print(f"Evaluating day {day} with model {model}")
                result = evaluate_submission(day, model)
                if result is None:
                    # evaluate_submission returns None when the solution file
                    # is missing; there is nothing to record for this pair.
                    continue

                new_row = pd.DataFrame(
                    {
                        "day": [day],
                        "model": [model],
                        "result": [result["result"]],
                        "total_time": [result["total_time"]],
                    }
                )
                df = pd.concat([df, new_row], ignore_index=True)

                # Save after every evaluation so an interrupted run can resume,
                # writing to the same file the results were loaded from.
                df.to_csv(results_file, index=False)
                print("-" * 80)
        print("*" * 80)
    return df
105
+
106
+
107
+
108
if __name__ == "__main__":
    # Script entry point: evaluate every model (plus the human reference),
    # score each result against solutions.json and print stars per model.
    all_models["human"] = ["jerpint"]
    df = evaluate_submissions(all_models, results_file="results.csv")

    # For now, only evaluate first 9 days
    # TODO: All days
    # .copy() gives an independent frame, so the score columns added below
    # are not assigned onto a slice of the full results.
    df = df[df.day < 10].copy()

    # Run once to save results
    # solutions = extract_solutions(df)

    with open("solutions.json") as f:
        solutions = json.load(f)

    def score_submissions(row):
        """Returns [part 1 correct, part 2 correct] for one results row.

        A part counts as correct when its expected answer appears as a
        substring of the captured result.
        """
        result = row["result"]
        # An empty stdout is read back from the CSV as NaN (a float), which
        # does not support `in`; treat any non-string result as no output.
        if not isinstance(result, str):
            result = ""

        # solutions.json stores its day keys as strings.
        solution = solutions[str(int(row["day"]))]

        score_1 = solution[0] in result
        score_2 = solution[1] in result
        return [score_1, score_2]

    df["scores"] = df.apply(score_submissions, axis=1)

    df["part_1"] = df["scores"].apply(lambda x: x[0])
    df["part_2"] = df["scores"].apply(lambda x: x[1])

    for model in df.model.unique():
        df_model = df[df.model == model]
        silver_stars = df_model.part_1.sum()
        gold_stars = df_model.part_2.sum()
        total_stars = silver_stars + gold_stars

        print(model, total_stars)