44import importlib .util
55import numpy as np
66import time
7+ import concurrent .futures
8+ import threading
9+
def run_with_timeout(func, args=(), kwargs=None, timeout_seconds=5):
    """
    Run *func* in a worker thread and wait at most *timeout_seconds* for it.

    Args:
        func: Callable to execute.
        args: Positional arguments to pass to *func*.
        kwargs: Keyword arguments to pass to *func* (None means none).
        timeout_seconds: Maximum time to wait, in seconds.

    Returns:
        Whatever *func* returns.

    Raises:
        TimeoutError: If *func* does not finish within *timeout_seconds*.
            The worker thread cannot be killed; it is abandoned and left to
            finish in the background while the caller continues.
    """
    # Avoid the shared-mutable-default pitfall (original used kwargs={}).
    kwargs = {} if kwargs is None else kwargs
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(func, *args, **kwargs)
        return future.result(timeout=timeout_seconds)
    except concurrent.futures.TimeoutError:
        raise TimeoutError(f"Function {func.__name__} timed out after {timeout_seconds} seconds")
    finally:
        # wait=False: a `with` block (or wait=True) would block right here
        # until the worker finished, defeating the whole point of the timeout.
        executor.shutdown(wait=False)
729
def evaluate(program_path):
    """
    Evaluate an optimization program over multiple timed trials.

    Loads the module at *program_path*, runs its ``run_search()`` function
    ``num_trials`` times (each trial capped by a 5 s timeout), and scores the
    averaged results against the known global minimum of the objective.

    Args:
        program_path: Filesystem path to the Python program to evaluate.

    Returns:
        dict with "value_score", "distance_score", "speed_score",
        "reliability_score", a weighted "combined_score", and
        "success_rate"; on failure, zero scores plus an "error" message.
    """
    # Known global minimum (approximate)
    GLOBAL_MIN_X = -1.76
    GLOBAL_MIN_Y = -1.03
    GLOBAL_MIN_VALUE = -2.104

    try:
        # Load the program as an isolated module
        spec = importlib.util.spec_from_file_location("program", program_path)
        program = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(program)

        # Check if the required entry point exists
        if not hasattr(program, "run_search"):
            print("Error: program does not have 'run_search' function")
            return {
                "value_score": 0.0,
                "distance_score": 0.0,
                "speed_score": 0.0,
                "combined_score": 0.0,
                "error": "Missing run_search function",
            }

        # Run multiple trials
        num_trials = 10
        values = []
        distances = []
        times = []
        success_count = 0

        for trial in range(num_trials):
            try:
                start_time = time.time()

                # Run with timeout
                x, y, value = run_with_timeout(program.run_search, timeout_seconds=5)

                end_time = time.time()

                # Reject NaN/infinite results before scoring
                if (np.isnan(x) or np.isnan(y) or np.isnan(value) or
                        np.isinf(x) or np.isinf(y) or np.isinf(value)):
                    print(f"Trial {trial}: Invalid result, got x={x}, y={y}, value={value}")
                    continue

                # Ensure all values are plain floats
                x, y, value = float(x), float(y), float(value)

                # Distance from the returned point to the known global minimum
                distance_to_global = np.sqrt((x - GLOBAL_MIN_X) ** 2 + (y - GLOBAL_MIN_Y) ** 2)

                values.append(value)
                distances.append(distance_to_global)
                times.append(end_time - start_time)
                success_count += 1

            except TimeoutError as e:
                print(f"Trial {trial}: {str(e)}")
                continue
            except Exception as e:
                print(f"Trial {trial}: Error - {str(e)}")
                continue

        # If all trials failed, return zero scores
        if success_count == 0:
            return {
                "value_score": 0.0,
                "distance_score": 0.0,
                "speed_score": 0.0,
                "combined_score": 0.0,
                "error": "All trials failed",
            }

        # Aggregate over the successful trials only
        avg_value = np.mean(values)
        avg_distance = np.mean(distances)
        avg_time = np.mean(times)

        # Convert to scores (higher is better)
        value_score = 1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE))
        distance_score = 1.0 / (1.0 + avg_distance)
        speed_score = 1.0 / avg_time if avg_time > 0 else 0.0

        # Cap and normalize speed score (so it doesn't dominate)
        speed_score = min(speed_score, 10.0) / 10.0

        # Reliability = fraction of trials that produced a usable result
        reliability_score = success_count / num_trials

        return {
            "value_score": value_score,
            "distance_score": distance_score,
            "speed_score": speed_score,
            "reliability_score": reliability_score,
            "combined_score": (0.5 * value_score + 0.2 * distance_score
                               + 0.1 * speed_score + 0.2 * reliability_score),
            "success_rate": reliability_score,
        }
    except Exception as e:
        print(f"Evaluation failed completely: {str(e)}")
        return {
            "value_score": 0.0,
            "distance_score": 0.0,
            "speed_score": 0.0,
            "combined_score": 0.0,
            "error": str(e),
        }
67147
68148# Stage-based evaluation for cascade evaluation
def evaluate_stage1(program_path):
    """
    Quick first-stage (cascade) check: run a single timed trial.

    Loads the module at *program_path* and runs ``run_search()`` once with a
    5 s timeout, validating that it returns finite numeric results.

    Args:
        program_path: Filesystem path to the Python program to evaluate.

    Returns:
        dict with "runs_successfully" (1.0 = ok, 0.5 = ran but returned
        NaN/inf, 0.0 = failed), plus "value" and "distance" on success or
        an "error" message on failure.
    """
    # Known global minimum (approximate); only the location is needed here
    GLOBAL_MIN_X = -1.76
    GLOBAL_MIN_Y = -1.03

    # Quick check to see if the program runs without errors
    try:
        # Load the program
        spec = importlib.util.spec_from_file_location("program", program_path)
        program = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(program)

        # Check if the required entry point exists
        if not hasattr(program, "run_search"):
            print("Stage 1 validation: Program does not have 'run_search' function")
            return {"runs_successfully": 0.0, "error": "Missing run_search function"}

        try:
            # Run a single trial with timeout
            x, y, value = run_with_timeout(program.run_search, timeout_seconds=5)

            # Ensure all values are plain floats (raises if non-numeric)
            x, y, value = float(x), float(y), float(value)

            # Reject NaN/infinite results
            if np.isnan(x) or np.isnan(y) or np.isnan(value) or np.isinf(x) or np.isinf(y) or np.isinf(value):
                print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}")
                return {"runs_successfully": 0.5, "error": "Invalid result values"}

            # Basic metrics
            return {
                "runs_successfully": 1.0,
                "value": value,
                # Distance to known minimum
                "distance": float(np.sqrt((x - GLOBAL_MIN_X) ** 2 + (y - GLOBAL_MIN_Y) ** 2)),
            }
        except TimeoutError as e:
            print(f"Stage 1 evaluation timed out: {e}")
            return {"runs_successfully": 0.0, "error": "Timeout"}
        except Exception as e:
            print(f"Stage 1 evaluation failed: {e}")
            return {"runs_successfully": 0.0, "error": str(e)}

    except Exception as e:
        # Module load / exec failure
        print(f"Stage 1 evaluation failed: {e}")
        return {"runs_successfully": 0.0, "error": str(e)}
89196
90197def evaluate_stage2 (program_path ):
91198 """Second stage evaluation with more thorough testing"""
0 commit comments