88from datasets import load_dataset
99from tqdm import tqdm
1010
11- client = OpenAI (api_key = "none" , base_url = "http://localhost:8000/v1" )
12- SLEEP_INTERVAL = 60
11+ client = OpenAI (api_key = os .environ .get ("OPENAI_API_KEY" ), base_url = "http://localhost:8000/v1" )
12+ # client = OpenAI()
13+ SLEEP_INTERVAL = 300
1314
1415def load_existing_results (filename : str ) -> List [Dict ]:
1516 try :
@@ -29,7 +30,6 @@ def get_last_processed_index(results: List[Dict]) -> int:
2930 return - 1
3031 return max (int (r .get ('index' , - 1 )) for r in results )
3132
32-
3333def generate_llm_prompt (prompt : str , wiki_links : List [str ]) -> str :
3434 return f"Here are the relevant Wikipedia articles:\n { wiki_links } \n \n Based on all the information, answer the query. \n \n Query: { prompt } \n \n "
3535
@@ -54,7 +54,7 @@ def evaluate_response(question: str, llm_response: str, ground_truth: str, model
5454response. Please analyze the provided data and make a decision.
5555===Instructions===
56561. Carefully compare the "Predicted Answer" with the "Ground Truth Answer".
57- 2. Consider the substance of the answers – look for equivalent information or correct answers.
57+ 2. Consider the substance of the answers - look for equivalent information or correct answers.
5858Do not focus on exact wording unless the exact wording is crucial to the meaning.
59593. Your final decision should be based on whether the meaning and the vital facts of the
6060"Ground Truth Answer" are present in the "Predicted Answer:"
@@ -122,8 +122,8 @@ def main(model: str):
122122 }
123123
124124 save_result (filename , result )
125- print (f"Index: { index } , Decision: { result ['evaluation_decision' ]} " )
126- time .sleep (SLEEP_INTERVAL )
125+ # print(f"Index: {index}, Decision: {result['evaluation_decision']}")
126+ # time.sleep(SLEEP_INTERVAL)
127127
128128 # Calculate and print summary statistics
129129 results = load_existing_results (filename )
0 commit comments