diff --git a/.gitignore b/.gitignore index 7fac77a..f5a2ea5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,12 @@ -apla-planner/generated_pddls_deepseek/dataset/* -apla-planner/generated_pddls_deepseek/logs/* +apla-planner/* generated_pddls/* parsed_womdr_data/* pddl-examples/* *.pddl __pycache__/* -apla-planner/generated_pddls_deepseek/.DS_Store +v3-grades/* +plt-graph-v3/* +cot_file_size_graphs/* +abc_bar_graph/* grades/* +plt-graph/* diff --git a/apla-planner/generated_pddls_deepseek/planner_test.py b/apla-planner/generated_pddls_deepseek/planner_test.py index 2ca08db..3f1fb26 100644 --- a/apla-planner/generated_pddls_deepseek/planner_test.py +++ b/apla-planner/generated_pddls_deepseek/planner_test.py @@ -1,7 +1,6 @@ from jupyddl import AutomatedPlanner # Comment this line along with the other planner lines if running from outside WSL import os import json -import matplotlib.pyplot as plt ## There is one context per scenario. Each context has a corresponding PDDL domain file. ## Each scenario has multiple interactions. Each interaction will have one PDDL problem file. 
@@ -22,11 +21,12 @@ # We will traverse the problem list since there will be only one domain per scenario - plans_for_one_scenario = {} problem_coverage_scores = [] problem_initial_state_sizes = [] print("Scenario ID is {}".format(scenario_folder)) for problem_file_name in problems_within_scenario: + print(f"Considering problem file {problem_file_name}") + plans_for_one_scenario = {} problem_full_path = "dataset/problems/"+scenario_folder+"/"+problem_file_name domain_full_path = "dataset/domains/"+scenario_folder+"/"+domains_within_scenario[0] print("Planner is now running for the problem {}".format(problem_file_name)) @@ -39,6 +39,5 @@ except: continue - - with open("dataset/problems/"+scenario_folder+"/plan_set.json", 'w') as file: - json.dump(plans_for_one_scenario, file) + with open("dataset/problems/"+scenario_folder+"/"+problem_file_name+"_"+"plan_set.json", 'w') as file: + json.dump(plans_for_one_scenario, file) diff --git a/basic_scenario_gpt.py b/basic_scenario_gpt.py new file mode 100644 index 0000000..e4a24d6 --- /dev/null +++ b/basic_scenario_gpt.py @@ -0,0 +1,194 @@ +from guidance import models, gen, user, assistant, system +import parse_scenario_womd +import json + + +def generate_scenario_concepts(granularity, scenario_data): + gpt_scenario = models.OpenAI(model="gpt-4o", echo=False) + + with system(): + lm_scenario = gpt_scenario + + with user(): + lm_scenario += f""" + Think deeply about scenarios for testing autonomous vehicles. + + I need some states of the world that would be relevant for logically describing this traffic scenario: + {scenario_data} + + A state is just an assertion with a true or false value that's representing the world in that particular moment. + This is similar to the concept of a turn in a turn based game. + + There must be states regarding the following concepts: + * Static environment description. + * Ego agent + * The respective surrounding agents. 
+ + In each action and state, the ego agent or the surrounding agent must be identified as or or as needed. + + Increase the granularity of the concepts in proportion to the granularity level. + The granularity level is {str(granularity)} on a scale of 1 to 10 with 1 being the least and 10 being the most granular + Granularity pertains to how specific the information is. + + Make sure to rewrite the concepts given in the generated list of concepts in addition to your concepts. + """ + + with assistant(): + lm_scenario += gen("concepts", temperature=0.5) + + print("The scenario concepts are {}".format(lm_scenario["concepts"])) + return lm_scenario["concepts"] + +def generate_scenario_states(concepts): + gpt_scenario = models.OpenAI(model="gpt-4o", echo=False) + + with system(): + lm_scenario = gpt_scenario + + with user(): + lm_scenario += f""" + Based on the concepts detailed in {concepts}, + Write down a list of states pertaining to these concepts in natural language. Write them in the following format: + ```json + + "": + "statement": " + , + "": + "statement": ", + , + ... + + json``` + + Be very very very specific. + """ + + with assistant(): + lm_scenario += gen("state_dictionary", temperature=0.5) + + return lm_scenario["state_dictionary"] + +def generate_scenario_actions(concepts, granularity=2): + gpt_scenario = models.OpenAI(model="gpt-4o", echo=False) + + with system(): + lm_scenario = gpt_scenario + + with user(): + lm_scenario += f""" + Based on the concepts detailed in {concepts}, + * Write down a list of actions that map between these states in natural language. + * Each action has some causal states (predicates) and some effect states that will be true or false. + * Each action is a cause and effect mapping between any number of causal states and any number of effect states. + * Actions and states must not contradict each other. + * Action names must be descriptive and the action can be understood just by looking at the name. 
+ * The state names within each action are also descriptive. The cause and effect statements and the state names must have the same information. + * There must be separate states regarding the environment, ego and the respective surrounding agents. + * In each action and state, the ego agent or the surrounding agent must be identified as or or as needed. + * For distances, positions and speeds do not use specific numbers but words instead such as front, left, right, near, far, fast, slow, medium (or combinations such as front-left and so on) or other similar descriptive words. + * The action itself will only become true when the causal states and the effect states are in the specific states that this description details. + * Write them in the following format: + ```json + + "": + + "": + "statement": " + "value": , + "state_type": + , + "": + "statement": " + "value": , + "state_type": + + , + ... + + json``` + + Increase the granularity of these actions in proportion to the granularity level. + Granularity pertains to how specific the information is. + While the actions must be relevant to the given scenario, they must be general enough to be used for other scenarios as well. + The granularity level is {str(granularity)} on a scale of 1 to 10 with 1 being the least and 10 being the most granular + + """ + + with assistant(): + lm_scenario += gen("action_dictionary", temperature=0.8) + + print("The scenario actions are {}".format(lm_scenario["action_dictionary"])) + return lm_scenario["action_dictionary"] + +# # Removed from this project after consideration +# def generate_scenario_states(concepts): +# gpt_scenario = models.OpenAI(model="gpt-4o", echo=False) + +# with system(): +# lm_scenario = gpt_scenario + +# with user(): +# lm_scenario += f""" +# Based on the concepts detailed in {concepts}, +# Write down a list of states pertaining to these concepts in natural language. 
Write them in the following format: +# ```json +# +# "": +# "statement": " +# , +# "": +# "statement": ", +# , +# ... +# +# json``` + +# Be very very very specific and granular. Very granualar, fine details and specific. +# """ + +# with assistant(): +# lm_scenario += gen("state_dictionary", temperature=0.8) + +# return lm_scenario["state_dictionary"] + +def respond_scenario_query(concepts, actions, questions): + gpt_scenario = models.OpenAI(model="gpt-4o", echo=False) + + with system(): + lm_scenario = gpt_scenario + + with user(): + lm_scenario += f""" + Based on the concepts detailed in {concepts} and actions detailed in {actions}, respond to the following questions: + {questions} + Be very specific and very granular. Very granual, fine details and specific. + """ + + with assistant(): + lm_scenario += gen("scenario_response", temperature=0.8) + + #print("The scenario responses are {}".format(lm_scenario["scenario_response"])) + return lm_scenario["scenario_response"] + +def evaluate_gpt(question): + gpt_scenario = models.OpenAI(model="gpt-4o-mini", echo=False) + + with system(): + lm_scenario = gpt_scenario + + with user(): + lm_scenario += f""" + Given the questions here: + {question} + + Choose the correct answer. Only mention the option. 
+ """ + + with assistant(): + lm_scenario += gen("mcq_response", temperature=0.5) + + #print("The scenario responses are {}".format(lm_scenario["scenario_response"])) + return lm_scenario["mcq_response"] + + \ No newline at end of file diff --git a/box_plots.py b/box_plots.py new file mode 100644 index 0000000..77b9e16 --- /dev/null +++ b/box_plots.py @@ -0,0 +1,34 @@ +import seaborn as sns +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np + +#manually changed indices and scores for each graph +exp_and_scores = { + 'Zero-Shot': [6, 3, 2, 1, 3, 2, 8, 8, 2, 10, 10, 3, 10, 2, 2, 8, 7, 4, 2, 5, 10, 10, 4, 10, 3, 3, 10, 2, 1, 2, 9, 10, 8, 8, 3, 10, 6, 3, 6, 8, 8, 2, 6, 6, 8, 3, 2, 5, 8, 2, 10, 1, 8, 10, 6, 7, 7, 8, 8, 5, 6, 10, 10, 4], + 'Two-Shot': [4, 10, 10, 10, 10, 4, 10, 8, 8, 10, 10, 2, 2, 10, 4, 8, 7, 10, 10, 10, 10, 8, 10, 10, 10, 10, 10, 10, 10, 10, 9, 10, 6, 8, 4, 5, 4, 2, 6, 3, 9, 7, 7, 4, 10, 6, 10, 10, 10, 10, 10, 5, 6, 4, 8, 3, 10, 10, 10, 10, 10, 10, 10, 3], + 'Four-Shot': [3, 10, 1, 2, 2, 2, 8, 9, 9, 10, 10, 2, 2, 10, 3, 9, 6, 10, 8, 10, 10, 10, 5, 10, 10, 10, 10, 10, 10, 10, 7, 10, 6, 5, 3, 4, 6, 3, 3, 2, 7, 2, 9, 4, 10, 2, 10, 10, 10, 10, 10, 10, 5, 3, 4, 2, 8, 10, 10, 10, 10, 10, 8, 2], + 'Six-Shot': [4, 10, 1, 10, 3, 4, 6, 10, 10, 10, 8, 2, 2, 10, 3, 7, 6, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 10, 7, 4, 2, 8, 4, 3, 4, 6, 8, 4, 10, 3, 9, 6, 10, 10, 10, 8, 10, 2, 5, 6, 3, 8, 5, 10, 9, 10, 10, 10, 10, 3] +} + +data = [] +for experiment, scores in exp_and_scores.items(): + score_array = np.array(scores) + sorted_array = np.sort(score_array) + for individual_score in sorted_array: + data.append({'CoT Prompting Style': experiment, 'Correctness Scores': individual_score}) + +df = pd.DataFrame(data) +sns.set_theme(style="whitegrid") +plt.figure(figsize=(10, 6)) +#creates the box plots +ax = sns.boxplot(x='CoT Prompting Style', y='Correctness Scores', data=df, width=0.5, fliersize=0) +#inserting data points 
+sns.stripplot(x='CoT Prompting Style', y='Correctness Scores', data=df, jitter=0.23, color='black', size=6, alpha=0.7) +for i, (experiment, scores) in enumerate(exp_and_scores.items()): + q1_label = np.percentile(scores, 25) + ax.text(i, q1_label, f'Q1: {q1_label:.2f}', ha = 'center', va = 'bottom', color = 'white', fontsize = 12) + +plt.ylim(0, 11) +plt.title('Zero, Two, Four, and Six-Shot CoT Prompting Score Distribution for Scenarios of Large Files') +plt.show() \ No newline at end of file diff --git a/client_model_setup.py b/client_model_setup.py index 6010dd6..815429d 100644 --- a/client_model_setup.py +++ b/client_model_setup.py @@ -9,7 +9,7 @@ class ProvidedLLM(): def __init__(self): self.client_oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) self.client_deepinfra = OpenAI(api_key=os.environ["DEEPINFRA_API_KEY"], base_url="https://api.deepinfra.com/v1/openai") - self.client_dsapi = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com") + #self.client_dsapi = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com") # The following are model names for DS models provided via their own API service. self.ds_v3_dsapi = "deepseek-chat" @@ -18,6 +18,7 @@ def __init__(self): # The following are model names for Large DeepInfra provided models self.ds_v3 = "deepseek-ai/DeepSeek-V3" self.ds_r1 = "deepseek-ai/DeepSeek-R1" # This model thinks. 
Cannot use for json output + self.ds_r1_turbo = "deepseek-ai/DeepSeek-R1-Turbo" self.llama_33_70b = "meta-llama/Llama-3.3-70B-Instruct-Turbo" self.llama_31_405b = "meta-llama/Meta-Llama-3.1-405B-Instruct" self.qw_25_72b = "Qwen/Qwen2.5-72B-Instruct" @@ -32,6 +33,8 @@ def __init__(self): # The following are the small model names for models provided via the OpenAI API service self.gpt_4o_mini = "gpt-4o-mini" self.o3_mini = "o3-mini" + self.gpt_45 = "gpt-4.5-preview" + self.gpt_41 = "gpt-4.1" self.model_dictionary = { "openai_models": [self.gpt_4o_mini, self.o3_mini], @@ -53,7 +56,11 @@ def non_thinking_llm_call(self, client, model, prompt): # DS api reasoner doesn't send think tags so no need for this function. # Deepinfra thinking models send these tags so this function is needed. def thinking_llm_call(self, client, model, prompt): - output = self.llm_call(client=client, model=model, prompt=prompt) + output_content = client.chat.completions.create(model=model, + messages=[{"role": "user", "content": prompt}], + stream=False + ) + output = output_content.choices[0].message.content separated_string = re.split(r"()", output) separated_string_thoughts = re.split(r"()", separated_string[0]) separated_string_output = separated_string[2] @@ -63,7 +70,7 @@ def thinking_llm_call(self, client, model, prompt): def llm_call(self, client, model, prompt): output = "" thoughts = "" - if (model==self.ds_r1) or (model==self.ds_distil_llama_70b): + if (model==self.ds_r1) or (model==self.ds_distil_llama_70b) or (model==self.ds_r1_turbo): output, thoughts = self.thinking_llm_call(client, model, prompt) else: output = self.non_thinking_llm_call(client, model, prompt) diff --git a/llm_qa.py b/llm_qa.py index 1c2192a..5199aa9 100644 --- a/llm_qa.py +++ b/llm_qa.py @@ -3,35 +3,13 @@ import os import json -import matplotlib.pyplot as plt import planner # Comment out any function calls within this. 
from openai import OpenAI - -########### ============ Global initializations ====================== ########## -domain_folder_list = os.listdir('apla-planner/generated_pddls_deepseek/dataset/domains') -problem_folder_list = os.listdir('apla-planner/generated_pddls_deepseek/dataset/problems') -client_oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) -client_deepseek = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com") -client_deepinfra = OpenAI(api_key=os.environ["DEEPINFRA_API_KEY"], base_url="https://api.deepinfra.com/v1/openai") -scenario_domain_and_problem_data = planner.retrieve_womdr_domain_problem_data() - -model_dictionary = { - "openai_models": ["gpt-4o-mini", "o3-mini"], - "deepinfra_models": ["meta-llama/Meta-Llama-3.1-8B-Instruct", - "microsoft/phi-4", - "Qwen/Qwen2.5-7B-Instruct" - ] -} - -# Generate two lists - domain file list and problem file list for a single scenario -# Reuse code in terms of classes and functions and - -model_outputs = {} -existing_grades = {} -exp_run_qa_scores = [] +from matplotlib import pyplot as plt ######## ================= LLM API calls ====================== ########### def openai_call(model_name, prompt): + client_oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) output = client_oai.chat.completions.create(model=model_name, messages=[{"role": "user", "content": prompt}], stream=False @@ -40,6 +18,7 @@ def openai_call(model_name, prompt): return output_content def deepinfra_call(model_name, prompt): + client_deepinfra = OpenAI(api_key=os.environ["DEEPINFRA_API_KEY"], base_url="https://api.deepinfra.com/v1/openai") output = client_deepinfra.chat.completions.create(model=model_name, messages=[{"role": "user", "content": prompt}], stream=False @@ -48,6 +27,7 @@ def deepinfra_call(model_name, prompt): return output_content def deepseek_call(model_name, prompt): + client_deepseek = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com") output = 
client_deepseek.chat.completions.create(model=model_name, messages=[{"role": "user", "content": prompt}], stream=False @@ -96,7 +76,8 @@ def grade_openai_deepinfra_models_one_interaction(model_dictionary, problem_path, current_plan, scenario_id, - interaction_id): + interaction_id, + scenario_domain_and_problem_data): #### Step 1: Generate the PDDL prompts ======================= ######### context = scenario_domain_and_problem_data[scenario_id]["Context"] @@ -110,64 +91,119 @@ def grade_openai_deepinfra_models_one_interaction(model_dictionary, pddl_problem = file_problem.readlines() direct_prompt = f""" - Here is some information about an autonomous vehicle scenario: + I want you to think step by step carefully and answer questions about autonomous vehicle test scenarios. Each question + has a descriptive answer. + + Here is some contextual information about one specific scenario: {context} - Answer the following question: + Now, given all of this information, think step by step and answer the following question: {question} - Think step by step. Show your reasoning and answer the question. - + Write a short answer only. Think step by step carefully and show your reasoning and how you reached a solution. """ + + #create a set of variables called + #direct_prompt_cot_2shot, direct_prompt_cot_4shot, direct_prompt_cot_6shot, + #direct_prompt_cot_8shot (#shot refers to number of interactions(qa pair) within the same scenario ID/context) + #(chain of thought) before asking the question + #provide examples of q and a pairs. obtain the pairs + #from the results of the parse script. scenario using + #for examples should not be used for evaluation. + # + # + # + #how to create these examples: + #direct_prompt = f""" + # I want you to answer some questions about an autonomous vehicle test scenario. 
Here are some examples for some scenarios: + # + # + # + # Here is some information about an autonomous vehicle scenario: + # {context} + + # Answer the following question: + # {question} + + # Think step by step. Show your reasoning and answer the question. + + # """ pddl_prompt = f""" - Here is some context about the test scenario: + I want you to think step by step carefully and answer questions about autonomous vehicle test scenarios. Each question + has a descriptive answer. + + Here is some contextual information about one specific scenario: {context} - Here is some PDDL domain data: + Here is some generated PDDL (Planning Domain Definition Language) domain data corresponding to this contextual information. This information + is to give you a more specific, explicit sense of the logic behind some of the car driving behaviors + in this scenario: {pddl_domain} - Here is the PDDL problem statement: + Each scenario consists of interactions between vehicles and intentionality regarding driving behavior intent of vehicles. + Regarding one such interaction, here is a PDDL problem statement generated. Connect it carefully with the + domain information given above. + + Here is the PDDL problem data: {pddl_problem} - I ran this through a planner and got the following result: + Since this data is in the PDDL format, I ran this through a planner which carried out + breadth first search to try and answer the PDDL problem data above in light of the PDDL domain + data and got the following result: {current_plan} - Think step by step and answer the following question: + Now, given all of this information, think step by step and answer the following question: {question} - Write a short 2 sentence answer only. Show your reasoning. + Write a short answer only. Think step by step carefully and show your reasoning and how you reached a solution. 
""" #### Step 2: Generate the model grades and add them to the dictionary for model_family in model_dictionary.keys(): if model_family=="openai_models": - for model_name in model_dictionary[model_family]: + for model_name in model_dictionary[model_family].keys(): + print("Model name is {}".format(model_name)) grading_prompt = prepare_grading_prompt(context=context, question=question, answer=answer, model_output=openai_call(model_name=model_name, prompt=pddl_prompt)) - grading_output = eval(deepinfra_call(model_name="deepseek-ai/DeepSeek-V3", prompt=grading_prompt)) + grading_output = eval(deepseek_call(model_name="deepseek-chat", prompt=grading_prompt)) + print(grading_output) existing_grades[scenario_id][interaction_id].setdefault( - model_family+"_"+model_name+"_with_plan", grading_output + model_family+"_"+model_name+"_modelname", grading_output ) - existing_grades[scenario_id][interaction_id].setdefault("problem_score_avg", ((grading_output["Correctness score"] + grading_output["Faithfulness score"])/2)) + avg_score = (int(grading_output["Correctness score"]) + int(grading_output["Faithfulness score"]))/2 + + existing_grades[scenario_id][interaction_id][model_family+"_"+model_name+"_modelname"].setdefault("problem_score_avg", (str(avg_score))) + model_dictionary[model_family][model_name].append((str(avg_score))) elif model_family=="deepinfra_models": - for model_name in model_dictionary[model_family]: + for model_name in model_dictionary[model_family].keys(): + print("Model name is {}".format(model_name)) grading_prompt = prepare_grading_prompt(context=context, question=question, - answer=answer, model_output=deepinfra_call(model_name=model_name, prompt=pddl_prompt)) + answer=answer, model_output=deepinfra_call(model_name=model_name, prompt=pddl_prompt)) #when creating new var replace pddl prompt w my var name (ie 4shot) grading_output = eval(deepinfra_call(model_name="deepseek-ai/DeepSeek-V3", prompt=grading_prompt)) 
existing_grades[scenario_id][interaction_id].setdefault( - model_family+"_"+model_name+"_with_plan", grading_output + model_family+"_"+model_name+"_with_plan", grading_output #replace _with_plan with _for_(insert variable name) ) existing_grades[scenario_id][interaction_id].setdefault("problem_score_avg", ((grading_output["Correctness score"] + grading_output["Faithfulness score"])/2)) - + print("Retrieving grades") -def pddl_response_and_answer_questions(domain_path, problem_path, current_plan, eval_folder): +def pddl_response_and_answer_questions(domain_path, + problem_path, + current_plan, + eval_folder, + scenario_domain_and_problem_data, + existing_grades, + model_dictionary): # Parse through the preprocessed json data contained in parsed_womdr_data/ + print(f"Data is {scenario_domain_and_problem_data}") for scenario_id in scenario_domain_and_problem_data.keys(): existing_grades.setdefault(scenario_id, {}) for interaction_id in scenario_domain_and_problem_data[scenario_id]["Interactions"].keys(): + print(f"Interaction considered is {interaction_id}") + print(f"Domain path is {domain_path}") + print(f"Problem path is {problem_path}") existing_grades[scenario_id].setdefault(interaction_id, {}) if (scenario_id in domain_path) and (interaction_id in problem_path): print("Scenario ID that matches is {}".format(scenario_id)) @@ -176,29 +212,50 @@ def pddl_response_and_answer_questions(domain_path, problem_path, current_plan, print("Evaluation file is {}".format(eval_complete_path)) ##### ===================== Automatic model evaluation with LLM grades on outputs ============== ######### - grade_openai_deepinfra_models_one_interaction(model_dictionary=model_dictionary, + existing_grades = grade_openai_deepinfra_models_one_interaction(model_dictionary=model_dictionary, existing_grades=existing_grades, domain_path=domain_path, problem_path=problem_path, current_plan=current_plan, scenario_id=scenario_id, - interaction_id=interaction_id) + interaction_id=interaction_id, 
+ scenario_domain_and_problem_data=scenario_domain_and_problem_data) #Ensure that this json file by the name grades/deepseek_grades.json exists first. + print("Editing grades") with open("grades/deepseek_grades.json", 'w') as grade_file: with open(eval_complete_path, 'r') as eval_file: data = json.load(eval_file) existing_grades[scenario_id][interaction_id].setdefault("LLM_eval_problem_grade", data["Problem coverage"]["Grade"]) existing_grades[scenario_id][interaction_id].setdefault("LLM_eval_context_word_count", data["average_context_sentence_word_count"]) qa_interaction_score = existing_grades[scenario_id][interaction_id]["problem_score_avg"]*existing_grades[scenario_id][interaction_id]["LLM_eval_problem_grade"] - existing_grades[scenario_id][interaction_id].setdefault("qa_interaction_score", qa_interaction_score) - exp_run_qa_scores.append(qa_interaction_score) + existing_grades[scenario_id][interaction_id].setdefault("qa_interaction_score", qa_interaction_score) print("Existing grades is given by {}".format(existing_grades)) json.dump(existing_grades, grade_file, indent=4) grade_file.close() -def main(): +def run_evaluations(): # Recover the PDDL domain file, PDDL problem file for a particular scenario and plan file. 
+ domain_folder_list = os.listdir('apla-planner/generated_pddls_deepseek/dataset/domains') + problem_folder_list = os.listdir('apla-planner/generated_pddls_deepseek/dataset/problems') + scenario_domain_and_problem_data = planner.retrieve_womdr_domain_problem_data() + + model_dictionary = { + "openai_models": { + "o3-mini": [] + }, + "deepinfra_models": { + "meta-llama/Meta-Llama-3.1-8B-Instruct": [] + } + } + + # Generate two lists - domain file list and problem file list for a single scenario + # Reuse code in terms of classes and functions and + + model_outputs = {} + existing_grades = {} + + for scenario_folder in domain_folder_list: #Scores for multiple problems (where each problem corresponds to one interaction) within one scenario #These grades add up to help us evaluate across all scenarios @@ -219,8 +276,8 @@ def main(): for problem_file_name in problems_within_scenario: # If PDDL problem file has been found, then open the plan file and find out the evaluations. # There should a plan file by the name of plan_set.json in each problem folder. - # Run this after the PDDL problem generation and the plan generation has been done. - if ".pddl" in problem_file_name: + # Run this after the PDDL problem generation and the plan generation has been done. 
+ if ".pdd"==problem_file_name[-5:-1]: pddlproblem_file_name = problem_file_name print("PDDL problem file name is {}".format(pddlproblem_file_name)) print("problem file name is {}".format(pddlproblem_file_name)) @@ -229,7 +286,7 @@ def main(): problem_full_path = "apla-planner/generated_pddls_deepseek/dataset/problems/"+scenario_folder+"/"+pddlproblem_file_name domain_full_path = "apla-planner/generated_pddls_deepseek/dataset/domains/"+scenario_folder+"/"+domains_within_scenario[0] - planfile_full_path = "apla-planner/generated_pddls_deepseek/dataset/problems/"+scenario_folder+"/"+plan_file_name + planfile_full_path = "apla-planner/generated_pddls_deepseek/dataset/problems/"+scenario_folder+"/"+pddlproblem_file_name+"_"+plan_file_name eval_folder = "apla-planner/generated_pddls_deepseek/dataset/problems/"+scenario_folder+"/" with open(planfile_full_path, 'r') as plan_file: @@ -237,15 +294,23 @@ def main(): try: current_problem_plan = plan_data[pddlproblem_file_name] print("Current problem plan is {}".format(current_problem_plan)) - pddl_response_and_answer_questions(domain_path=domain_full_path, + existing_grades = pddl_response_and_answer_questions(domain_path=domain_full_path, problem_path=problem_full_path, - current_plan=current_problem_plan, eval_folder=eval_folder) + current_plan=current_problem_plan, + eval_folder=eval_folder, + scenario_domain_and_problem_data=scenario_domain_and_problem_data, + model_dictionary=model_dictionary, + existing_grades=existing_grades) except: continue else: pddlproblem_file_name = "" - print("For this exp run, the final qa scores are {}".format(exp_run_qa_scores)) - - plt.bar([i for i in range(len(exp_run_qa_scores))], exp_run_qa_scores) - plt.show() + # After everything is done, plot a single bar chart per model output + for model_provider in model_dictionary.keys(): + for model in model_dictionary[model_provider].keys(): + plt.bar([i for i in range(len(model_dictionary[model_provider][model]))], 
[float(model_dictionary[model_provider][model][i]) for i in range(len(model_dictionary[model_provider][model]))]) + plt.title(f"{model}") + plt.xlabel("Interaction number (across all scenarios)") + plt.ylabel("Average correctness/faithfulness scores") + plt.show() diff --git a/llm_qa_direct_only.py b/llm_qa_direct_only.py index 7756356..17291c8 100644 --- a/llm_qa_direct_only.py +++ b/llm_qa_direct_only.py @@ -3,16 +3,28 @@ import os import json -import matplotlib.pyplot as plt -import planner # Comment out any function calls within this. +import matplotlib.pyplot as plt +import planner from openai import OpenAI +from parsed_data_retrieval import retrieve_parsed_data +import argparse +from tqdm import tqdm + ########### ============ Global initializations ====================== ########## parsed_file_list = os.listdir("parsed_womdr_data/") client_oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) -client_deepseek = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com") +#client_deepseek = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com") client_deepinfra = OpenAI(api_key=os.environ["DEEPINFRA_API_KEY"], base_url="https://api.deepinfra.com/v1/openai") -scenario_domain_and_problem_data = planner.retrieve_womdr_domain_problem_data() +scenario_domain_and_problem_data = retrieve_parsed_data() + +parser = argparse.ArgumentParser(prog="Cruzway Reasoner 2.0", + description="Evaluate LLMs for autonomous vehicle test scenario reasoning") +parser.add_argument("-ns", "--nshot", type=str) +parser.add_argument("-sindex", "--scenario_index", type=int) +parser.add_argument("-gt", "--ground_truth", default="True", type=str) +parser.add_argument("-basic", "--enable_basic_refined_prompt", default="False", type=str) +parser.add_argument("-sqa", "--enable_lecturing_refined_prompt", default="False", type=str) # The following are model names for DeepInfra provided models # "deepseek-ai/DeepSeek-V3" @@ -31,27 +43,31 @@ # 
"o3-mini" model_dictionary = { - "openai_models": { - "gpt-4o-mini": [] - }, - "deepinfra_models":{ - - } -} - + "openai_models": { + "gpt-4o-mini": [] + }, + "deepinfra_models": { + } + } # Generate two lists - domain file list and problem file list for a single scenario # Reuse code in terms of classes and functions and model_outputs = {} -existing_grades = {} scenario_qa_score = {} ######## ================= LLM API calls ====================== ########### def openai_call(model_name, prompt): - output = client_oai.chat.completions.create(model=model_name, - messages=[{"role": "user", "content": prompt}], - stream=False - ) + if(model_name=="o4-mini"): + output = client_oai.chat.completions.create(model=model_name, + messages=[{"role": "user", "content": prompt}], + stream=False, + reasoning_effort="high" + ) + else: + output = client_oai.chat.completions.create(model=model_name, + messages=[{"role": "user", "content": prompt}], + stream=False + ) output_content = output.choices[0].message.content return output_content @@ -63,13 +79,6 @@ def deepinfra_call(model_name, prompt): output_content = output.choices[0].message.content return output_content -def deepseek_call(model_name, prompt): - output = client_deepseek.chat.completions.create(model=model_name, - messages=[{"role": "user", "content": prompt}], - stream=False - ) - output_content = output.choices[0].message.content - return output_content ################# ============== QA prompts ===================== def generate_qa_prompt(context, question, answer, prompt_type="4shot"): direct_prompt = f""" @@ -81,139 +90,439 @@ def generate_qa_prompt(context, question, answer, prompt_type="4shot"): Think step by step. Show your reasoning and answer the question. + """ + #Reference for this prompt is scenario ID 10471914b8bb79a1 using interactions 0 and 7 + direct_cot_prompt_2shot = f""" + I want you to answer some questions from the world of autonomous vehicle testing. 
+ + First, some information about the context: "Can you describe the type of intersection present in the current driving scenario? The intersection is a 4 way intersection.What is the status of the traffic light for the ego agent at this moment? The traffic light for the ego agent is green.Is there any information about the presence of stop signs, crosswalks, or speed bumps in the current scenario? There is no information about stop signs, crosswalks, or speed bumps in the current scenario.What is the ego agent's current action within the intersection? The ego agent is turning left and exiting the intersection.Could you specify the ego agent's current speed and whether it is increasing or decreasing? The ego agent's current speed is 13 m/s and it is accelerating.How does the current traffic light affect the ego agent's movement? The traffic light is green, which allows the ego agent to proceed.What type of agent is surrounding agent #0 and what is its current motion status? Surrounding agent #0 is a vehicle and it is accelerating.How is surrounding agent #0 positioned relative to the ego agent and the intersection? Surrounding agent #0 is on the left of the ego agent, in front of it, and is departing from the intersection.What is the current speed of surrounding agent #0? The current speed of surrounding agent #0 is 8 m/s.What type of agent is surrounding agent #1 and what is its current motion status? Surrounding agent #1 is a vehicle and it is not moving.How is surrounding agent #1 positioned relative to the ego agent and the intersection? Surrounding agent #1 is on the left of the ego agent, in front of it, and is heading towards the intersection.What type of agent is surrounding agent #2 and what is its current motion status? Surrounding agent #2 is a vehicle and it is decelerating.How is surrounding agent #2 positioned relative to the ego agent and the intersection? 
Surrounding agent #2 is on the left of the ego agent, in front of it, and is heading towards the intersection.What is the current speed of surrounding agent #2? The current speed of surrounding agent #2 is 3 m/s.What type of agent is surrounding agent #4 and what is its current motion status? Surrounding agent #4 is a vehicle and it is not moving.How is surrounding agent #4 positioned relative to the ego agent and the intersection? Surrounding agent #4 is on the left of the ego agent, behind it, and is on the same side of the intersection as the ego agent.What type of agent is surrounding agent #5 and what is its current motion status? Surrounding agent #5 is a vehicle and it is not moving.How is surrounding agent #5 positioned relative to the ego agent and the intersection? Surrounding agent #5 is on the left of the ego agent, behind it, and is departing from the intersection.What type of agent is surrounding agent #6 and what is its current motion status? Surrounding agent #6 is a vehicle and it is moving at a constant speed.How is surrounding agent #6 positioned relative to the ego agent and the intersection? Surrounding agent #6 is on the left of the ego agent, behind it, and is heading towards the intersection.Is there any traffic control affecting surrounding agent #6? Surrounding agent #6 is approaching a crosswalk 4 meters ahead.What type of agent is surrounding agent #7 and what is its current motion status? Surrounding agent #7 is a vehicle and it is not moving.How is surrounding agent #7 positioned relative to the ego agent and the intersection? Surrounding agent #7 is on the left of the ego agent, in front of it, and is on the same side of the intersection as the ego agent.What type of agent is surrounding agent #8 and what is its current motion status? Surrounding agent #8 is a vehicle and it is not moving.How is surrounding agent #8 positioned relative to the ego agent and the intersection? 
Surrounding agent #8 is on the left of the ego agent, in front of it, and is on the same side of the intersection as the ego agent." + + Here is the first example of a question being answered: + Question: "What interactions are expected between the ego agent and surrounding agent #0?" + Answer: "Surrounding agent #0 will have no interaction with the ego agent as it is departing from the intersection and their paths do not conflict." + + Here is the second and last example of a question being answered: + Question: "What is the ego agent's plan in the immediate future?" + Answer: "The ego agent intends to complete its left turn and exit the intersection. It will proceed with the turn as the traffic light is green and it has the right of way. Surrounding agents #1 and #2 will yield to the ego agent, and surrounding agent #6 will likely stop at the crosswalk, so the ego agent does not need to alter its course in response to these agents." + + When analyzing driving scenarios, please follow points 1 through 11 written below: + 1) Don't misinterpret or incorrectly assess the key scenario dynamics, agent positions, motion statuses, relative movements, spatial relationships, or right-of-way. Because of the aforementioned, do not incorrectly assume no interaction. + 2) Don't make unnecessary or speculative assumptions regarding potential interactions, collisions, evasive maneuvers, or conflicts not supported by the explicit context or ground truth. Prioritize explicit details in the context and ground truth. + 3) Don't elaborate on hypothetical, potential, or unwarranted interactions, trajectory adjustments, yielding, or collision avoidance unless directly stated in the scenario or ground truth. Prioritize explicit details in the context and ground truth. + 4) Don't overcomplicate the reasoning, introduce extraneous details, or speculate about future events, monitoring requirements, or passive observation that are not specified or relevant to the scenario. 
Prioritize explicit details in the context and ground truth. + 5) Don't provide overly detailed, verbose, redundant, or partially off-topic explanations; keep answers concise and focused on the central point. Prioritize key takeaways. + 6) Don't add information, nuanced explanations, or background reasoning beyond what is present in and supported by the ground truth. Prioritize explicit details in the ground truth. + 7) Don't fail to identify, mention, or precisely describe critical details explicitly present in the ground truth (e.g., yielding, overtaking, leading/following dynamics, acceleration, established right-of-way, explicit actions). Prioritize explicit details in the ground truth. + 8) Don't underestimate or overemphasize the importance of specific interactions compared to what is stated in the ground truth. Prioritize explicit details in the context and ground truth. + 9) Don't deviate from the ground truth by introducing unsupported, irrelevant, or misleading information, or by omitting key details prescribed in the scenario or answer. Prioritize explicit details in the context and ground truth. + 10) Don't present logical but incomplete, imprecise, or misaligned conclusions (with respect to the ground truth). Prioritize explicit details in the context and ground truth. + 11) Don't discuss possible but nonexistent proximity risks, interaction points, or minimal interactions not warranted by the scenario. Prioritize explicit details in the context and ground truth. + Please follow points 1 through 11 listed above when analyzing and responding to driving scenarios. + """ + #using interactions 0 2 4 6 direct_cot_prompt_4shot = f""" - I want you to answer some questions from the world of autonomous vehicle testing. + I want you to answer some questions from the world of autonomous vehicle testing. 
- Here are some examples of questions being answered: - First, some information about the context: "Can you describe the current road configuration in terms of lanes? The road has three lanes.What traffic controls are present in the current driving scene? There are no traffic controls present in the current driving scene.What is the ego agent's current velocity? The ego agent's current speed is 6 meters per second.Is the ego agent's speed constant or changing? The ego agent is accelerating.Could you specify the ego agent's current lane position? The ego agent is on the first lane from the right.What is the ego agent's current direction of travel? The ego agent is heading in the same direction as its current lane.What type of agent is surrounding agent #0? Surrounding agent #0 is a vehicle.How fast is surrounding agent #0 moving at the moment? Surrounding agent #0's current speed is 5 meters per second.What is the motion status of surrounding agent #0? Surrounding agent #0 is accelerating.Where is surrounding agent #0 in relation to the ego agent? Surrounding agent #0 is 4 meters on the left and 1 meter in front of the ego agent.What direction is surrounding agent #0 facing compared to the ego agent? Surrounding agent #0 is heading in the same direction as the ego agent.What type of agent is surrounding agent #1? Surrounding agent #1 is a vehicle.What is the current speed of surrounding agent #1? Surrounding agent #1's current speed is 4 meters per second.Is surrounding agent #1 accelerating or maintaining its speed? Surrounding agent #1 is moving at a constant speed.Can you describe the position of surrounding agent #1 relative to the ego agent? Surrounding agent #1 is 24 meters behind and 3 meters on the left of the ego agent.In which direction is surrounding agent #1 moving with respect to the ego agent? Surrounding agent #1 is heading in the same direction as the ego agent.What type of agent is surrounding agent #3? 
Surrounding agent #3 is a vehicle.What is the current velocity of surrounding agent #3? Surrounding agent #3 is not moving.Where is surrounding agent #3 located in relation to the ego agent? Surrounding agent #3 is 4 meters in front and 4 meters on the right of the ego agent.What direction is surrounding agent #3 facing in relation to the ego agent? Surrounding agent #3 is heading in the same direction as the ego agent.What type of agent is surrounding agent #4? Surrounding agent #4 is a vehicle.What is the motion status of surrounding agent #4? Surrounding agent #4 is not moving.Can you describe the position of surrounding agent #4 with respect to the ego agent? Surrounding agent #4 is 9 meters on the right and 1 meter behind the ego agent.In which direction is surrounding agent #4 heading compared to the ego agent? Surrounding agent #4 is heading the opposite direction as the ego agent.What type of agent is surrounding agent #5? Surrounding agent #5 is a vehicle.Is surrounding agent #5 currently in motion? Surrounding agent #5 is not moving.Where is surrounding agent #5 situated in relation to the ego agent? Surrounding agent #5 is 11 meters in front and 2 meters on the right of the ego agent.What direction is surrounding agent #5 facing with respect to the ego agent? Surrounding agent #5 is heading right of the ego agent.What type of agent is surrounding agent #6? Surrounding agent #6 is a vehicle.What is the current speed of surrounding agent #6? Surrounding agent #6 is not moving.Can you describe the position of surrounding agent #6 relative to the ego agent? Surrounding agent #6 is 14 meters in front and 7 meters on the right of the ego agent.In which direction is surrounding agent #6 moving with respect to the ego agent? Surrounding agent #6 is heading right of the ego agent." - - Question: "What interactions are anticipated between the ego agent and surrounding agent #0?" 
- Answer: "Surrounding agent #0 will overtake the ego agent as it is accelerating and will be further ahead in the future." + First, some information about the context: "Can you describe the type of intersection present in the current driving scenario? The intersection is a 4 way intersection.What is the status of the traffic light for the ego agent at this moment? The traffic light for the ego agent is green.Is there any information about the presence of stop signs, crosswalks, or speed bumps in the current scenario? There is no information about stop signs, crosswalks, or speed bumps in the current scenario.What is the ego agent's current action within the intersection? The ego agent is turning left and exiting the intersection.Could you specify the ego agent's current speed and whether it is increasing or decreasing? The ego agent's current speed is 13 m/s and it is accelerating.How does the current traffic light affect the ego agent's movement? The traffic light is green, which allows the ego agent to proceed.What type of agent is surrounding agent #0 and what is its current motion status? Surrounding agent #0 is a vehicle and it is accelerating.How is surrounding agent #0 positioned relative to the ego agent and the intersection? Surrounding agent #0 is on the left of the ego agent, in front of it, and is departing from the intersection.What is the current speed of surrounding agent #0? The current speed of surrounding agent #0 is 8 m/s.What type of agent is surrounding agent #1 and what is its current motion status? Surrounding agent #1 is a vehicle and it is not moving.How is surrounding agent #1 positioned relative to the ego agent and the intersection? Surrounding agent #1 is on the left of the ego agent, in front of it, and is heading towards the intersection.What type of agent is surrounding agent #2 and what is its current motion status? 
Surrounding agent #2 is a vehicle and it is decelerating.How is surrounding agent #2 positioned relative to the ego agent and the intersection? Surrounding agent #2 is on the left of the ego agent, in front of it, and is heading towards the intersection.What is the current speed of surrounding agent #2? The current speed of surrounding agent #2 is 3 m/s.What type of agent is surrounding agent #4 and what is its current motion status? Surrounding agent #4 is a vehicle and it is not moving.How is surrounding agent #4 positioned relative to the ego agent and the intersection? Surrounding agent #4 is on the left of the ego agent, behind it, and is on the same side of the intersection as the ego agent.What type of agent is surrounding agent #5 and what is its current motion status? Surrounding agent #5 is a vehicle and it is not moving.How is surrounding agent #5 positioned relative to the ego agent and the intersection? Surrounding agent #5 is on the left of the ego agent, behind it, and is departing from the intersection.What type of agent is surrounding agent #6 and what is its current motion status? Surrounding agent #6 is a vehicle and it is moving at a constant speed.How is surrounding agent #6 positioned relative to the ego agent and the intersection? Surrounding agent #6 is on the left of the ego agent, behind it, and is heading towards the intersection.Is there any traffic control affecting surrounding agent #6? Surrounding agent #6 is approaching a crosswalk 4 meters ahead.What type of agent is surrounding agent #7 and what is its current motion status? Surrounding agent #7 is a vehicle and it is not moving.How is surrounding agent #7 positioned relative to the ego agent and the intersection? Surrounding agent #7 is on the left of the ego agent, in front of it, and is on the same side of the intersection as the ego agent.What type of agent is surrounding agent #8 and what is its current motion status? 
Surrounding agent #8 is a vehicle and it is not moving.How is surrounding agent #8 positioned relative to the ego agent and the intersection? Surrounding agent #8 is on the left of the ego agent, in front of it, and is on the same side of the intersection as the ego agent." - Question: "Can you predict the interaction between the ego agent and surrounding agent #4?" - Answer: "There will be no interaction between the ego agent and surrounding agent #4 as they are heading in opposite directions and not affecting each other's path." + Here is the first example of a question being answered: + Question: "What interactions are expected between the ego agent and surrounding agent #0?" + Answer: "Surrounding agent #0 will have no interaction with the ego agent as it is departing from the intersection and their paths do not conflict." - Question: "What is the ego agent's plan for the immediate future?" - Answer: "The ego agent intends to continue on its current path and lane while accelerating. It will overtake surrounding agent #3 and pass surrounding agents #5 and #6, as they are not moving. It will also be overtaken by surrounding agent #0, which is accelerating on the left side." + Here is the second example of a question being answered: + Question: "What is the nature of the interaction between the ego agent and surrounding agent #2?" + Answer: "Surrounding agent #2 will yield to the ego agent as it is decelerating and heading towards the intersection while the ego agent is exiting the intersection with a green light." - Question: "What will be the nature of the interaction between the ego agent and surrounding agent #6?" - Answer: "The ego agent will pass surrounding agent #6 since surrounding agent #6 is stationary and the ego agent is accelerating." + Here is the third example of a question being answered: + Question: "What interaction will occur between the ego agent and surrounding agent #6?" 
+ Answer: "Surrounding agent #6 will yield to the ego agent as it is on the right of the intersection and is approaching a crosswalk, indicating it may need to stop, while the ego agent is actively exiting the intersection." - Given these examples now please have a look at the following new context and try to answer the following question: - Here is the context: {context} + Here is the fourth and last example of a question being answered: + Question: "What kind of interaction will take place between the ego agent and surrounding agent #8?" + Answer: "Surrounding agent #8 will have no interaction with the ego agent as it is not moving and is on the same side of the intersection as the ego agent." + + When analyzing driving scenarios, please follow points 1 through 8 written below: + 1) Don't speculate about potential or hypothetical interactions, maneuvers, conflicts, or yielding behavior that are not supported by the provided context or ground truth. Please prioritize explicit details in the context and ground truth. + 2) Don't overcomplicate the analysis with unnecessary details, verbose explanations, or elaborate on irrelevant possible actions or future trajectories not mentioned in the scenario. Please prioritize explicit details in the context. + 3) Don't misinterpret or incorrectly assess the key scenario dynamics, agent positions, motion statuses, relative movements, spatial relationships, or right-of-way. Because of the aforementioned, never assume no interaction. + 4) Don't introduce unwarranted scenarios (e.g., unnecessary speed or trajectory adjustments, collision risks, overtaking, or yielding) that detract from the actual dynamics and facts described. Please prioritize explicit details in the context. + 5) Don't omit or fail to align explanations with critical details or key points from the ground truth, especially regarding the presence or absence of interactions and required behaviors. 
Please prioritize explicit details in the context and ground truth. + 6) Don't offer conclusions or reasoning that contradict the context, such as stating there is (or is not) an interaction when the ground truth indicates the opposite, or introduce unsupported assumptions about right-of-way, vehicle intent, or agent relationships. Please prioritize explicit details in the ground truth. + 7) Don't add irrelevant commentary or speculative concerns that dilute the core answer. Please prioritize explicit details in the context and ground truth. + 8) Focus your answer on concise, accurate, and context-supported analysis aligned with the scenario's ground truth. Do not expand with hypothetical, unnecessary, or unsupported information. Please prioritize explicit details in the context and ground truth. + Please follow points 1 through 8 listed above when analyzing and responding to driving scenarios. + + """ + + #using interactions 0 1 2 3 5 7 + direct_cot_prompt_6shot = f""" + I want you to answer some questions from the world of autonomous vehicle testing. + + Here are some examples of questions being answered: + First, some information about the context: "Can you describe the type of intersection present in the current driving scenario? The intersection is a 4 way intersection.What is the status of the traffic light for the ego agent at this moment? The traffic light for the ego agent is green.Is there any information about the presence of stop signs, crosswalks, or speed bumps in the current scenario? There is no information about stop signs, crosswalks, or speed bumps in the current scenario.What is the ego agent's current action within the intersection? The ego agent is turning left and exiting the intersection.Could you specify the ego agent's current speed and whether it is increasing or decreasing? The ego agent's current speed is 13 m/s and it is accelerating.How does the current traffic light affect the ego agent's movement? 
The traffic light is green, which allows the ego agent to proceed.What type of agent is surrounding agent #0 and what is its current motion status? Surrounding agent #0 is a vehicle and it is accelerating.How is surrounding agent #0 positioned relative to the ego agent and the intersection? Surrounding agent #0 is on the left of the ego agent, in front of it, and is departing from the intersection.What is the current speed of surrounding agent #0? The current speed of surrounding agent #0 is 8 m/s.What type of agent is surrounding agent #1 and what is its current motion status? Surrounding agent #1 is a vehicle and it is not moving.How is surrounding agent #1 positioned relative to the ego agent and the intersection? Surrounding agent #1 is on the left of the ego agent, in front of it, and is heading towards the intersection.What type of agent is surrounding agent #2 and what is its current motion status? Surrounding agent #2 is a vehicle and it is decelerating.How is surrounding agent #2 positioned relative to the ego agent and the intersection? Surrounding agent #2 is on the left of the ego agent, in front of it, and is heading towards the intersection.What is the current speed of surrounding agent #2? The current speed of surrounding agent #2 is 3 m/s.What type of agent is surrounding agent #4 and what is its current motion status? Surrounding agent #4 is a vehicle and it is not moving.How is surrounding agent #4 positioned relative to the ego agent and the intersection? Surrounding agent #4 is on the left of the ego agent, behind it, and is on the same side of the intersection as the ego agent.What type of agent is surrounding agent #5 and what is its current motion status? Surrounding agent #5 is a vehicle and it is not moving.How is surrounding agent #5 positioned relative to the ego agent and the intersection? 
Surrounding agent #5 is on the left of the ego agent, behind it, and is departing from the intersection.What type of agent is surrounding agent #6 and what is its current motion status? Surrounding agent #6 is a vehicle and it is moving at a constant speed.How is surrounding agent #6 positioned relative to the ego agent and the intersection? Surrounding agent #6 is on the left of the ego agent, behind it, and is heading towards the intersection.Is there any traffic control affecting surrounding agent #6? Surrounding agent #6 is approaching a crosswalk 4 meters ahead.What type of agent is surrounding agent #7 and what is its current motion status? Surrounding agent #7 is a vehicle and it is not moving.How is surrounding agent #7 positioned relative to the ego agent and the intersection? Surrounding agent #7 is on the left of the ego agent, in front of it, and is on the same side of the intersection as the ego agent.What type of agent is surrounding agent #8 and what is its current motion status? Surrounding agent #8 is a vehicle and it is not moving.How is surrounding agent #8 positioned relative to the ego agent and the intersection? Surrounding agent #8 is on the left of the ego agent, in front of it, and is on the same side of the intersection as the ego agent." + + Here is the first example of a question being answered: + Question: "What interactions are expected between the ego agent and surrounding agent #0?" + Answer: "Surrounding agent #0 will have no interaction with the ego agent as it is departing from the intersection and their paths do not conflict." + + Here is the second example of a question being answered: + Question: "How will the ego agent and surrounding agent #1 interact as they are both near the intersection?" + Answer: "Surrounding agent #1 will yield to the ego agent because the ego agent has the right of way with a green traffic light and is already exiting the intersection while surrounding agent #1 is not moving." 
+ + Here is the third example of a question being answered: + Question: "What is the nature of the interaction between the ego agent and surrounding agent #2?" + Answer: "Surrounding agent #2 will yield to the ego agent as it is decelerating and heading towards the intersection while the ego agent is exiting the intersection with a green light." + + Here is the fourth example of a question being answered: + Question: "Can you describe the interaction between the ego agent and surrounding agent #4?" + Answer: "There will be no interaction between the ego agent and surrounding agent #4 as surrounding agent #4 is not moving and is on the same side of the intersection as the ego agent." + + Here is the fifth example of a question being answered: + Question: "What will be the interaction between the ego agent and surrounding agent #7?" + Answer: "Surrounding agent #7 will have no interaction with the ego agent as it is not moving and is on the same side of the intersection as the ego agent." + + Here is the sixth and last example of a question being answered: + Question: "What is the ego agent's plan in the immediate future?" + Answer: "The ego agent intends to complete its left turn and exit the intersection. It will proceed with the turn as the traffic light is green and it has the right of way. Surrounding agents #1 and #2 will yield to the ego agent, and surrounding agent #6 will likely stop at the crosswalk, so the ego agent does not need to alter its course in response to these agents." + + When analyzing driving scenarios, please follow points 1 through 11 written below: + 1) Don't speculate about potential or hypothetical interactions unless clearly indicated in the scenario. Please prioritize explicit details in the context. + 2) Don't provide overly detailed analyses or introduce unnecessary complexity not present in the ground truth. Please prioritize explicit details in the ground truth. 
+ 3) Don't assume or infer yielding behavior, collisions, or path conflicts unless explicitly described. Please prioritize explicit details in the context and ground truth. + 4) Don't include information about right of way, speed bumps, or surrounding agents unless it is directly supported by the scenario and ground truth. Please prioritize explicit details in the context and ground truth. + 5) Don't misinterpret the relative positions, motion statuses, or the spatial/directional dynamics of the agents. Because of the aforementioned, never assume no interaction. Do not ignore differences in relative speed and position between agents. Do not state "no interaction" just because there is no immediate collision or lane change. Do not overlook that a decelerating ego agent falling behind a constant-speed leader is still an interaction. Do not disregard lead-follow dynamics as unimportant to agent interactions. Do not provide shallow or incomplete analysis of agent relationships. + 6) Don't overanalyze or focus on possible but unsupported outcomes or secondary details. Please prioritize explicit details in the context and ground truth. + 7) Don't omit key points stated in the ground truth, such as explicit yielding, leading/following dynamics, or the order of actions. Please analyze the potential of the aforementioned, specifically how decelerating and constant speed combined cause no interaction. + 8) Don't use vague conclusions like "minimal" or "negligible" interaction instead of directly stating "no interaction" where appropriate. Please prioritize explicit details in your reasoning. + 9) Don't address passive observation, secondary potential scenarios, or future interactions unless specified by the scenario. Please prioritize explicit details in the context. + 10) Don't ignore or fail to directly address the main interaction or key behavioral details described in the ground truth. Please prioritize explicit details in your reasoning. 
+ 11) Keep responses concise, directly address the core scenario as described, and strictly adhere to the information provided in the ground truth. Please prioritize explicit details in the ground truth. + Please follow points 1 through 11 listed above when analyzing and responding to driving scenarios. + + """ - Here is the question: {question} - - """ if prompt_type=="4shot": return direct_cot_prompt_4shot - elif prompt_type=="direct": + elif prompt_type=="0shot": return direct_prompt + elif prompt_type=="2shot": + return direct_cot_prompt_2shot + elif prompt_type=="6shot": + return direct_cot_prompt_6shot + +################# ============== Prompt Refining Functions ===================== ############## +def prepare_refined_prompt_with_final_question(context, + question, + refining_model="gpt-4.1", + initial_prompt="", + enable_lecturing=False): + instructions_to_refine = f""" + I have an autonomous vehicle scenario, some context information and instructions on what kind of reasoning to not do: + {initial_prompt} + + Please rewrite the context and please rewrite the examples according to the following instructions: + * In the examples, please include the reasoning while closely following the guidelines mentioned previously. + * Please follow the guidelines constructively. + * Please provide detailed fine-grained reasoning. + * Please ensure in each example that the reasoning is written before the answer. + * Please ensure that the facts are consistent with the initial information. + * Think step by step. + """ + + instructions_to_refine_lecturing = f""" + I have an autonomous vehicle scenario, some context information and instructions on what kind of reasoning to not do: + {initial_prompt} + + Please generate a lecture transcript which explains the concepts mentioned in the scenario information provided above. Most importantly, please make it absolutely clear what the notion of an interaction actually means in this specific context and situation. 
+ + Then please rewrite the context and please rewrite the examples according to the following instructions: + * In the examples given previously, please include the reasoning while closely following the guidelines mentioned previously. + * Please follow the guidelines constructively. + * Please provide detailed fine-grained reasoning. + * Please ensure in each example that the reasoning is written before the answer. + * Please ensure that the facts are consistent with the initial information. + * Think step by step. + + """ + if enable_lecturing==False: + refined_cot_prompt = openai_call(model_name=refining_model, prompt=instructions_to_refine) + elif enable_lecturing==True: + refined_cot_prompt = openai_call(model_name=refining_model, prompt=instructions_to_refine_lecturing) + + final_question = f""" + {refined_cot_prompt} + Given these examples now please have a look at the following new context and try to answer the following question pertaining to the new context: + Here is the context: {context} + + Here is the question: {question} + + Please ensure that the answers are clear, explanatory, yet concise. 
+ """ + return final_question ################# ============= Grading via LLM as a judge prompts ================== ############### -def prepare_grading_prompt(context, question, answer, model_output): - grading_prompt = f""" - Here is some context about the test scenario: - {context} +def prepare_grading_prompt(context, question, answer, model_output, ground_truth_eval=False): + ai_attempt = f""" + This was the attempt by an AI for this question + {model_output} + """ + + if ground_truth_eval==True: + grading_prompt = f""" + Here is some context about the test scenario: + {context} - This question was asked with regards to this context: - {question} + This question was asked with regards to this context: + {question} - This is the ground truth answer: - {answer} - - This was the attempt by an AI for this question - {model_output} - - Grade this answer on the following aspects: - 1. The correctness of the AI answer with respect to the ground truth answer. Give it a score between 1 to 10. - Explain why this score was given by you in detail. - 2. The faithfulness of the reasoning. Are the conclusions drawn in the answer given by the AI consistent with its reasoning? Here, give it a score between 1 to 10. - Explain why this score was given by you in detail. - - Format the answer in a python dictionary format like this. - : - "Correctness score": "", - "Correctness explanation": "", - "Faithfulness score": "", - "Faithfulness explanation": "", - - - Don't write anything else. Nothing else, nothing else, nothing else. - Please only write it in the format requested. - """ + This is an answer: + {answer} + + Please grade the answer above in accordance with the following aspects: + * Write the correctness of the answer with respect to the ground truth answer. Give it a score between 1 to 10. + * Explain why this score was given by you in detail. + * Please only consider the context information mentioned above that is relevant to the question. 
Please explain why you believe it is relevant. + * Please think step by step. + * Please do not deduct points for specifics unless it is important to effectively answer the question. Please explain why you believe it is important. + * Please format the answer in a python dictionary format like this: + : + "Correctness score": "", + "Correctness explanation": "", + "Updated Answer": + + + Don't write anything else. Nothing else, nothing else, nothing else. + Please only write it in the format requested. + + """ + else: + grading_prompt = f""" + Here is some context about the test scenario: + {context} + + This question was asked with regards to this context: + {question} + + This is the ground truth answer: + {answer} + + {ai_attempt} + + Please grade the AI answer above in accordance with the following aspects: + * Write the correctness of the answer with respect to the ground truth answer. Give it a score between 1 to 10. + * Explain why this score was given by you in detail. + * Please only consider the context information mentioned above that is relevant to the question. Please explain why you believe it is relevant. + * Please think step by step. + * Please do not deduct points for specifics unless it is important to effectively answer the question. Please explain why you believe it is important. + * Please format the answer in a python dictionary format like this: + : + "Correctness score": "", + "Correctness explanation": "" + + + Don't write anything else. Nothing else, nothing else, nothing else. + Please only write it in the format requested. 
+ + """ return grading_prompt +############### =============== Generate prompts, AI and LLM as a judge responses ============== ######### +def run_llm_evals(context, question, answer, + initial_prompt, + question_gen_model, + eval_model_family, + eval_model, + scenario_id, + interaction_id, + context_word_count, + existing_grades, + ground_truth, + enable_basic_refined_prompt, + enable_lecturing_refined_prompt): + + # LLM as a judge grade generation + if ground_truth==True: + grading_prompt= prepare_grading_prompt(context=context, + question=question, + answer=answer, + model_output="", + ground_truth_eval=True) + grading_output = eval(deepinfra_call(model_name="deepseek-ai/DeepSeek-V3", prompt=grading_prompt)) + + # Preparation of the grades file. + existing_grades[scenario_id][interaction_id].setdefault( + "Ground Truth Grades", grading_output + ) + correctness = int(grading_output["Correctness score"]) + + existing_grades[scenario_id][interaction_id]["Ground Truth Grades"].setdefault("Word Count", (str(context_word_count))) + + else: + # Refining the CoT examples given in a prompt. + if enable_basic_refined_prompt==True: + final_prompt = prepare_refined_prompt_with_final_question(context=context, + question=question, + refining_model=question_gen_model, + initial_prompt=initial_prompt, + enable_lecturing=False) + elif enable_lecturing_refined_prompt==True: + final_prompt = prepare_refined_prompt_with_final_question(context=context, + question=question, + refining_model=question_gen_model, + initial_prompt=initial_prompt, + enable_lecturing=True) + else: final_prompt=initial_prompt + + # LLM response involving specific prompting method along with a question and the corresponding context. + ai_response = openai_call(model_name=eval_model, prompt=final_prompt) + + # Send the LLM response to the LLM as a judge. 
+ grading_prompt = prepare_grading_prompt(context=context, + question=question, + answer=answer, + model_output=ai_response) + grading_output = eval(deepinfra_call(model_name="deepseek-ai/DeepSeek-V3", prompt=grading_prompt)) + + # Recording the grades generated by the LLM as a judge. + existing_grades[scenario_id][interaction_id].setdefault(f"LLM {eval_model} Response", ai_response) + + # Preparation of the grades file. + existing_grades[scenario_id][interaction_id].setdefault( + eval_model_family+"_"+eval_model+"_modelname", grading_output + ) + correctness = int(grading_output["Correctness score"]) + + existing_grades[scenario_id][interaction_id][eval_model_family+"_"+eval_model+"_modelname"].setdefault("Word Count", (str(context_word_count))) + + # Update the model-wise correctness outputs for creating bar charts at the end. + model_dictionary[eval_model_family][eval_model].append(correctness) ############### =============== Evaluating Interactions ================ ############## -def grade_openai_deepinfra_models_one_interaction(model_dictionary, - existing_grades, +def grade_openai_deepinfra_models_one_interaction(model_dictionary, scenario_id, interaction_id, - prompt_type): + prompt_type, + existing_grades, + ground_truth, + enable_basic_refined_prompt, + enable_lecturing_refined_prompt): - #### Step 1: Generate the PDDL prompts ======================= ######### + # Retrieving preprocessed data context = scenario_domain_and_problem_data[scenario_id]["Context"] question = scenario_domain_and_problem_data[scenario_id]["Interactions"][interaction_id]["problem_data"] answer = scenario_domain_and_problem_data[scenario_id]["Interactions"][interaction_id]["answer_data"] + context_word_count = scenario_domain_and_problem_data[scenario_id]["Word Count"] + # Initial prompt generation given QA data for one interaction. 
generated_prompt = generate_qa_prompt(context, question, answer, prompt_type) - #### Step 2: Generate the model grades and add them to the dictionary for model_family in model_dictionary.keys(): if model_family=="openai_models": for model_name in model_dictionary[model_family]: - grading_prompt = prepare_grading_prompt(context=context, question=question, - answer=answer, model_output=openai_call(model_name=model_name, prompt=generated_prompt)) - grading_output = eval(deepinfra_call(model_name="deepseek-ai/DeepSeek-V3", prompt=grading_prompt)) - existing_grades[scenario_id][interaction_id].setdefault( - model_family+"_"+model_name+"_modelname", grading_output - ) - avg_score = (int(grading_output["Correctness score"]) + int(grading_output["Faithfulness score"]))/2 - existing_grades[scenario_id][interaction_id][model_family+"_"+model_name+"_modelname"].setdefault("problem_score_avg", (str(avg_score))) - model_dictionary[model_family][model_name].append(avg_score) + # Generate (1) refined prompts. (2) LLM responses. 
(3) LLM as a judge grades using the function below: + run_llm_evals(context=context, question=question, answer=answer, initial_prompt=generated_prompt, + question_gen_model="gpt-4.1", + eval_model_family=model_family, + eval_model=model_name, + scenario_id=scenario_id, + interaction_id=interaction_id, + existing_grades=existing_grades, + context_word_count=context_word_count, + ground_truth=ground_truth, + enable_basic_refined_prompt=enable_basic_refined_prompt, + enable_lecturing_refined_prompt=enable_lecturing_refined_prompt) elif model_family=="deepinfra_models": for model_name in model_dictionary[model_family]: - grading_prompt = prepare_grading_prompt(context=context, question=question, - answer=answer, model_output=deepinfra_call(model_name=model_name, prompt=generated_prompt)) - grading_output = eval(deepinfra_call(model_name="deepseek-ai/DeepSeek-V3", prompt=grading_prompt)) - existing_grades[scenario_id][interaction_id].setdefault( - model_family+"_"+model_name+"_modelname", grading_output - ) - avg_score = (int(grading_output["Correctness score"]) + int(grading_output["Faithfulness score"]))/2 - existing_grades[scenario_id][interaction_id][model_family+"_"+model_name+"_modelname"].setdefault("problem_score_avg", (str(avg_score))) - model_dictionary[model_family][model_name].append(avg_score) - - -def pddl_response_and_answer_questions(prompt_type="4shot"): + run_llm_evals(context=context, question=question, answer=answer, + initial_prompt=generated_prompt, + question_gen_model="gpt-4.1", + eval_model_family=model_family, + eval_model=model_name, + scenario_id=scenario_id, + interaction_id=interaction_id, + existing_grades=existing_grades, + context_word_count=context_word_count, + ground_truth=ground_truth, + enable_basic_refined_prompt=enable_basic_refined_prompt, + enable_lecturing_refined_prompt=enable_lecturing_refined_prompt) + +def generate_and_evaluate_llm_response(prompt_type="4shot", arguments=None): # Parse through the preprocessed json data 
contained in parsed_womdr_data/ + # Loop through all scenarios in the preprocessed data. for scenario_id in scenario_domain_and_problem_data.keys(): + existing_grades = {} existing_grades.setdefault(scenario_id, {}) + current_scenario_index = scenario_domain_and_problem_data[scenario_id]["Scenario Index"] + if (arguments.scenario_index!=None) and (current_scenario_index!=arguments.scenario_index): + continue + existing_grades[scenario_id].setdefault("Scenario Index", current_scenario_index) + + # Looping through all interactions in a given scenario. for interaction_id in scenario_domain_and_problem_data[scenario_id]["Interactions"].keys(): existing_grades[scenario_id].setdefault(interaction_id, {}) - - ##### ===================== Automatic model evaluation with LLM grades on outputs ============== ######### + existing_grades[scenario_id][interaction_id].setdefault("Given Question", scenario_domain_and_problem_data[scenario_id]["Interactions"][interaction_id]["problem_data"]) + + existing_grades[scenario_id][interaction_id].setdefault("Ground Truth Answer", scenario_domain_and_problem_data[scenario_id]["Interactions"][interaction_id]["answer_data"]) + + # Set of transformation from the string parameter to the boolean parameter. + # It is this way because Popen needs string parameters which is in turn needed for parallel experiment running. + if arguments.ground_truth=="True": bool_ground_truth = True + elif arguments.ground_truth=="False": bool_ground_truth = False + + if arguments.enable_basic_refined_prompt=="True": bool_enable_basic_refined_prompt = True + elif arguments.enable_basic_refined_prompt=="False": bool_enable_basic_refined_prompt = False + + if arguments.enable_lecturing_refined_prompt=="True": bool_enable_lecturing_refined_prompt = True + elif arguments.enable_lecturing_refined_prompt=="False": bool_enable_lecturing_refined_prompt = False + # LLM outputs and LLM as a judge grade generation for one interaction within one scenario. 
grade_openai_deepinfra_models_one_interaction(model_dictionary=model_dictionary, existing_grades=existing_grades, scenario_id=scenario_id, interaction_id=interaction_id, - prompt_type=prompt_type) + prompt_type=prompt_type, + ground_truth=bool_ground_truth, + enable_basic_refined_prompt=bool_enable_basic_refined_prompt, + enable_lecturing_refined_prompt=bool_enable_lecturing_refined_prompt) - #Ensure that this json file by the name grades/deepseek_grades.json exists first. - with open("grades/direct/deepseek_grades_direct_"+prompt_type+".json", 'w') as grade_file: - print("Existing grades is given by {}".format(existing_grades)) - json.dump(existing_grades, grade_file, indent=4) - grade_file.close() + # Creating the grade files now. + # Ensure that this json file by the name grades/deepseek_grades.json exists first. + if bool_ground_truth==False: + with open("grades/grades_"+"scenario_index_"+str(current_scenario_index)+"_prompt_type_"+prompt_type+"_"+scenario_id+".json", 'w') as grade_file: + print("Existing grades is given by {}".format(existing_grades)) + json.dump(existing_grades, grade_file, indent=4) + grade_file.close() + else: + with open("grades/grades_"+"scenario_index_"+str(current_scenario_index)+"_prompt_type_"+"ground_truth"+"_"+scenario_id+".json", 'w') as grade_file: + print("Existing grades is given by {}".format(existing_grades)) + json.dump(existing_grades, grade_file, indent=4) + grade_file.close() -def main(): +def parse_arguments(): + arguments = parser.parse_args() + print(f"The argument is {arguments.nshot}") + if arguments.nshot=="4shot" or arguments.nshot=="0shot" or arguments.nshot=="2shot" or arguments.nshot=="6shot": + prompt_type = arguments.nshot + else: prompt_type = "2shot" + print(f"Prompt type is {prompt_type}") + print(f"The scenario index argument is {arguments.scenario_index}") + print(f"Enable basic refined prompt has been set to {arguments.enable_basic_refined_prompt}") + print(f"Enable lecturing refined prompt has been set 
to {arguments.enable_lecturing_refined_prompt}") + return arguments, prompt_type - # Change parameter here depending on the prompt. - prompt_type = "direct" - pddl_response_and_answer_questions(prompt_type=prompt_type) +def plot_correctness_scores(arguments): for model_provider in model_dictionary.keys(): for model in model_dictionary[model_provider].keys(): plt.bar([i for i in range(len(model_dictionary[model_provider][model]))], model_dictionary[model_provider][model]) + plt.title(f"Correctness Scores for Scenario Index {arguments.scenario_index}") + plt.xlabel("Interactions") + plt.ylabel("Correctness Scores") plt.show() + +def main(): + arguments, prompt_type = parse_arguments() + generate_and_evaluate_llm_response(prompt_type=prompt_type, arguments=arguments) + plot_correctness_scores(arguments=arguments) + main() \ No newline at end of file diff --git a/parallel_experiments.py b/parallel_experiments.py new file mode 100644 index 0000000..fcec46e --- /dev/null +++ b/parallel_experiments.py @@ -0,0 +1,32 @@ +import subprocess + +print("Running nshot prompting experiments \n") +print("All scripts requested will run in the background and will return results in the grades folder soon!") + +scenario_index_list_small = [239, 562, 999, 2827, 475] +scenario_index_list_medium = [6, 254, 622, 136, 182] +scenario_index_list_large = [52, 13, 41, 102, 600] + +scenario_indices_all = scenario_index_list_small + scenario_index_list_medium + scenario_index_list_large + +for index in scenario_index_list_large: + process_1 = subprocess.Popen(["C:\\Users\\ishaa\\anaconda3\\Scripts\\activate.bat", "C:\\Users\\ishaa\\anaconda3", + "&&", "conda", "activate", "car_beh_gen", + "&&", "python", "llm_qa_direct_only.py", "--nshot", "0shot", + "--scenario_index", str(index), "--ground_truth", "False", + "--enable_basic_refined_prompt", "False", "--enable_lecturing_refined_prompt", "True"]) + process_2 = subprocess.Popen(["C:\\Users\\ishaa\\anaconda3\\Scripts\\activate.bat", 
"C:\\Users\\ishaa\\anaconda3", + "&&", "conda", "activate", "car_beh_gen", + "&&", "python", "llm_qa_direct_only.py", "--nshot", "2shot", + "--scenario_index", str(index), "--ground_truth", "False", + "--enable_basic_refined_prompt", "False", "--enable_lecturing_refined_prompt", "True"]) + process_3 = subprocess.Popen(["C:\\Users\\ishaa\\anaconda3\\Scripts\\activate.bat", "C:\\Users\\ishaa\\anaconda3", + "&&", "conda", "activate", "car_beh_gen", + "&&", "python", "llm_qa_direct_only.py", "--nshot", "4shot", + "--scenario_index", str(index), "--ground_truth", "False", + "--enable_basic_refined_prompt", "False", "--enable_lecturing_refined_prompt", "True"]) + process_4 = subprocess.Popen(["C:\\Users\\ishaa\\anaconda3\\Scripts\\activate.bat", "C:\\Users\\ishaa\\anaconda3", + "&&", "conda", "activate", "car_beh_gen", + "&&", "python", "llm_qa_direct_only.py", "--nshot", "6shot", + "--scenario_index", str(index), "--ground_truth", "False", + "--enable_basic_refined_prompt", "False", "--enable_lecturing_refined_prompt", "True"]) \ No newline at end of file diff --git a/parse_scenario_womd.py b/parse_scenario_womd.py index a84a838..2493264 100644 --- a/parse_scenario_womd.py +++ b/parse_scenario_womd.py @@ -1,14 +1,20 @@ import json -from guidance import models, gen, user, assistant, system from openai import OpenAI import os -from rouge import Rouge +import re +from sklearn.decomposition import PCA +import numpy as np +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans +from sklearn.metrics.pairwise import cosine_similarity +from tqdm import tqdm +from pathlib import Path -scenario_files = os.listdir("../car_beh_gen/datasets/training.tar/training/training/") -scenario_blocklist = ['3e9622a454291617'] +scenario_files = os.listdir("../training/") +scenario_blocklist = [] def generate_womd_reasoning_datapoint(filename): - with open('../car_beh_gen/datasets/training.tar/training/training/'+filename, 'r') as file: + with open('../training/'+filename, 'r') as 
file: data = json.loads(file.read()) new_data_no_interactions = { 'environment questions': data['env_q'], @@ -34,16 +40,20 @@ def process_womd_datapoint_for_mcq_gen(womd_datapoint): environment_facts += womd_datapoint['env_q'][index] environment_facts += " " environment_facts += womd_datapoint['env_a'][index] + environment_facts += " " for index in range(len(womd_datapoint['ego_q'])): ego_facts += womd_datapoint['ego_q'][index] ego_facts += " " ego_facts += womd_datapoint['ego_a'][index] + ego_facts += " " for index in range(len(womd_datapoint['sur_q'])): surr_facts += womd_datapoint['sur_q'][index] surr_facts += " " surr_facts += womd_datapoint['sur_a'][index] + if index != (len(womd_datapoint['sur_q']) - 1): + surr_facts += " " facts = { "Facts about the static environment": environment_facts, @@ -61,11 +71,213 @@ def process_womd_datapoint_for_mcq_gen(womd_datapoint): return facts, mcq_qa_information -def obtain_and_write_mcq_data(start, end): - for filename in scenario_files[start:end]: +def obtain_smallest_scenario_index(start, end): + filesize = 100000 + smallest_scenario_index = 1 + for index in range(len(scenario_files[start:end])): + file_size = os.path.getsize('../training/'+scenario_files[start+index]) + if file_size < filesize: + filesize = file_size + smallest_scenario_index = start+index + print(f"Smallest scenario index is {smallest_scenario_index}") + + +def transform_datapoint_to_qa_list(wmo_reasoning_datapoint): + combined_qa_list = [] # A list that will combine all the QA lists in one datapoint. This + # will result in one large list of strings. 
+ + datapoint_qa_keys = ["env_q", "env_a", "ego_q", "ego_a", "sur_q", "sur_a", "int_q", "int_a"] + for qa_key in datapoint_qa_keys: combined_qa_list += wmo_reasoning_datapoint[qa_key] + + return combined_qa_list + +# Function to implement k nearest neighbour prompting +# The notion of nearest is based on the OpenAI and Gemini embedding representation of the +# (1) The context (2) Both the context and one interaction question. +def find_similar_data_emb_based(search_range_start=100, + search_range_end=200, + consider_scenario_context=True, + consider_scenario_interactions=True, + embedding_model_type="small", + number_of_clusters=5): + + client = OpenAI() + if embedding_model_type==("small"): + embedding_dimensions = 1536 + embedding_model_name = "text-embedding-3-small" + elif embedding_model_type==("large"): + embedding_dimensions = 3072 + embedding_model_name = "text-embedding-3-large" + + # Storing embeddings in a single numpy array for dimensionality reduction later. + embedding_vector_array = np.zeros((len(scenario_files[search_range_start: search_range_end]), embedding_dimensions)) # text embedding small generates embedding vectors of the dimension of 1536 + + for i in tqdm(range(len(scenario_files[search_range_start: search_range_end]))): + datapoint_search_candidate = generate_womd_reasoning_datapoint(filename=scenario_files[search_range_start+i]) + + facts, qa_info = process_womd_datapoint_for_mcq_gen(datapoint_search_candidate) + context_and_interactions = "" + + if consider_scenario_interactions==False and consider_scenario_context==False: + print("No scenario information to work with, please try again!") + break + + if consider_scenario_context==True: + context_and_interactions += facts["Facts about the static environment"] + context_and_interactions += facts["Facts about the ego vehicle in this environment"] + context_and_interactions += facts["Facts about the agents surrounding the ego vehicle in this environment"] + elif 
consider_scenario_interactions==True: + for i in range(len(datapoint_search_candidate['int_q'])): + context_and_interactions += datapoint_search_candidate['int_q'][i] + context_and_interactions += datapoint_search_candidate['int_a'][i] + + emb_response = client.embeddings.create( + input=context_and_interactions, + model=embedding_model_name # Experiment with the large embedding model as well as the Gemini model. + ) + embedding_vector = emb_response.data[0].embedding + embedding_vector_array[i] = embedding_vector + + # embedding_vector_array[len(scenario_files[search_range_start: search_range_end])] = ref_scenario_embedding_final #n-1th index. + + # Reduce dimensionality using PCA. This allows for observations of the embeddings on a graph. + pca = PCA(n_components=2) + low_dim_embeddings = pca.fit_transform(embedding_vector_array) + + plot_x_list = [] + plot_y_list = [] + + print(f"\n Low dimension embeddings are {len(low_dim_embeddings)}\n") # Should be 1 greater than high dimension embeddings if we are adding the reference scenario from above as well. + print(f"\n High dimension embeddings rows are {len(scenario_files[search_range_start: search_range_end])}\n") + + # K-means clustering + kmeans = KMeans(n_clusters=number_of_clusters, init="k-means++", random_state=42) + kmeans.fit(low_dim_embeddings) # K-means clustering of the low dimension representation of the embeddings + labels = kmeans.labels_ + cluster_centers = kmeans.cluster_centers_ + color_list = ["blue", "orange", "green", "red", "purple"] + + # Low dimension embeddings will have corresponding indices compared to high dimension embeddings. 
+ for i in range(len(low_dim_embeddings)): + plt.scatter(low_dim_embeddings[i][0], low_dim_embeddings[i][1], color=color_list[labels[i]]) + plt.annotate(text=str(search_range_start+i), + xy=(low_dim_embeddings[i][0], low_dim_embeddings[i][1]), + xytext=(low_dim_embeddings[i][0]+0.001, low_dim_embeddings[i][1]+0.001), + fontname='monospace', + fontsize='xx-small') + + plt.scatter([i[0] for i in cluster_centers], [i[1] for i in cluster_centers], marker="X") + plt.title(f"Embeddings between the indices {search_range_start} and {search_range_end}") + plt.show() + + +# Find scenarios similar to the given scenario +def find_similar_data(scenario_index, search_range_start, search_range_end): + # Ensure that only one scenario is used as the reference for comparison. + + highest_similarity_score = 0 + second_highest_similarity_score = 0 + + highest_similarity_index = 0 + second_highest_similarity_index = 0 + + lowest_similarity_score = 10000 + second_lowest_similarity_score = 10000 + + lowest_similarity_index = 0 + second_lowest_similarity_index = 0 + + for filename in scenario_files[scenario_index: scenario_index+1]: + datapoint_reference = generate_womd_reasoning_datapoint(filename=filename) + combined_qa_list_reference = transform_datapoint_to_qa_list(datapoint_reference) + + # Within the datapoint above, we have keys about the following concepts: + # scene id, ego id, start time, end time, main dataset ids for surrounding agents, current dataset ids for + # surrounding agents. env_q, env_a for environment questions. ego_q, ego_a for ego questions. + # sur_q, sur_a for surrounding agent questions. int_q and int_a for interaction related questions. + # It is observed that the questions and answers are NOT consistent across all datapoints. + + # Using the search_range_start and search_range_end indices, we will search all these scenario file indices to find + # the most similar scenario. 
Firstly, the questions under each category need to be similar and then the corresponding answers need to be similar. + # We will compute a match score based on the following factors: + # number of elements in the current dataset ids list mentioned above. If it's within +-2, we add one point. + # QA similarity for each of the 3 question categories. Add one point for each question being matched and each answer being matched. + + number_of_surrounding_reference = len(datapoint_reference["rel_qa_id"]) + scenario_similarity_score_collection = {} # IDs mapped to similarity scores. + scenario_similarity_score_search_candidate = 0 + + for i in range(len(scenario_files[search_range_start: search_range_end])): + if i==scenario_index: continue + datapoint_search_candidate = generate_womd_reasoning_datapoint(filename=scenario_files[search_range_start+i]) + number_of_surrounding_search_candidate = len(datapoint_search_candidate["rel_qa_id"]) + + # Check if the number of surrounding agents in the search candidate are close enough to the reference scenario. + if abs((number_of_surrounding_reference - number_of_surrounding_search_candidate)) <= 2: scenario_similarity_score_search_candidate += 1 + + combined_qa_list = transform_datapoint_to_qa_list(datapoint_search_candidate) + + # Turn the search candidate into one string where we can search for similarities. + combined_qa_search_candidate = "" + combined_qa_search_candidate = combined_qa_search_candidate.join(combined_qa_list) + + # Using the regex library to search for matching patterns. + # Iterate through each question and answer in the reference and add a point each time there is a string match in the candidate. + for search_term in combined_qa_list_reference: + matches = re.search(search_term, combined_qa_search_candidate) + + # Add one point each time a match to either a question or an answer is provided. 
+ if (matches is not None): + scenario_similarity_score_search_candidate += 1 + + # Modifying the highest score value + if scenario_similarity_score_search_candidate > highest_similarity_score: + second_highest_similarity_score = highest_similarity_score + second_highest_similarity_index = highest_similarity_index + + highest_similarity_score = scenario_similarity_score_search_candidate + highest_similarity_index = search_range_start+i + + # Modifying the lowest score value + elif scenario_similarity_score_search_candidate < lowest_similarity_score: + second_lowest_similarity_score = lowest_similarity_score + second_lowest_similarity_index = lowest_similarity_index + + lowest_similarity_score = scenario_similarity_score_search_candidate + lowest_similarity_index = search_range_start+i + + # Add to dictionary anyway + scenario_similarity_score_collection.setdefault("Index_number_"+str(search_range_start+i), [datapoint_search_candidate["sid"], str(scenario_similarity_score_search_candidate)]) + scenario_similarity_score_search_candidate = 0 + + with open("scenario_similarity_ref_"+str(scenario_index)+"_search_"+str(search_range_start)+"_"+str(search_range_end)+".json", 'w') as file: + json.dump(scenario_similarity_score_collection, file, indent=4) + + print("\n The highest similarity index is {}".format(highest_similarity_index)) + print("\n The highest similarity score for this index is {}".format(highest_similarity_score)) + + print("\n The second highest similarity index is {}".format(second_highest_similarity_index)) + print("\n The second highest similarity score for this index is {}".format(second_highest_similarity_score)) + + obtain_and_write_data_single_scenario(highest_similarity_index) + print("\n This index has been parsed and is in the parsed/... folder") + + obtain_and_write_data_single_scenario(second_highest_similarity_index) + print("\n This index has also been parsed and is in the parsed/... 
folder") + + print("\n The lowest similarity index is {}".format(lowest_similarity_index)) + print("\n The lowest similarity score for this index is {}".format(lowest_similarity_score)) + + print("\n The second lowest similarity index is {}".format(second_lowest_similarity_index)) + print("\n The second lowest similarity score for this index is {}".format(second_lowest_similarity_score)) + + return scenario_similarity_score_collection + +def obtain_and_write_data(start, end): + for i in range(len(scenario_files[start:end])): blocklist_match = False final_preprocessed_data = {} - womd_datapoint = generate_womd_reasoning_datapoint(filename=filename) + womd_datapoint = generate_womd_reasoning_datapoint(filename=scenario_files[start+i]) id = womd_datapoint['sid'] # Add bad scenarios to the blocklist @@ -78,19 +290,63 @@ def obtain_and_write_mcq_data(start, end): facts, mcq_info = process_womd_datapoint_for_mcq_gen(womd_datapoint=womd_datapoint) reference_context = facts["Facts about the static environment"]+facts["Facts about the ego vehicle in this environment"]+facts["Facts about the agents surrounding the ego vehicle in this environment"] preprocessed_data = {} + preprocessed_data["Size"] = os.path.getsize('../training/'+scenario_files[start+i]) preprocessed_data["Context"] = reference_context + + context_word_count = len(reference_context.split(" ")) + preprocessed_data["Word Count"] = context_word_count + preprocessed_data["Scenario Index"] = start+i + + if preprocessed_data['Size'] > 10000: + preprocessed_data['Scenario Category'] = "C" + elif 6000 <= preprocessed_data['Size'] <= 9999: + preprocessed_data['Scenario Category'] = "B" + elif preprocessed_data['Size'] < 6000: + preprocessed_data['Scenario Category'] = "A" + preprocessed_data["Interactions"] = {} - for i in range(len(mcq_info)): #Iterate over the mcqs generated + for mcq_info_i in range(len(mcq_info)): #Iterate over the mcqs generated original_qa_data = {} - reference_question = 
womd_datapoint['int_q'][i] - reference_answer = womd_datapoint['int_a'][i] - + reference_question = womd_datapoint['int_q'][mcq_info_i] + reference_answer = womd_datapoint['int_a'][mcq_info_i] + original_qa_data["reference_question"] = reference_question original_qa_data["reference_answer"] = reference_answer - preprocessed_data["Interactions"]["Interactions_"+str(i)] = original_qa_data + preprocessed_data["Interactions"]["Interactions_"+str(mcq_info_i)] = original_qa_data final_preprocessed_data[str(id)] = preprocessed_data - with open("parsed_womdr_data/"+str(id)+".json", 'w') as file: + # Do not change the id and json extension at the end since this is used by the parser + with open("parsed_womdr_data/"+"scenario_index_"+str(start+i)+"_scenario_id_"+str(id)+".json", 'w') as file: json.dump(final_preprocessed_data, file, indent=4) +# In case you're working with one scenario index at a time. + +def obtain_and_write_data_single_scenario(scenario_index): + obtain_and_write_data(scenario_index, scenario_index+1) + +# Comment out lines as necessary + +# find_similar_data(254, 400, 1000) # This includes the obtain function below btw + +#From the initial 60 experiments: + +scenario_index_list_small = [239, 562, 999, 2827, 475] +scenario_index_list_medium = [6, 254, 622, 136, 182] +scenario_index_list_large = [52, 13, 41, 102, 600] + +scenario_indices_all = scenario_index_list_small+scenario_index_list_medium+scenario_index_list_large + + +for scenario_index in scenario_indices_all: + obtain_and_write_data_single_scenario(scenario_index) + +# obtain_smallest_scenario_index(0, 51855) + + +# find_similar_data_emb_based(search_range_start=250, +# search_range_end=280, +# consider_scenario_context=True, +# consider_scenario_interactions=False, +# embedding_model_type="small", +# number_of_clusters=3) diff --git a/parsed_data_retrieval.py b/parsed_data_retrieval.py new file mode 100644 index 0000000..c69b516 --- /dev/null +++ b/parsed_data_retrieval.py @@ -0,0 +1,32 @@ 
+import os +import json + +def retrieve_parsed_data(): + + parsed_womdr_files = os.listdir("parsed_womdr_data/") + scenario_domain_problem_data = {} + + for i in parsed_womdr_files: + with open("parsed_womdr_data/"+i, 'r') as scenario_file: + scenario_data = json.load(scenario_file) + print(f"number of scenarios are {scenario_data.keys()}") + current_scenario_id = i[-21:-5] + for key in scenario_data.keys(): + # Indices here have been planned based on the Waymo Reasoning dataset files + scenario_domain_problem_data.setdefault(current_scenario_id, { + "Context": "" + }) + scenario_domain_problem_data[current_scenario_id].setdefault("Scenario Index", scenario_data[key]["Scenario Index"]) + scenario_domain_problem_data[current_scenario_id]["Context"] = scenario_data[key]["Context"] + scenario_domain_problem_data[current_scenario_id]["Word Count"] = scenario_data[key]["Word Count"] + #print(f"number of interactions in this scenario are {scenario_data[key]["Interactions"].keys()}") + for interaction_key in scenario_data[key]["Interactions"].keys(): + scenario_domain_problem_data[current_scenario_id].setdefault("Interactions", {}) + scenario_domain_problem_data[current_scenario_id]["Interactions"].setdefault(interaction_key, { + "problem_data": "", + "answer_data": "" + }) + scenario_domain_problem_data[current_scenario_id]["Interactions"][interaction_key]["problem_data"] = scenario_data[key]["Interactions"][interaction_key]["reference_question"] + scenario_domain_problem_data[current_scenario_id]["Interactions"][interaction_key]["answer_data"] = scenario_data[key]["Interactions"][interaction_key]["reference_answer"] + + return scenario_domain_problem_data \ No newline at end of file diff --git a/pddl_gen.py b/pddl_gen.py new file mode 100644 index 0000000..f187cd6 --- /dev/null +++ b/pddl_gen.py @@ -0,0 +1,276 @@ +# Script to define functions that return pddl prompts +from client_model_setup import ProvidedLLM +from pathlib import Path +import pddlpy +import json + 
class PDDLGen():
    """LLM-driven generator of PDDL domain/problem files for one scenario.

    Wraps a chat client + model pair and produces, via staged prompts:
    action descriptions, a PDDL domain file, a (two-pass refined) PDDL
    problem file, and an LLM-as-judge evaluation JSON. Generated artifacts
    are written under ``apla-planner/generated_pddls_deepseek/dataset/``.
    """

    def __init__(self, client, model):
        # ProvidedLLM supplies the actual llm_call wrapper; client/model stay
        # fixed for the lifetime of this generator.
        self.provided_llm = ProvidedLLM()
        self.pddl_domain = ""
        self.pddl_problem = ""
        self.client = client
        self.model = model

    def llm_call(self, prompt, dictionary_mode=False, output_thoughts=False):
        """Run one LLM call for *prompt*.

        Returns the model's thoughts when ``output_thoughts`` is set, a parsed
        dict when ``dictionary_mode`` is set (the prompt asked for a Python
        dict literal), and the raw text otherwise.
        """
        output_, thoughts = self.provided_llm.llm_call(client=self.client, model=self.model, prompt=prompt)
        if output_thoughts:
            return thoughts
        if dictionary_mode:
            # Security fix: the LLM output is untrusted input, so parse it as
            # a literal instead of eval()-ing it, which would execute
            # arbitrary code returned by the model. literal_eval accepts the
            # same dict-literal shape the prompts request.
            import ast
            return ast.literal_eval(output_)
        return output_

    def write_pddls(self, write_domain=False, write_problem=False,
                    scenario_id="", interaction_id="",
                    domain_info="",
                    problem_info=""):
        """Persist a generated domain or problem PDDL file to disk.

        Returns True when a domain write was skipped because the scenario's
        domain folder already exists (regeneration guard), False otherwise.
        """
        attempted_overwrite = False
        dir_path_text = "apla-planner/generated_pddls_deepseek/dataset/domains/" + scenario_id
        if write_domain:
            pddl_domain_path = dir_path_text + "/domain_deepseek_chat_" + scenario_id + ".pddl"
            try:
                # mkdir() WITHOUT exist_ok is deliberate: an existing folder
                # means the domain was generated before, so we skip it.
                # NOTE(review): a folder left behind without its .pddl file
                # would also be skipped - confirm that is acceptable.
                Path(dir_path_text).mkdir()
                with open(pddl_domain_path, "w", encoding='utf-8') as file:
                    file.write(domain_info)
            except FileExistsError:
                print(f"""
                Attempted domain file overwrite for scenario id {scenario_id}.
                This is to reduce repetition in PDDL generation. If you want to regenerate domain files for this scenario,
                please delete the domains folder for this scenario first.

                Skipping PDDL gen for scenario id {scenario_id}
                """)
                attempted_overwrite = True
        elif write_problem:
            dir_path_text_problem = "apla-planner/generated_pddls_deepseek/dataset/problems/" + scenario_id
            pddl_problem_path = dir_path_text_problem + "/problem_deepseek_chat_" + interaction_id + ".pddl"
            print("PDDL problem path is {}".format(pddl_problem_path))
            # Problem files may always be (re)written; only the folder
            # creation must tolerate an existing directory (replaces the old
            # try/except FileExistsError with a duplicated write).
            Path(dir_path_text_problem).mkdir(exist_ok=True)
            with open(pddl_problem_path, "w", encoding='utf-8') as file:
                file.write(problem_info)
        return attempted_overwrite

    def generate_action_prompt(self, scenario_domain_problem_data_context):
        """Build the prompt asking for natural-language action descriptions."""
        # NOTE(review): the output-format template below appears to have lost
        # its angle-bracket placeholders / braces in transit - confirm against
        # the originally authored prompt before relying on the exact text.
        action_prompt = f"""
        Based on the information detailed in {scenario_domain_problem_data_context},
        * Write down a list of actions that map between states in natural language.
        * Each action has some causal states (predicates) and some effect states that will be true or false.
        * Each action has a strong logical connection between any number of causal states and any number of effect states.
        * States in an action description must not contradict each other.
        * Action names must be descriptive and the action can be understood just by looking at the name.
        * The state names within each action are also descriptive. The cause and effect statements and the state names must have the same information.
        * There must be separate states regarding the environment, ego and the respective surrounding agents.
        * In each action and state, the ego agent or the surrounding agent must be identified as or or as needed.
        * For distances, positions and speeds do not use specific numbers but words instead such as front, left, right, near, far, fast, slow, medium (or combinations such as front-left and so on) or other similar descriptive words.
        * The action itself will only become true when the causal states and the effect states are in the specific states that this description details.
        * Write them in the following format:

        "":

        "":
        "statement": "
        "value": ,
        "state_type":
        ,
        "":
        "statement": "
        "value": ,
        "state_type":

        ,
        ...


        No json tags to be used. Just the dictionary in the output. Nothing else, nothing else, nothing else.
        """
        return action_prompt

    def generate_domain_prompt(self, scenario_domain_problem_data_context, generated_actions):
        """Build the prompt asking for a PDDL domain file from the actions."""
        domain_prompt = f"""
        I have an autonomous vehicle test scenario described here: {scenario_domain_problem_data_context}. I want you to formalize the
        information to more explicitly write the logic regarding the various driving behaviors in this information. Regarding this information, I first generated
        action descriptions here: {generated_actions}

        Now for these action descriptions, please generate a PDDL (Planning Domain Definition Language) domain file. I only want the contents that would be in
        such a file, no other information in your writing. Keep in mind that this content will be entered into a file with a .pddl extension and saved.
        Please ensure that all the generated states and actions are absolutely correct with respect to the given information.

        Please ensure that everything is very clear and correct. Please make use of good names that are readable. Please check and double check your work before writing.
        No pddl, lisp or any other tags to be used. Just the pddl lines in the output. Please do not write anything else other than precisely what has been asked.
        """
        return domain_prompt

    def generate_pddl_domain(self, scenario_domain_problem_data_context, scenario_id):
        """Generate actions then a PDDL domain; write it to disk.

        Returns (domain_text, attempted_overwrite, action_json).
        """
        action_prompt = self.generate_action_prompt(scenario_domain_problem_data_context=scenario_domain_problem_data_context)
        action_json = self.llm_call(action_prompt)
        print("Action json is {}".format(action_json))
        domain_prompt = self.generate_domain_prompt(scenario_domain_problem_data_context=scenario_domain_problem_data_context,
                                                    generated_actions=action_json)
        self.pddl_domain = self.llm_call(domain_prompt)
        print("PDDL domain is {}".format(self.pddl_domain))
        attempted_overwrite = self.write_pddls(write_domain=True, domain_info=self.pddl_domain, scenario_id=scenario_id)
        return self.pddl_domain, attempted_overwrite, action_json

    def generate_initial_problem_prompt(self, scenario_domain_problem_data_context,
                                        generated_actions,
                                        scenario_domain_problem_data_problem_data, domain):
        """Build the first-pass PDDL problem-generation prompt."""
        problem_initial_prompt = f"""
        I have an autonomous vehicle test scenario described here: {scenario_domain_problem_data_context}.

        I wanted you to formalize the information to more explicitly and write the logic regarding driving behaviors contained in this information.
        Regarding this information, I first generated action descriptions here:
        {generated_actions}.

        From all of this information, I generated the following PDDL (Planning Domain Definition Language) Domain model here:
        {domain}.

        In addition to everything else above, I have some more pertinent information here regarding the PDDL problem corresponding to the PDDL domain above:
        {scenario_domain_problem_data_problem_data}

        First, please repeat the types, states (predicates) and actions in this file in your mind.
        Then think step by step about a PDDL problem for this PDDL domain. Please think about whether this problem does indeed have a solution. In other words, whether a plan exists for this problem.
        Now, please generate a PDDL (Planning Domain Definition Language) problem file. I only want the contents that would be in
        such a file, no other information in your writing. Keep in mind that this content will be entered into a file with a .pddl extension and saved so no extra information should be contained.

        No pddl, lisp or any other tags to be used. Just the pddl lines in the output. No tags. No tags. No tags.
        """
        return problem_initial_prompt

    def generate_final_problem_prompt(self,
                                      scenario_domain_problem_data_context,
                                      generated_actions,
                                      domain,
                                      scenario_domain_problem_data_problem_data,
                                      initial_problem):
        """Build the refinement prompt that reviews the first-pass problem."""
        final_problem_prompt = f"""
        I have an autonomous vehicle test scenario described here: {scenario_domain_problem_data_context}.

        I want you to formalize the information to more explicitly write the logic regarding the various driving behaviors in this information. Regarding this information, I first generated
        action descriptions here: {generated_actions}. Then I generated the following PDDL (Planning Domain Definition Language) Domain model here: {domain}.

        In addition to everything else above, regarding the PDDL problem file, I have some pertinent information here: {scenario_domain_problem_data_problem_data}

        From all of this information, I generated the PDDL problem file. Carefully read this PDDL problem file:
        {initial_problem}.

        Please consider all the information above and generate a refined, correct and better quality PDDL problem file. Thank you!

        Again, I only want the contents that would be in such a file, no other information in your writing. Keep in mind that this content will be entered into a file with a .pddl extension and saved so no extra information should be contained.

        No pddl, lisp or any other tags to be used. Just the pddl lines in the output. No tags. No tags. No tags.
        """
        return final_problem_prompt

    def generate_pddl_problem(self, scenario_domain_problem_data_context, generated_actions, domain,
                              scenario_problem_data, scenario_id, interaction_id):
        """Two-pass problem generation (draft then refine); writes the result.

        Returns the final PDDL problem text.
        """
        initial_problem_prompt = self.generate_initial_problem_prompt(scenario_domain_problem_data_context=scenario_domain_problem_data_context,
                                                                      generated_actions=generated_actions,
                                                                      scenario_domain_problem_data_problem_data=scenario_problem_data,
                                                                      domain=domain)
        initial_pddl_problem = self.llm_call(initial_problem_prompt)
        final_problem_prompt = self.generate_final_problem_prompt(scenario_domain_problem_data_context,
                                                                  generated_actions,
                                                                  domain,
                                                                  scenario_domain_problem_data_problem_data=scenario_problem_data,
                                                                  initial_problem=initial_pddl_problem)
        self.pddl_problem = self.llm_call(final_problem_prompt)
        print("PDDL problem is {}".format(self.pddl_problem))
        self.write_pddls(write_problem=True, scenario_id=scenario_id, problem_info=self.pddl_problem, interaction_id=interaction_id)
        return self.pddl_problem

    def generate_llm_pddl_judge_prompt(self,
                                       scenario_domain_problem_data_context,
                                       domain,
                                       scenario_domain_problem_data_problem_data,
                                       problem_final):
        """Build the LLM-as-judge grading prompt for the generated PDDL pair."""
        llm_pddl_judge_prompt = f"""
        First, read the context information for the given scenario:
        {scenario_domain_problem_data_context}

        Now, carefully read the generated domain file:
        {domain}

        Now, carefully review the problem data in the scenario:
        {scenario_domain_problem_data_problem_data}

        Carefully read this PDDL problem file:
        {problem_final}.

        Now score the generated domain and problem PDDL files according to the given rubric:

        1. Consistency: Are the facts in the context information above consistently and correctly presented in the domain and problem files? Rate this output on a scale of 1 to 10. Explain your rating.
        2. Domain coverage: Does the generated domain PDDL domain file adequately cover the information in the context above? Rate this output on a scale of 1 to 10. Explain your rating.
        3. Problem coverage: Does the generated problem PDDL file adequately cover the given problem data as presented above? The problem data asks specific questions with respect to the context.
        Therefore, you must rate the coverage with respect to this specific question only. Rate this output on a scale of 1 to 10. Explain your rating.

        Format your output exactly in the following manner:

        "Consistency":

        "Score explanation": "",
        "Grade": ""
        ,
        "Domain coverage":

        "Score explanation": "",
        "Grade": ""
        ,
        "Problem coverage":

        "Problem data provided": ""
        "Score explanation": "",
        "Grade": ""



        No tags. Just the dictionary in the output. Nothing else, nothing else.
        """
        return llm_pddl_judge_prompt

    def generate_llm_eval(self, scenario_domain_problem_data_context="",
                          domain="",
                          scenario_domain_problem_data_problem_data="",
                          problem_final="",
                          scenario_id="",
                          interaction_id=""):
        """Grade the generated PDDL pair with an LLM judge and write the JSON.

        Also records sentence-complexity stats of the context and structural
        stats (action count, initial-state size) parsed from the files on disk.
        Returns the evaluation dict.
        """
        # Fixed inconsistency: the old code ignored the domain/problem_final
        # parameters and always used the instance attributes. Fall back to the
        # attributes only when the caller did not pass explicit text, keeping
        # backward compatibility with both calling styles.
        domain = domain or self.pddl_domain
        problem_final = problem_final or self.pddl_problem
        llm_pddl_judge_prompt = self.generate_llm_pddl_judge_prompt(scenario_domain_problem_data_context=scenario_domain_problem_data_context,
                                                                    domain=domain,
                                                                    scenario_domain_problem_data_problem_data=scenario_domain_problem_data_problem_data,
                                                                    problem_final=problem_final)
        llm_eval = self.llm_call(llm_pddl_judge_prompt, dictionary_mode=True)
        # Each sentence in the scenario context pertains to a fact.
        # We can split the context by sentence and count the word count per sentence to get a sense of how difficult the facts are.
        # Longer individual sentences would mean more complex facts.
        context_sentence_list = scenario_domain_problem_data_context.split(". ")
        total_word_count_sentence = 0
        for sentence in context_sentence_list:
            total_word_count_sentence += len(sentence.split())

        average_word_count_sentence = int(total_word_count_sentence / len(context_sentence_list))
        llm_eval.setdefault("average_context_sentence_word_count", average_word_count_sentence)
        llm_eval.setdefault("total_word_count", total_word_count_sentence)

        # Parse the files we just wrote to extract structural statistics.
        domain_problem_files = pddlpy.DomainProblem("apla-planner/generated_pddls_deepseek/dataset/domains/" + scenario_id + "/domain_deepseek_chat_" + scenario_id + ".pddl",
                                                    "apla-planner/generated_pddls_deepseek/dataset/problems/" + scenario_id + "/problem_deepseek_chat_" + interaction_id + ".pddl")
        llm_eval.setdefault("domain_action_count", len(list(domain_problem_files.operators())))  # Actions written in the domain.
        llm_eval.setdefault("initial_state_size", len(domain_problem_files.initialstate()))  # Initial state in the problem file.

        with open("apla-planner/generated_pddls_deepseek/dataset/problems/" + scenario_id + "/LLM_eval_" + interaction_id + ".json", "w", encoding='utf-8') as file_eval:
            json.dump(llm_eval, file_eval, indent=4)
        return llm_eval
+provided_llm = ProvidedLLM() def retrieve_womdr_domain_problem_data(): @@ -25,6 +27,10 @@ def retrieve_womdr_domain_problem_data(): "Context": "" }) scenario_domain_problem_data[i[:-5]]["Context"] = scenario_data[key]["Context"] + scenario_domain_problem_data[i[:-5]]["Word Count"] = scenario_data[key]["Word Count"] + print(f""" + number of interactions in this scenario are {scenario_data[key]["Interactions"].keys()} + """) for interaction_key in scenario_data[key]["Interactions"].keys(): scenario_domain_problem_data[i[:-5]].setdefault("Interactions", {}) scenario_domain_problem_data[i[:-5]]["Interactions"].setdefault(interaction_key, { @@ -36,200 +42,50 @@ def retrieve_womdr_domain_problem_data(): return scenario_domain_problem_data -def resolve_client_and_model(api_type, model_name): - # API_type parameter must be from the following names: - # 1. ds_api - # 2. deepinfra_api - # 3. oai_api - - # For ds models, model names should be from the following: - # 1. ds_v3_dsapi - # 2. ds_r1_dsapi - - # For deepinfra models, model names should be from the following: - # 1. ds_v3, - # 2. llama_33_70b - # 3. llama_31_405b - # 4. qw_25_72b - # 5. ds_distil_llama_70b - # 6. gemma_2 - # 7. llama_31_8b - # 8. qw_25_7b - # 9. phi_4 - - # For OpenAI models, model names should be from the following: - # 1. gpt_4o_mini - # 2. 
o3_mini - - if api_type=="ds_api": - client = provided_llm.client_dsapi - if model_name=="ds_v3_dsapi": - selected_model = provided_llm.ds_v3_dsapi - elif model_name=="ds_r1_dsapi": - selected_model = provided_llm.ds_r1_dsapi - else: - print("Model name is incompatible with DS api or invalid") - elif api_type=="deepinfra_api": - client = provided_llm.client_deepinfra - if model_name=="ds_v3": - selected_model = provided_llm.ds_v3 - elif model_name=="llama_33_70b": - selected_model = provided_llm.llama_33_70b - elif model_name=="ds_distil_llama_70b": - selected_model = provided_llm.ds_distil_llama_70b - else: - print("model name either incompatible with DeepInfra API or invalid.") - elif api_type=="oai_api": - client = provided_llm.client_oai - selected_model = provided_llm.gpt_4o_mini - else: - print("API type invalid") - - return client, selected_model - -def generate_pddl_with_syntax_check(api_type, model_name): - client, selected_model = resolve_client_and_model(api_type=api_type, model_name=model_name) +# Client and model type are resolved in the function call itself and remain fixed throughout +def generate_pddl_with_syntax_check(client, model): scenario_domain_problem_data = retrieve_womdr_domain_problem_data() for id in tqdm(scenario_domain_problem_data.keys()): - print("\nDomain generation, generating action suggestions....\n") - response_action_json = client.chat.completions.create( - model=selected_model, - messages=[ - {"role": "user", "content": f""" - - Based on the information detailed in {scenario_domain_problem_data[id]["Context"]}, - * Write down a list of actions that map between states in natural language. - * Each action has some causal states (predicates) and some effect states that will be true or false. - * Each action is a cause and effect mapping between any number of causal states and any number of effect states. - * Actions and states must not contradict each other. 
- * Action names must be descriptive and the action can be understood just by looking at the name. - * The state names within each action are also descriptive. The cause and effect statements and the state names must have the same information. - * There must be separate states regarding the environment, ego and the respective surrounding agents. - * In each action and state, the ego agent or the surrounding agent must be identified as or or as needed. - * For distances, positions and speeds do not use specific numbers but words instead such as front, left, right, near, far, fast, slow, medium (or combinations such as front-left and so on) or other similar descriptive words. - * The action itself will only become true when the causal states and the effect states are in the specific states that this description details. - * Write them in the following format: - - "": - - "": - "statement": " - "value": , - "state_type": - , - "": - "statement": " - "value": , - "state_type": - - , - ... - - - No json tags to be used. Just the dictionary in the output. Nothing else, nothing else, nothing else. - """}, - ], - stream=False - ) + val_error_detected = False + blocklist_ids = [] + with open("apla-planner/generated_pddls_deepseek/dataset/blocklist.json", 'r') as block_file: + blocklist_ids = json.load(block_file) + if blocklist_ids.count(id)>0: continue # Scenario ID detected in blocklist, continue + + # Client and model fixed for all PDDL generation. + pddl_gen = PDDLGen(client=client, model=model) print(f"\nDomain generation, generating domain file for scenario id {id}....\n") - response_domain_initial = client.chat.completions.create( - model=selected_model, - messages=[ - {"role": "user", "content": f""" - We need you to write specific driving behaviors to accomplish certain goals. A behavior is defined as actions taken in response to certain conditions. Conditions are provided as an environment state. - Think about some states by yourself that you believe is necessary. 
- Vehicles navigate in the action space and the state space provided to them. - - Now generate a PDDL domain file for the scenario: {response_action_json.choices[0].message.content}. Domain file only only only for now. - Think about the STRIPS PDDL for different popular domains such as gripper and sokoban. - Verify whether all the suggested states and actions makes sense and are correct. - If it feels correct, write it down as a PDDL domain file. I only only want the PDDL domain file contents. - - Please keep things really clear. Do not repeat names. Do not repeat names. Do not redefine anything. Ensure that everything is very very clear and correct. Check and double check correctness. - Do not write anything else other than what is asked. Only Only Only write what has been asked. No tags of any sort. Only pure PDDL. Only write what has been asked. Only write what has been asked. - Nothing other than pure PDDL as asked. Nothing other than pure PDDL as asked. Please make sure it is correct. - Do not write ```pddl or ``` or the corresponding closing tags since I'm going to parse these outputs. - - I repeat, do not write ```pddl or ``` or ```lisp or the corresponding closing tags since I'm going to parse these outputs. - I repeat again, do not write ```pddl or ``` or ```lisp or the corresponding closing tags since I'm going to parse these outputs. - """}, - ], - stream=False - ) - - dir_path_text = "apla-planner/generated_pddls_deepseek/dataset/domains/"+id - try: - dir_path = Path(dir_path_text) - dir_path.mkdir() - with open(dir_path_text+"/domain_deepseek_chat_"+id+".pddl", "w", encoding='utf-8') as file: - file.write(response_domain_initial.choices[0].message.content) # We want to read the article as a single string, so that we can feed it to gpt. 
- file.close() - except FileExistsError: - with open(dir_path_text+"/domain_deepseek_chat_"+id+".pddl", "w", encoding='utf-8') as file: - file.write(response_domain_initial.choices[0].message.content) # We want to read the article as a single string, so that we can feed it to gpt. - file.close() + scenario_domain_problem_data_context = scenario_domain_problem_data[id]["Context"] + domain, attempted_overwrite, generated_actions = pddl_gen.generate_pddl_domain(scenario_domain_problem_data_context=scenario_domain_problem_data_context, + scenario_id=id) + print(domain) + print(generated_actions) + if attempted_overwrite==True: continue # Attempted PDDL domain overwrite, move on to the next scenario. # Given one domain file based on a context, generate multiple problem files. for interaction_id in tqdm(scenario_domain_problem_data[id]["Interactions"].keys()): + if val_error_detected: break print(f"\nProblem generation, generating problem file for interaction {interaction_id}....\n") - response_problem_initial = client.chat.completions.create( - model=selected_model, - messages=[ - {"role": "user", "content": f""" - Now carefully write the PDDL problem file for the corresponding domain file provided: - {response_domain_initial.choices[0].message.content}. - - Consider in addition some problem specific data: {scenario_domain_problem_data[id]["Interactions"][interaction_id]["problem_data"]} - First repeat the types, states (predicates) and actions in this file as a list in natural language. - Then think step by step about a problem for this domain. Think about whether this problem does indeed have a solution plan. - Double check that everything is clear and it does in fact have a solution. Then write the PDDL problem file contents. I only want the problem file contents. - Do not repeat names. Do not repeat names. Only the problem file contents nothing more. Only the problem file contents nothing more. I'm pasting this in a pddl problem file just letting you know. 
- Do not write anything else other than what is asked. Only Only Only write what has been asked. Only write pure PDDL as asked. - Only write pure PDDL as asked. Only write pure PDDL as asked. - - Do not write ```pddl or ``` or ```lisp or the corresponding closing tags since I'm going to parse these outputs. - """}, - ], - stream=False - ) - - print("\nProblem generation, reviewing and updating problem file....\n") - response_problem_final = client.chat.completions.create( - model=selected_model, - messages=[ - {"role": "user", "content": f""" - Carefully read this PDDL problem file: - {response_problem_initial.choices[0].message.content}. - - It is really important that the ```pddl or ``` or ```lisp opening tags - or the corresponding closing tags do not exist. Do these tags exist in the given PDDL problem file? - Do not write your answer in the output. But if the answer is yes, can you remove the lines with these tags - and rewrite the rest of the PDDL file exactly as it is? The lines with the tags should definitely not be there in the final output. - If the answer is no however, please rewrite the file exactly as it is. Thank you! - - Again, remember that the final output should only have lines of PDDL as instructed above, nothing else, nothing else, nothing else. - """}, - ], - stream=False - ) - - dir_path_text_problem = "apla-planner/generated_pddls_deepseek/dataset/problems/"+id - try: - # Try creating folder if it doesn't exist. Create only file if it does. - dir_path_problem = Path(dir_path_text_problem) - dir_path_problem.mkdir() - with open(dir_path_text_problem+"/problem_deepseek_chat_"+interaction_id+".pddl", "w", encoding='utf-8') as file: - file.write(response_problem_final.choices[0].message.content) # We want to read the article as a single string, so that we can feed it to gpt. 
- file.close() - except FileExistsError: - with open(dir_path_text_problem+"/problem_deepseek_chat_"+interaction_id+".pddl", "w", encoding='utf-8') as file: - file.write(response_problem_final.choices[0].message.content) # We want to read the article as a single string, so that we can feed it to gpt. - file.close() - - + scenario_domain_problem_problem_data = scenario_domain_problem_data[id]["Interactions"][interaction_id]["problem_data"] + problem = pddl_gen.generate_pddl_problem(domain=domain, + scenario_domain_problem_data_context=scenario_domain_problem_data_context, + generated_actions=generated_actions, + scenario_problem_data=scenario_domain_problem_problem_data, + scenario_id=id, + interaction_id=interaction_id) + print(problem) # Take each domain and problem file pair and run val through it, write it to the corresponding text file. output_val_deepseek_chat = subprocess.run(["Parser", "apla-planner/generated_pddls_deepseek/dataset/domains/"+id+"/domain_deepseek_chat_"+id+".pddl", "apla-planner/generated_pddls_deepseek/dataset/problems/"+id+"/problem_deepseek_chat_"+interaction_id+".pddl"], stdout=subprocess.PIPE).stdout string_output_round2 = str(output_val_deepseek_chat, encoding='utf-8') + if string_output_round2.find("Errors: 0,")==-1: + val_error_detected = True + print("\nOh no val error detected!\n") + blocklist_ids.append(id) + with open("apla-planner/generated_pddls_deepseek/dataset/blocklist.json", 'w') as block_file: + json.dump(blocklist_ids, block_file, indent=4) + block_file.close() + print("\nAdding scenario to blocklist\n") + break #Exit the for loop for this set of interactions. with open("apla-planner/generated_pddls_deepseek/dataset/problems/"+id+"/val_output_"+interaction_id+".txt", "w", encoding='utf-8') as file: file.write(string_output_round2) # We want to read the article as a single string, so that we can feed it to gpt. 
file.close() @@ -309,78 +165,22 @@ def generate_pddl_with_syntax_check(api_type, model_name): # file.close() print("\nLLM grading for PDDL file generation....\n") - response_LLM_judgement = client.chat.completions.create( - model=selected_model, - messages=[ - {"role": "user", "content": f""" - First, read the context information for the given scenario: - {scenario_domain_problem_data[id]["Context"]} - - Now, carefully read the generated domain file: - {response_domain_initial.choices[0].message.content} - - Now, carefully review the problem data in the scenario: - {scenario_domain_problem_data[id]["Interactions"][interaction_id]["problem_data"]} - - Carefully read this PDDL problem file: - {response_problem_final.choices[0].message.content}. - - Now score the generated domain and problem PDDL files according to the given rubric: - - 1. Consistency: Are the facts in the context information above consistently and correctly presented in the domain and problem files? Rate this output on a scale of 1 to 10. Explain your rating. - 2. Domain coverage: Does the generated domain PDDL domain file adequately cover the information in the context above? Rate this output on a scale of 1 to 10. Explain your rating. - 3. Problem coverage: Does the generated problem PDDL file adequately cover the given problem data as presented above? The problem data asks specific questions with respect to the context. - Therefore, you must rate the coverage with respect to this specific question only. Rate this output on a scale of 1 to 10. Explain your rating. - - Format your output exactly in the following manner: - - "Context": "", - "Consistency": - - "Score explanation": "", - "Grade": "" - , - "Domain coverage": - - "Score explanation": "", - "Grade": "" - , - "Problem coverage": - - "Problem data provided": "" - "Score explanation": "", - "Grade": "" - - - - No tags. Just the dictionary in the output. Nothing else, nothing else. 
- """}, - ], - stream=False - ) - - LLM_eval_dictionary = eval(response_LLM_judgement.choices[0].message.content) - # Each sentence in the scenario context pertains to a fact. - # We can split the context by sentence and count the word count per sentence to get a sense of how difficult the facts are. - # Longer individual sentences would mean more complex facts. - context_sentence_list = scenario_domain_problem_data[id]["Context"].split(". ") - total_word_count_sentence = 0 - for sentence_index in range(len(context_sentence_list)): - total_word_count_sentence += len(context_sentence_list[sentence_index].split()) - - average_word_count_sentence = total_word_count_sentence / len(context_sentence_list) + llm_eval = pddl_gen.generate_llm_eval(scenario_domain_problem_data_problem_data=scenario_domain_problem_problem_data, + domain=domain, + problem_final=problem, + scenario_id=id, + interaction_id = interaction_id, + scenario_domain_problem_data_context=scenario_domain_problem_data_context) - LLM_eval_dictionary.setdefault("average_context_sentence_word_count", average_word_count_sentence) - - domain_problem_files = pddlpy.DomainProblem("apla-planner/generated_pddls_deepseek/dataset/domains/"+id+"/domain_deepseek_chat_"+id+".pddl", - "apla-planner/generated_pddls_deepseek/dataset/problems/"+id+"/problem_deepseek_chat_"+interaction_id+".pddl") - LLM_eval_dictionary.setdefault("domain_action_count", len(list(domain_problem_files.operators()))) # List of actions written in the domain. - LLM_eval_dictionary.setdefault("initial_state_size", len(domain_problem_files.initialstate())) # Initial state in the problem file. - - with open(dir_path_text_problem+"/LLM_eval_"+interaction_id+".json", "w", encoding='utf-8') as file_eval: - json.dump(LLM_eval_dictionary, file_eval, indent=4) # We want to read the article as a single string, so that we can feed it to gpt. - file.close() print(f"\nPDDL problem generation complete for interaction with id {interaction_id}. 
Progress with interactions shown below\n") + + if val_error_detected==True: + #Delete the domain folder for this scenario id completely. + delete_path_domain = "apla-planner/generated_pddls_deepseek/dataset/domains/"+id + shutil.rmtree(delete_path_domain) + delete_path_problem = "apla-planner/generated_pddls_deepseek/dataset/problems/"+id + shutil.rmtree(delete_path_problem) + continue # Move on to the next scenario id print(f"\nPDDL generation complete for scenario with id {id}. Progress with scenarios shown below\n") def pddl_response_and_answer_questions(): diff --git a/run_experiments.py b/run_experiments.py index 2abb277..aecea0c 100644 --- a/run_experiments.py +++ b/run_experiments.py @@ -3,18 +3,23 @@ # the wsl home ~ folder as stated in this project's README. import subprocess +from client_model_setup import ProvidedLLM import parse_scenario_womd import planner import llm_qa +import sys + print("Running the data preprocessing .... \nThe data will be in the parsed_womdr_data dictionary.") -parse_scenario_womd.obtain_and_write_mcq_data(35, 36) # Take these two arguments via argparse or config +parse_scenario_womd.obtain_and_write_data(125, 126) # Take these two arguments via argparse or config print("Completed data preprocessing!\n") print("Running the PDDL file generation...\nThe domains and problem files will get saved in the apla-planner/generated_pddls_deepseek path within the domains and problems folder.......") -api_type = "ds_api" -model_name = "ds_v3_dsapi" # Take these two arguments via argparse -planner.generate_pddl_with_syntax_check(api_type, model_name) + +provided_llm = ProvidedLLM() +client = provided_llm.client_dsapi +model = provided_llm.ds_v3_dsapi +planner.generate_pddl_with_syntax_check(client, model) print("PDDL problem generation has been completed!\n") print("Running the planner within WSL... \n") @@ -22,6 +27,6 @@ print("Plan generation has been completed!\n") print("Running the LLM evaluations ... 
\nThe results will be in the grades folder.") -llm_qa.main() +llm_qa.exp_run() print("LLM evaluations have been completed!") diff --git a/run_llm_qa.py b/run_llm_qa.py new file mode 100644 index 0000000..979afe2 --- /dev/null +++ b/run_llm_qa.py @@ -0,0 +1,5 @@ +import llm_qa + +print("Running the LLM evaluations ... \nThe results will be in the grades folder.") +llm_qa.exp_run() +print("LLM evaluations have been completed!") \ No newline at end of file diff --git a/run_parser.py b/run_parser.py new file mode 100644 index 0000000..6432013 --- /dev/null +++ b/run_parser.py @@ -0,0 +1,5 @@ +import parse_scenario_womd + +# Ensure the parse folder and the respective PDDL folders have been deleted before starting this experiment run. +# Current experiment run plan - scenario indices (52, 53), (110, 120), (239, 244), (17, 22) and (84, 88) +parse_scenario_womd.obtain_and_write_data(84, 85) \ No newline at end of file diff --git a/run_planner.py b/run_planner.py new file mode 100644 index 0000000..c8bf05e --- /dev/null +++ b/run_planner.py @@ -0,0 +1,10 @@ +import planner +from client_model_setup import ProvidedLLM + +print("Running the PDDL file generation...\nThe domains and problem files will get saved in the apla-planner/generated_pddls_deepseek path within the domains and problems folder.......") + +provided_llm = ProvidedLLM() +client = provided_llm.client_oai +model = provided_llm.gpt_45 +planner.generate_pddl_with_syntax_check(client, model) +print("PDDL problem generation has been completed!\n") \ No newline at end of file diff --git a/search_plan.py b/search_plan.py new file mode 100644 index 0000000..2cf3218 --- /dev/null +++ b/search_plan.py @@ -0,0 +1,5 @@ +import subprocess + +print("Running the planner within WSL... 
\n") +subprocess.run(["wsl", "-e", "bash", "-ic", "cd apla-planner/generated_pddls_deepseek ; python planner_test.py"], stdout=subprocess.PIPE).stdout +print("Plan generation has been completed!\n") \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..f169c48 --- /dev/null +++ b/test.py @@ -0,0 +1,37 @@ +# Removal of think tags in output. Test on Deepseek R1 Distill Llama 70B +from client_model_setup import ProvidedLLM +import re + +llm = ProvidedLLM() + +# Setup the client and the model +client = llm.client_deepinfra +model = llm.ds_distil_llama_70b +model_r1_deepinfra = llm.ds_r1 + +client_ds = llm.client_dsapi +model_r1 = llm.ds_r1_dsapi + +prompt = """ +Hi do you have advice for grad school in computer science? Format your response in the following manner: + +"advice": + +"": "", +... +... + + +Your response needs to strictly be in this format. Please do not write anything else outside this format. +""" +output, thoughts = llm.thinking_llm_call(client=client, model=model_r1_deepinfra, prompt=prompt) + +# separated_string = re.split(r"()", output) +# separated_string_thoughts = re.split(r"()", separated_string[0]) +# separated_string_output = separated_string[1] +# separated_string_thoughts = separated_string_thoughts[1] + +print("\n The thoughts are\n") +print(thoughts) +print("\nThe actual output is\n") +print(output) diff --git a/test.xodr b/test.xodr new file mode 100644 index 0000000..e7a39ed --- /dev/null +++ b/test.xodr @@ -0,0 +1,269 @@ + + +
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+
+
+ + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+
+
+ + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+
+
+ + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+
+
+ + + + + + + + + + + + + + +
+
+ + + + + + + + + + +
+
+ + + + + + + + + + +
+
+ + + + + + + + + + +
+
+ + + + + + + + + + + +
+
+ + + + + + + + + + + +
+
+ + + + + + + + + + + +
+
+ + + + + + + + + + + +
+
+ +
\ No newline at end of file diff --git a/test_deepinfra.py b/test_deepinfra.py new file mode 100644 index 0000000..b021c1a --- /dev/null +++ b/test_deepinfra.py @@ -0,0 +1,16 @@ +from openai import OpenAI +import os + +# Create an OpenAI client with your deepinfra token and endpoint +openai = OpenAI( + api_key=os.environ["DEEPINFRA_API_KEY"], + base_url="https://api.deepinfra.com/v1/openai", +) + +chat_completion = openai.chat.completions.create( + model="deepseek-ai/DeepSeek-R1", + messages=[{"role": "user", "content": "Hello"}], +) + +print(chat_completion.choices[0].message.content) +print(chat_completion.usage.prompt_tokens, chat_completion.usage.completion_tokens) \ No newline at end of file