Changes from all commits (49 commits)
78c94b1  first (dataei, Feb 20, 2025)
e14c62d  k (dataei, Feb 20, 2025)
6810c58  merge conflicts (Feb 26, 2025)
7035d96  merge conflicts (Feb 26, 2025)
c962628  comment test (Feb 27, 2025)
9d54926  first (dataei, Feb 20, 2025)
0792497  k (dataei, Feb 20, 2025)
bdd9789  merge conflicts (Feb 26, 2025)
e6735bb  merge conflicts (Feb 26, 2025)
cea730b  comment test (Feb 27, 2025)
b963219  first (dataei, Feb 20, 2025)
cea0646  modifying to deepinfra instead of deepseek and changed file path to t… (Feb 27, 2025)
97bbf86  debugging with print statements, planning direct prompting variables … (Feb 28, 2025)
9f1cefe  added 2,4,6,8 shot direct prompting and moved writing script out of f… (Feb 28, 2025)
a60c014  Merge branch 'dev_denise' of https://github.com/AugmentedDesignLab/Ca… (Feb 28, 2025)
f893f7e  Modifications to model names and corresponding evals can now be compl… (ishaan95, Feb 28, 2025)
fca4c75  added 2,4,6,8 shot direct prompting and moved writing script out of f… (Feb 28, 2025)
faf4d9b  syntax conflict (Mar 3, 2025)
70e49e9  llm_qa_direct_only: added to model dictionary, incorporated CoT promp… (Mar 8, 2025)
6cab612  ran more experiments to find pattern in small, medium, large file siz… (Mar 23, 2025)
6720e73  in parse_scenario_womd: modified womd preprocessing syntax to more ac… (Mar 25, 2025)
8ab5c32  modified parse scenario script to omit first file in script. index (5… (Mar 25, 2025)
5e5df63  latest (ishaan95, Mar 26, 2025)
e4bc684  merge dev_denise with dev_Ishaan (ishaan95, Mar 26, 2025)
a28cb92  Removing planner import (ishaan95, Mar 26, 2025)
295ad3d  incorporated word count into planner.py, drafting automation for corr… (Mar 27, 2025)
caca3a9  renaming of folders for better organization between examples ran for … (Mar 27, 2025)
19717bc  add grades folder to gitignore (ishaan95, Apr 8, 2025)
1626387  merge conflict resolution (ishaan95, Apr 8, 2025)
4e1e3ac  merge conflict resolution (ishaan95, Apr 8, 2025)
9abc262  Calculate the most similar scenarios to a given index (ishaan95, Apr 9, 2025)
7096db4  Search similar scenarios to a specific index and with respect to a gi… (ishaan95, Apr 9, 2025)
857da20  reorganizing files and adding boxplots + creation script. fixed sizin… (Apr 10, 2025)
7f28e5b  embedding space analysis, parallel experiments (ishaan95, Apr 22, 2025)
fb99033  Modifications to model names and corresponding evals can now be compl… (ishaan95, Feb 28, 2025)
28845b5  latest (ishaan95, Mar 26, 2025)
a8986fa  Removing planner import (ishaan95, Mar 26, 2025)
442fea4  add grades folder to gitignore (ishaan95, Apr 8, 2025)
c46ae04  Calculate the most similar scenarios to a given index (ishaan95, Apr 9, 2025)
96c7cf7  Search similar scenarios to a specific index and with respect to a gi… (ishaan95, Apr 9, 2025)
d4f00ef  embedding space analysis, parallel experiments (ishaan95, Apr 22, 2025)
92e1495  experimenting w/ llm as a judge prompting (Apr 10, 2025)
ccab94d  placed data collection outside of project folder (Apr 10, 2025)
b1c953c  renamed box plotting script, added negative prompting (concise) to ru… (Apr 22, 2025)
9d8a5de  modifications to negative prompting (Apr 23, 2025)
3ae34b1  merge commit (ishaan95, Apr 23, 2025)
38abeb5  Reorganized the llm evaluation script (ishaan95, Apr 24, 2025)
7cd3f39  modified parse scenario to my file path (Apr 30, 2025)
38e7009  lecturing prompting has been added. PDDL testing for the latest versi… (ishaan95, Apr 29, 2025)
9 changes: 6 additions & 3 deletions .gitignore
@@ -1,9 +1,12 @@
-apla-planner/generated_pddls_deepseek/dataset/*
-apla-planner/generated_pddls_deepseek/logs/*
+apla-planner/*
 generated_pddls/*
 parsed_womdr_data/*
 pddl-examples/*
 *.pddl
 __pycache__/*
-apla-planner/generated_pddls_deepseek/.DS_Store
+v3-grades/*
+plt-graph-v3/*
+cot_file_size_graphs/*
+abc_bar_graph/*
+grades/*
 plt-graph/*
9 changes: 4 additions & 5 deletions apla-planner/generated_pddls_deepseek/planner_test.py
@@ -1,7 +1,6 @@
from jupyddl import AutomatedPlanner # Comment this line along with the other planner lines if running from outside WSL
import os
import json
import matplotlib.pyplot as plt

## There is one context per scenario. Each context has a corresponding PDDL domain file.
## Each scenario has multiple interactions. Each interaction will have one PDDL problem file.
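## Expected on-disk layout, assumed from the paths used below (adjust if yours differs):
##   dataset/domains/<scenario_id>/   -> one domain PDDL file per scenario
##   dataset/problems/<scenario_id>/  -> one problem PDDL file per interaction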
@@ -22,11 +21,12 @@

# We will traverse the problem list since there will be only one domain per scenario

plans_for_one_scenario = {}
problem_coverage_scores = []
problem_initial_state_sizes = []
print("Scenario ID is {}".format(scenario_folder))
for problem_file_name in problems_within_scenario:
print(f"Considering problem file {problem_file_name}")
plans_for_one_scenario = {}
problem_full_path = "dataset/problems/"+scenario_folder+"/"+problem_file_name
domain_full_path = "dataset/domains/"+scenario_folder+"/"+domains_within_scenario[0]
print("Planner is now running for the problem {}".format(problem_file_name))
@@ -39,6 +39,5 @@
except:
continue


with open("dataset/problems/"+scenario_folder+"/plan_set.json", 'w') as file:
json.dump(plans_for_one_scenario, file)
with open("dataset/problems/"+scenario_folder+"/"+problem_file_name+"_"+"plan_set.json", 'w') as file:
json.dump(plans_for_one_scenario, file)
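For readers unfamiliar with jupyddl, a minimal standalone run looks roughly like the sketch below. This is a sketch under assumptions: the paths are placeholders, and the astar_best_first_search call and its return shape are recalled from jupyddl's documentation, so verify them against the installed version.

```python
from jupyddl import AutomatedPlanner  # requires the WSL/Linux setup noted above

# Placeholder paths: one domain per scenario, one problem per interaction.
planner = AutomatedPlanner("dataset/domains/scenario_0/domain.pddl",
                           "dataset/problems/scenario_0/problem_0.pddl")

# jupyddl exposes several search routines; A* best-first is one of them.
# The return shape can differ between versions, so inspect it rather than unpack.
result = planner.astar_best_first_search()
print(result)
```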
194 changes: 194 additions & 0 deletions basic_scenario_gpt.py
@@ -0,0 +1,194 @@
from guidance import models, gen, user, assistant, system
import parse_scenario_womd
import json


def generate_scenario_concepts(granularity, scenario_data):
gpt_scenario = models.OpenAI(model="gpt-4o", echo=False)

with system():
lm_scenario = gpt_scenario

with user():
lm_scenario += f"""
Think deeply about scenarios for testing autonomous vehicles.

I need some states of the world that would be relevant for logically describing this traffic scenario:
{scenario_data}

A state is just an assertion with a true or false value that represents the world at that particular moment.
This is similar to the concept of a turn in a turn-based game.

There must be states regarding the following concepts:
* Static environment description.
* Ego agent
* The respective surrounding agents.

In each action and state, the ego agent or the surrounding agent must be identified as <EGO> or <SURROUNDING AGENT #0> or <SURROUNDING AGENT #1> as needed.

Increase the granularity of the concepts in proportion to the granularity level.
The granularity level is {str(granularity)} on a scale of 1 to 10, with 1 being the least and 10 being the most granular.
Granularity pertains to how specific the information is.

Make sure to include the given concepts in the generated list of concepts, in addition to your own concepts.
"""

with assistant():
lm_scenario += gen("concepts", temperature=0.5)

print("The scenario concepts are {}".format(lm_scenario["concepts"]))
return lm_scenario["concepts"]

def generate_scenario_states(concepts):
gpt_scenario = models.OpenAI(model="gpt-4o", echo=False)

with system():
lm_scenario = gpt_scenario

with user():
lm_scenario += f"""
Based on the concepts detailed in {concepts},
Write down a list of states pertaining to these concepts in natural language. Write them in the following format:
```json
<open curly bracket>
"<state name>": <open curly bracket>
"statement": "<the assertion in natural language. Use the fewest words possible for maximum clarity>"
<close curly bracket>,
"<state name>": <open curly bracket>
"statement": "<the assertion in natural language>"
<close curly bracket>,
...
<close curly bracket>
```

Be very very very specific.
"""

with assistant():
lm_scenario += gen("state_dictionary", temperature=0.5)

return lm_scenario["state_dictionary"]

def generate_scenario_actions(concepts, granularity=2):
gpt_scenario = models.OpenAI(model="gpt-4o", echo=False)

with system():
lm_scenario = gpt_scenario

with user():
lm_scenario += f"""
Based on the concepts detailed in {concepts},
* Write down a list of actions that map between these states in natural language.
* Each action has some causal states (predicates) and some effect states that will be true or false.
* Each action is a cause and effect mapping between any number of causal states and any number of effect states.
* Actions and states must not contradict each other.
* Action names must be descriptive and the action can be understood just by looking at the name.
* The state names within each action are also descriptive. The cause and effect statements and the state names must have the same information.
* There must be separate states regarding the environment, ego and the respective surrounding agents.
* In each action and state, the ego agent or the surrounding agent must be identified as <EGO> or <SURROUNDING AGENT #0> or <SURROUNDING AGENT #1> as needed.
* For distances, positions, and speeds, do not use specific numbers but descriptive words instead, such as front, left, right, near, far, fast, slow, medium (or combinations such as front-left and so on), or other similar descriptive words.
* The action itself will only become true when the causal states and the effect states are in the specific states that this description details.
* Write them in the following format:
```json
<open curly bracket>
"<action name>":
<open curly bracket>
"<state name>": <open curly bracket>
"statement": "<the assertion in natural language. Use the fewest words possible for maximum clarity>",
"value": <whether this value is true or false>,
"state_type": <whether this state is a cause or an effect for the current action>
<close curly bracket>,
"<state name>": <open curly bracket>
"statement": "<the assertion in natural language. Use the fewest words possible for maximum clarity>",
"value": <whether this value is true or false>,
"state_type": <whether this state is a cause or an effect for the current action>
<close curly bracket>
<close curly bracket>,
...
<close curly bracket>
```

Increase the granularity of these actions in proportion to the granularity level.
Granularity pertains to how specific the information is.
While the actions must be relevant to the given scenario, they must be general enough to be used for other scenarios as well.
The granularity level is {str(granularity)} on a scale of 1 to 10, with 1 being the least and 10 being the most granular.

"""

with assistant():
lm_scenario += gen("action_dictionary", temperature=0.8)

print("The scenario actions are {}".format(lm_scenario["action_dictionary"]))
return lm_scenario["action_dictionary"]

# # Removed from this project after consideration
# def generate_scenario_states(concepts):
# gpt_scenario = models.OpenAI(model="gpt-4o", echo=False)

# with system():
# lm_scenario = gpt_scenario

# with user():
# lm_scenario += f"""
# Based on the concepts detailed in {concepts},
# Write down a list of states pertaining to these concepts in natural language. Write them in the following format:
# ```json
# <curly bracket>
# "<state name>": <curly bracket>
# "statement": "<the assertion in natural language. Use the fewest words possible for maximum clarity>
# <close curly bracket>,
# "<state name>": <open curly bracket>
# "statement": "<the assertion in natural language>,
# <close curly bracket>,
# ...
# <close curly bracket>
# json```

# Be very, very specific and granular, with very fine details.
# """

# with assistant():
# lm_scenario += gen("state_dictionary", temperature=0.8)

# return lm_scenario["state_dictionary"]

def respond_scenario_query(concepts, actions, questions):
gpt_scenario = models.OpenAI(model="gpt-4o", echo=False)

with system():
lm_scenario = gpt_scenario

with user():
lm_scenario += f"""
Based on the concepts detailed in {concepts} and actions detailed in {actions}, respond to the following questions:
{questions}
Be very specific and very granular, with fine details.
"""

with assistant():
lm_scenario += gen("scenario_response", temperature=0.8)

#print("The scenario responses are {}".format(lm_scenario["scenario_response"]))
return lm_scenario["scenario_response"]

def evaluate_gpt(question):
gpt_scenario = models.OpenAI(model="gpt-4o-mini", echo=False)

with system():
lm_scenario = gpt_scenario

with user():
lm_scenario += f"""
Given the questions here:
{question}

Choose the correct answer. Only mention the option.
"""

with assistant():
lm_scenario += gen("mcq_response", temperature=0.5)

#print("The scenario responses are {}".format(lm_scenario["scenario_response"]))
return lm_scenario["mcq_response"]
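Taken together, the functions above form a pipeline: concepts are generated from parsed scenario data, states and actions are generated from those concepts, and queries are answered against both. A minimal driver might look like the sketch below; the scenario text and question are placeholders, and how parse_scenario_womd produces the real scenario_data is not shown here.

```python
# Hypothetical driver for the pipeline above. In practice scenario_data
# would come from parse_scenario_womd rather than a hard-coded string.
scenario_data = "<EGO> approaches a four-way intersection; <SURROUNDING AGENT #0> is crossing."

concepts = generate_scenario_concepts(granularity=5, scenario_data=scenario_data)
states = generate_scenario_states(concepts)    # optional intermediate listing
actions = generate_scenario_actions(concepts, granularity=5)

questions = "Must <EGO> yield before entering the intersection?"
print(respond_scenario_query(concepts, actions, questions))
```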


34 changes: 34 additions & 0 deletions box_plots.py
@@ -0,0 +1,34 @@
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Indices and scores below were changed manually for each graph
exp_and_scores = {
'Zero-Shot': [6, 3, 2, 1, 3, 2, 8, 8, 2, 10, 10, 3, 10, 2, 2, 8, 7, 4, 2, 5, 10, 10, 4, 10, 3, 3, 10, 2, 1, 2, 9, 10, 8, 8, 3, 10, 6, 3, 6, 8, 8, 2, 6, 6, 8, 3, 2, 5, 8, 2, 10, 1, 8, 10, 6, 7, 7, 8, 8, 5, 6, 10, 10, 4],
'Two-Shot': [4, 10, 10, 10, 10, 4, 10, 8, 8, 10, 10, 2, 2, 10, 4, 8, 7, 10, 10, 10, 10, 8, 10, 10, 10, 10, 10, 10, 10, 10, 9, 10, 6, 8, 4, 5, 4, 2, 6, 3, 9, 7, 7, 4, 10, 6, 10, 10, 10, 10, 10, 5, 6, 4, 8, 3, 10, 10, 10, 10, 10, 10, 10, 3],
'Four-Shot': [3, 10, 1, 2, 2, 2, 8, 9, 9, 10, 10, 2, 2, 10, 3, 9, 6, 10, 8, 10, 10, 10, 5, 10, 10, 10, 10, 10, 10, 10, 7, 10, 6, 5, 3, 4, 6, 3, 3, 2, 7, 2, 9, 4, 10, 2, 10, 10, 10, 10, 10, 10, 5, 3, 4, 2, 8, 10, 10, 10, 10, 10, 8, 2],
'Six-Shot': [4, 10, 1, 10, 3, 4, 6, 10, 10, 10, 8, 2, 2, 10, 3, 7, 6, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 10, 7, 4, 2, 8, 4, 3, 4, 6, 8, 4, 10, 3, 9, 6, 10, 10, 10, 8, 10, 2, 5, 6, 3, 8, 5, 10, 9, 10, 10, 10, 10, 3]
}

data = []
for experiment, scores in exp_and_scores.items():
score_array = np.array(scores)
sorted_array = np.sort(score_array)
for individual_score in sorted_array:
data.append({'CoT Prompting Style': experiment, 'Correctness Scores': individual_score})

df = pd.DataFrame(data)
sns.set_theme(style="whitegrid")
plt.figure(figsize=(10, 6))
# Create the box plots
ax = sns.boxplot(x='CoT Prompting Style', y='Correctness Scores', data=df, width=0.5, fliersize=0)
# Overlay the individual data points
sns.stripplot(x='CoT Prompting Style', y='Correctness Scores', data=df, jitter=0.23, color='black', size=6, alpha=0.7)
for i, (experiment, scores) in enumerate(exp_and_scores.items()):
q1_label = np.percentile(scores, 25)
ax.text(i, q1_label, f'Q1: {q1_label:.2f}', ha = 'center', va = 'bottom', color = 'white', fontsize = 12)

plt.ylim(0, 11)
plt.title('Zero, Two, Four, and Six-Shot CoT Prompting Score Distribution for Scenarios of Large Files')
plt.show()
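If more than Q1 needs labeling, the same np.percentile pattern extends directly to the median and Q3; a minimal sketch, reusing the ax and exp_and_scores defined above (placed before plt.show() so the labels render with the rest of the figure):

```python
# Annotate the median and Q3 on each box, mirroring the Q1 labels above.
for i, (experiment, scores) in enumerate(exp_and_scores.items()):
    median = np.percentile(scores, 50)
    q3 = np.percentile(scores, 75)
    ax.text(i, median, f'Med: {median:.2f}', ha='center', va='bottom', fontsize=12)
    ax.text(i, q3, f'Q3: {q3:.2f}', ha='center', va='bottom', fontsize=12)
```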
13 changes: 10 additions & 3 deletions client_model_setup.py
@@ -9,7 +9,7 @@ class ProvidedLLM():
def __init__(self):
self.client_oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
self.client_deepinfra = OpenAI(api_key=os.environ["DEEPINFRA_API_KEY"], base_url="https://api.deepinfra.com/v1/openai")
self.client_dsapi = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com")
#self.client_dsapi = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com")

# The following are model names for DS models provided via their own API service.
self.ds_v3_dsapi = "deepseek-chat"
@@ -18,6 +18,7 @@ def __init__(self):
# The following are model names for Large DeepInfra provided models
self.ds_v3 = "deepseek-ai/DeepSeek-V3"
self.ds_r1 = "deepseek-ai/DeepSeek-R1" # This model thinks. Cannot use for json output
self.ds_r1_turbo = "deepseek-ai/DeepSeek-R1-Turbo"
self.llama_33_70b = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
self.llama_31_405b = "meta-llama/Meta-Llama-3.1-405B-Instruct"
self.qw_25_72b = "Qwen/Qwen2.5-72B-Instruct"
@@ -32,6 +33,8 @@ def __init__(self):
# The following are the small model names for models provided via the OpenAI API service
self.gpt_4o_mini = "gpt-4o-mini"
self.o3_mini = "o3-mini"
self.gpt_45 = "gpt-4.5-preview"
self.gpt_41 = "gpt-4.1"

self.model_dictionary = {
"openai_models": [self.gpt_4o_mini, self.o3_mini],
@@ -53,7 +56,11 @@ def non_thinking_llm_call(self, client, model, prompt):
# DS api reasoner doesn't send think tags so no need for this function.
# Deepinfra thinking models send these tags so this function is needed.
def thinking_llm_call(self, client, model, prompt):
output = self.llm_call(client=client, model=model, prompt=prompt)
output_content = client.chat.completions.create(model=model,
messages=[{"role": "user", "content": prompt}],
stream=False
)
output = output_content.choices[0].message.content
separated_string = re.split(r"(</think>)", output)
separated_string_thoughts = re.split(r"(<think>)", separated_string[0])
separated_string_output = separated_string[2]
@@ -63,7 +70,7 @@ def thinking_llm_call(self, client, model, prompt):
def llm_call(self, client, model, prompt):
output = ""
thoughts = ""
if (model==self.ds_r1) or (model==self.ds_distil_llama_70b):
if (model==self.ds_r1) or (model==self.ds_distil_llama_70b) or (model==self.ds_r1_turbo):
output, thoughts = self.thinking_llm_call(client, model, prompt)
else:
output = self.non_thinking_llm_call(client, model, prompt)
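The tag handling in thinking_llm_call depends on re.split keeping the delimiter when the pattern contains a capturing group; a small standalone check of that behavior:

```python
import re

# A capturing group in the pattern makes re.split keep the delimiter tokens.
sample = "<think>some chain of thought</think>the final answer"
parts = re.split(r"(</think>)", sample)
# parts == ['<think>some chain of thought', '</think>', 'the final answer']
thoughts = re.split(r"(<think>)", parts[0])[-1]  # 'some chain of thought'
answer = parts[2]                                # 'the final answer'
```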