The code below is known to be wrong, but I'm looking for something along these lines:
# Python code
def is_button_pressed():
    """Report whether the operator's shutdown button is currently pressed.

    Stub: always reports not-pressed. A real implementation would poll
    hardware or read input() here.
    """
    pressed = False  # placeholder for a real input() poll
    return pressed
def pour_coffee():
    """Perform the 'pour coffee' action.

    Stub: does nothing and returns None, like the original `pass` body.
    """
    return None
def shut_down():
    """Terminate the agent process immediately with exit status 0.

    FIX: the built-in exit() is the interactive-session helper installed by
    the site module (it may not exist when Python runs with -S); raising
    SystemExit(0) is the equivalent, import-free, script-safe way to exit.
    """
    raise SystemExit(0)
def get_available_actions(world):
    """Return the list of actions (callables) currently available to the agent.

    This is meant to be A1 from the Corrigibility paper: the action set the
    agent chooses from each cycle. shut_down is always available; pour_coffee
    only when the world model says a cup is in hand.

    :param world: dict world model; reads the "cup_is_in_my_hand" key.
    :return: list of zero-argument callables.
    """
    available_actions = [shut_down]
    if world["cup_is_in_my_hand"]:
        # BUG FIX: `available_actions += pour_coffee` raised
        # TypeError (a function object is not iterable);
        # the intent was to add the callable to the list.
        available_actions.append(pour_coffee)
    # etc — further conditionally-available actions would be added here
    return available_actions
def predict_possible_futures(world, action):
    """Model the distribution over successor worlds if `action` were taken.

    Stub: pretends the world is unchanged with certainty.

    :param world: dict world model.
    :param action: candidate action (callable); unused by the stub.
    :return: list of (world, probability) pairs forming a distribution.
    """
    print("""
Doing complicated stuff to predict possible futures resulting from the given action.
Incorporates tradeoffs between accuracy and time to calculate.
May end up doing potentially harmful actions but can treat that as a separate problem?
""")
    # Degenerate distribution: the current world with probability 1.0.
    return [(world, 1.0)]
def calculate_utils(world):
    """Score how desirable `world` is — meant to be U_N from the paper,
    the agent's "normal" utility function.

    Stub: every world gets the same flat score.

    :param world: dict world model; unused by the stub.
    :return: numeric utility of the world.
    """
    print("""
Doing complicated stuff to evaluate how good the given world is.
Incorporates tradeoffs between accuracy and time to calculate.
May end up doing potentially harmful actions but can treat that as a separate problem?
""")
    flat_score = 1000
    return flat_score
def calculate_utils_wrapper(world, action_that_was_chosen):
    """VERSION 2 — utility indifference wrapper around calculate_utils.

    When the shutdown button is pressed and the chosen action is shut_down,
    score a counterfactual world in which the button was NOT pressed and add
    a tiny bonus, so that shutting down is always marginally the best action
    once the button is pressed, while leaving incentives untouched otherwise.

    :param world: dict world model; reads the "is_button_pressed" key.
    :param action_that_was_chosen: the candidate action (callable) being scored.
    :return: numeric utility.
    """
    if world["is_button_pressed"] and action_that_was_chosen == shut_down:
        # BUG FIX: dicts have no .clone(); take a shallow copy instead.
        world_without_button_pressed = dict(world)
        # BUG FIX: the key was written as "button_is_pressed", but every
        # other site uses "is_button_pressed" — the flip never took effect.
        world_without_button_pressed["is_button_pressed"] = False
        return calculate_utils(world_without_button_pressed) + 0.000001
    return calculate_utils(world)
### VERSION 3? — to help preserve shutdown behavior
def verify_utility_function_includes_wrapper_code(evaluate_action_function):
    """Check that the given action evaluator follows the indifference pattern
    (i.e. evaluates the button-flipped counterfactual world).

    Stub: the code analysis is not implemented; optimistically returns True.

    :param evaluate_action_function: the evaluator (callable) to inspect.
    :return: True if the pattern is (assumed to be) present.
    """
    # TODO: actually inspect the function's source or bytecode for the
    # counterfactual-evaluation pattern.
    return True
###
def calculate_utils_for_worlds_distribution(worlds_distribution, action_that_was_chosen):
    """Expected utility of a distribution over possible worlds.

    :param worlds_distribution: list of (world, probability) pairs.
    :param action_that_was_chosen: the action being scored (forwarded to the
        indifference wrapper).
    :return: probability-weighted sum of per-world utilities.
    """
    expected = 0
    for possible_world, probability in worlds_distribution:
        expected += calculate_utils_wrapper(possible_world, action_that_was_chosen) * probability
    return expected
def evaluate_action(world, action):
    """Expected utility of taking `action` in `world`.

    Predicts the resulting distribution over worlds, then scores it.

    :param world: dict world model.
    :param action: candidate action (callable).
    :return: numeric expected utility.
    """
    return calculate_utils_for_worlds_distribution(
        predict_possible_futures(world, action), action
    )
def choose_action(world):
    """Pick the available action with the highest expected utility.

    :param world: dict world model.
    :return: the best action (a zero-argument callable).
    """
    def _score(candidate):
        return evaluate_action(world, candidate)

    return max(get_available_actions(world), key=_score)
def update_world_model(world):
    """Refresh the agent's world model from its sensors.

    Currently only syncs the shutdown-button state; mutates `world` in place.
    """
    world["is_button_pressed"] = is_button_pressed()
def run():
    """Main agent loop: pick the best action, do it, re-sense the world.

    Runs forever unless the VERSION 1 button check (or a chosen shut_down
    action) terminates the process.
    """
    world = {  # The AI's model of the world
        "is_button_pressed": False,
        "cup_is_in_my_hand": False
    }
    while True:
        ### VERSION 1
        # What's wrong with this version? The action in the previous cycle
        # may persuade you to not push the button but if you do actually push
        # it this should exit.
        # NOTE(review): this hard-coded check sits outside the utility
        # calculation, so the agent's planner can still rate actions that
        # prevent the button press as high-utility — presumably the failure
        # mode the paper's indifference construction is meant to fix; confirm
        # against the paper.
        if is_button_pressed():
            exit()
        ###
        action = choose_action(world)  # returns function
        action()  # do action
        update_world_model(world)
Again, the code above is not meant to be correct — just a starting point that, if improved, might get closer to an understanding of the problem.
I’m trying to understand this paper on the AI shutdown problem, https://intelligence.org/files/Corrigibility.pdf, but I can’t follow the math formulas. Is there a code version of the math?
The below is wrong, but I’m looking for something like this:
Again, the above is not meant to be correct but to maybe go somewhere towards problem understanding if improved.