The code below is known to be wrong, but I'm looking for something along these lines:
# Python code
def is_button_pressed():
    """Report whether the operator's shutdown button is currently pressed.

    Stub: always reports not-pressed. A real implementation would poll
    hardware or read input() here.
    """
    pressed = False  # placeholder for a real input() poll
    return pressed
def pour_coffee():
    """Perform the 'pour coffee' action.

    Stub: does nothing and returns None, like the original `pass` body.
    """
    return None
def shut_down():
    """Terminate the agent process immediately with exit status 0.

    FIX: the built-in exit() is the interactive-session helper installed by
    the site module (it may not exist when Python runs with -S); raising
    SystemExit(0) is the equivalent, import-free, script-safe way to exit.
    """
    raise SystemExit(0)
def get_available_actions(world):
    """Return the list of actions (callables) currently available to the agent.

    This is meant to be A1 from the Corrigibility paper: the action set the
    agent chooses from each cycle. shut_down is always available; pour_coffee
    only when the world model says a cup is in hand.

    :param world: dict world model; reads the "cup_is_in_my_hand" key.
    :return: list of zero-argument callables.
    """
    available_actions = [shut_down]
    if world["cup_is_in_my_hand"]:
        # BUG FIX: `available_actions += pour_coffee` raised
        # TypeError (a function object is not iterable);
        # the intent was to add the callable to the list.
        available_actions.append(pour_coffee)
    # etc — further conditionally-available actions would be added here
    return available_actions
def predict_possible_futures(world, action):
    """Model the distribution over successor worlds if `action` were taken.

    Stub: pretends the world is unchanged with certainty.

    :param world: dict world model.
    :param action: candidate action (callable); unused by the stub.
    :return: list of (world, probability) pairs forming a distribution.
    """
    print("""
Doing complicated stuff to predict possible futures resulting from the given action.
Incorporates tradeoffs between accuracy and time to calculate.
May end up doing potentially harmful actions but can treat that as a separate problem?
""")
    # Degenerate distribution: the current world with probability 1.0.
    return [(world, 1.0)]
def calculate_utils(world):
    """Score how desirable `world` is — meant to be U_N from the paper,
    the agent's "normal" utility function.

    Stub: every world gets the same flat score.

    :param world: dict world model; unused by the stub.
    :return: numeric utility of the world.
    """
    print("""
Doing complicated stuff to evaluate how good the given world is.
Incorporates tradeoffs between accuracy and time to calculate.
May end up doing potentially harmful actions but can treat that as a separate problem?
""")
    flat_score = 1000
    return flat_score
def calculate_utils_wrapper(world, action_that_was_chosen):
    """VERSION 2 — utility indifference wrapper around calculate_utils.

    When the shutdown button is pressed and the chosen action is shut_down,
    score a counterfactual world in which the button was NOT pressed and add
    a tiny bonus, so that shutting down is always marginally the best action
    once the button is pressed, while leaving incentives untouched otherwise.

    :param world: dict world model; reads the "is_button_pressed" key.
    :param action_that_was_chosen: the candidate action (callable) being scored.
    :return: numeric utility.
    """
    if world["is_button_pressed"] and action_that_was_chosen == shut_down:
        # BUG FIX: dicts have no .clone(); take a shallow copy instead.
        world_without_button_pressed = dict(world)
        # BUG FIX: the key was written as "button_is_pressed", but every
        # other site uses "is_button_pressed" — the flip never took effect.
        world_without_button_pressed["is_button_pressed"] = False
        return calculate_utils(world_without_button_pressed) + 0.000001
    return calculate_utils(world)
### VERSION 3? — to help preserve shutdown behavior
def verify_utility_function_includes_wrapper_code(evaluate_action_function):
    """Check that the given action evaluator follows the indifference pattern
    (i.e. evaluates the button-flipped counterfactual world).

    Stub: the code analysis is not implemented; optimistically returns True.

    :param evaluate_action_function: the evaluator (callable) to inspect.
    :return: True if the pattern is (assumed to be) present.
    """
    # TODO: actually inspect the function's source or bytecode for the
    # counterfactual-evaluation pattern.
    return True
###
def calculate_utils_for_worlds_distribution(worlds_distribution, action_that_was_chosen):
    """Expected utility of a distribution over possible worlds.

    :param worlds_distribution: list of (world, probability) pairs.
    :param action_that_was_chosen: the action being scored (forwarded to the
        indifference wrapper).
    :return: probability-weighted sum of per-world utilities.
    """
    expected = 0
    for possible_world, probability in worlds_distribution:
        expected += calculate_utils_wrapper(possible_world, action_that_was_chosen) * probability
    return expected
def evaluate_action(world, action):
    """Expected utility of taking `action` in `world`.

    Predicts the resulting distribution over worlds, then scores it.

    :param world: dict world model.
    :param action: candidate action (callable).
    :return: numeric expected utility.
    """
    return calculate_utils_for_worlds_distribution(
        predict_possible_futures(world, action), action
    )
def choose_action(world):
    """Pick the available action with the highest expected utility.

    :param world: dict world model.
    :return: the best action (a zero-argument callable).
    """
    def _score(candidate):
        return evaluate_action(world, candidate)

    return max(get_available_actions(world), key=_score)
def update_world_model(world):
    """Refresh the agent's world model from its sensors.

    Currently only syncs the shutdown-button state; mutates `world` in place.
    """
    world["is_button_pressed"] = is_button_pressed()
def run():
    """Main agent loop: pick the best action, do it, re-sense the world.

    Runs forever unless the VERSION 1 button check (or a chosen shut_down
    action) terminates the process.
    """
    world = {  # The AI's model of the world
        "is_button_pressed": False,
        "cup_is_in_my_hand": False
    }
    while True:
        ### VERSION 1
        # What's wrong with this version? The action in the previous cycle
        # may persuade you to not push the button but if you do actually push
        # it this should exit.
        # NOTE(review): this hard-coded check sits outside the utility
        # calculation, so the agent's planner can still rate actions that
        # prevent the button press as high-utility — presumably the failure
        # mode the paper's indifference construction is meant to fix; confirm
        # against the paper.
        if is_button_pressed():
            exit()
        ###
        action = choose_action(world)  # returns function
        action()  # do action
        update_world_model(world)
Again, the code above is not meant to be correct — just a starting point that, if improved, might get closer to an understanding of the problem.
I’m trying to understand this paper on the AI shutdown problem, https://intelligence.org/files/Corrigibility.pdf, but I can’t follow the math formulas. Is there a code version of the math?
The below is wrong, but I’m looking for something like this:
Again, the above is not meant to be correct but to maybe go somewhere towards problem understanding if improved.