# Greedy action-selection experiment with delay-based rewards.
#
# Each step picks an action, draws a random integer delay in [0, 20], turns it
# into a cost (delay / max_value) and a reward (1 - cost), and updates that
# action's q-value as its mean reward.  Whenever a delay exceeds the current
# max_value, max_value is raised and every action's cost/reward/q-value is
# recomputed from the stored delay history so all values share one scale.
import random

number_of_actions = 5
max_value = 2  # normalisation ceiling; raised whenever a larger delay is drawn
Prev_max_value = 0  # value of max_value before its most recent increase
costs = [0.0] * number_of_actions
rewards = [0.0] * number_of_actions
action_counts = [0] * number_of_actions
q_values = [0.0] * number_of_actions
max_val_updated = False
delays_history = [[] for _ in range(number_of_actions)]

for i in range(10):
    # Try every action once first, then always take the action with the
    # highest q-value (list.index returns the first one on ties).
    if i < number_of_actions:
        action = i
    else:
        action = q_values.index(max(q_values))

    action_counts[action] += 1
    delay = random.randint(0, 20)
    costs[action] = delay / max_value
    rewards[action] += 1 - costs[action]
    q_values[action] = rewards[action] / action_counts[action]
    delays_history[action].append(delay)
    print(f"First Q_values: {q_values}")

    if delay > max_value:
        Prev_max_value = max_value
        max_value = delay
        max_val_updated = True

        # Full recomputation: re-normalise every stored delay with the new
        # max_value and rebuild costs, rewards and q-values from scratch.
        costs = [0.0] * number_of_actions
        for action2, history in enumerate(delays_history):
            summ = 0.0
            for d in history:
                cost = d / max_value
                summ += 1 - cost
                costs[action2] += cost
            rewards[action2] = summ
            # An action that has not been tried yet has no history; keep its
            # q-value as is instead of dividing by zero.
            if history:
                q_values[action2] = summ / len(history)
        second_qvalues = q_values.copy()
        print(f"Second Q_values: {q_values}")

        # Closed-form rescale, algebraically 1 - (1 - q) * Prev_max_value / max_value.
        # NOTE(review): this is applied to q-values that were already
        # recomputed with the new max_value above, so the scaling is applied
        # twice -- confirm whether it was meant to start from the q-values
        # as they were before the recomputation.
        for action2, history in enumerate(delays_history):
            N = len(history)
            if N == 0:
                # No samples for this action: nothing to rescale.
                continue
            q_values[action2] = (N - ((N - (q_values[action2] * N)) * Prev_max_value) / max_value) / N
        third_qvalues = q_values.copy()
        print(f"Third Q_values: {q_values}")
        print()