import numpy as np

# Two-state, two-action MDP: S = {0, 1}, A = {0, 1}
# T[s][a][sPrime] = P(sPrime | s, a); R[s][a][sPrime] = reward for that transition
T = []
R = []
for s in range(0, 2):
    Ts = []
    Rs = [] 
    for a in range(0, 2):
        Ta = []
        Ra = [] 
        for sPrime in range(0, 2):
            TsasPrime = 0
            Ta.append(TsasPrime)
            RsasPrime = 0
            Ra.append(RsasPrime)
            
        Ts.append(Ta)
        Rs.append(Ra)

    T.append(Ts)
    R.append(Rs)

# Transition probabilities T[s][a][sPrime]
T[0][0][0] = 0.25
T[0][0][1] = 1 - T[0][0][0]
T[0][1][0] = 0.8
T[0][1][1] = 1 - T[0][1][0]
T[1][0][0] = 0.75
T[1][0][1] = 1 - T[1][0][0]
T[1][1][0] = 0.1
T[1][1][1] = 1 - T[1][1][0]

# Rewards R[s][a][sPrime]
R[0][0][0] = 1
R[0][0][1] = 0
R[0][1][0] = 2
R[0][1][1] = 0
R[1][0][0] = 2
R[1][0][1] = 5
R[1][1][0] = -1
R[1][1][1] = 1
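
# Sanity check (illustrative): each T[s][a] should be a probability
# distribution over next states, i.e. sum to 1.
for s in range(0, 2):
    for a in range(0, 2):
        assert abs(sum(T[s][a]) - 1.0) < 1e-12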

gamma = 0.5  # discount factor

def bellmanOptimalityOperator(V):
    # One application of the Bellman optimality backup:
    #   Vnext[s] = max_a sum_{s'} T[s][a][s'] * (R[s][a][s'] + gamma * V[s'])
    Vnext = [0, 0]

    for s in range(0, 2):
        # Action value of action 0 in state s
        q0 = 0
        for sPrime in range(0, 2):
            q0 += T[s][0][sPrime] * (R[s][0][sPrime] + gamma * V[sPrime])

        # Action value of action 1 in state s
        q1 = 0
        for sPrime in range(0, 2):
            q1 += T[s][1][sPrime] * (R[s][1][sPrime] + gamma * V[sPrime])

        # Back up the better of the two actions
        Vnext[s] = max(q0, q1)

    return Vnext

# Initial value estimate; value iteration converges to V* from any starting values
V = [0, 0]

# Value iteration: repeatedly apply the Bellman optimality operator
for i in range(0, 10000):
    V = bellmanOptimalityOperator(V)
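
# One possible stopping rule (illustrative sketch): iterate until successive
# estimates agree to within a small tolerance instead of a fixed sweep count.
# The tolerance 1e-10 and the name Vcheck are arbitrary choices for illustration.
Vcheck = [0, 0]
while True:
    Vnew = bellmanOptimalityOperator(Vcheck)
    converged = max(abs(Vnew[s] - Vcheck[s]) for s in range(0, 2)) < 1e-10
    Vcheck = Vnew
    if converged:
        break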

print(V)
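
# Illustrative extraction of a greedy policy from V: in each state, take the action
# with the largest one-step lookahead value (the names policy/qValues are arbitrary).
policy = []
for s in range(0, 2):
    qValues = []
    for a in range(0, 2):
        q = 0
        for sPrime in range(0, 2):
            q += T[s][a][sPrime] * (R[s][a][sPrime] + gamma * V[sPrime])
        qValues.append(q)
    policy.append(int(np.argmax(qValues)))
print("greedy policy:", policy)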
print("**********************")

# Agent's choice
def selectAction(Q, state, epsilon):
    if np.random.random() < epsilon:
        # Take an action uniformly at random
        action = np.random.randint(0, 2)
    else:
        # Select a maximising action
        if(Q[state][0] > Q[state][1]):
            action = 0
        else:
            action = 1

    return action

# Environment's transition
def getNext(state, action):
    # Sample the next state according to T[state][action]
    nextState = 0
    if np.random.random() > T[state][action][0]:
        nextState = 1
    reward = R[state][action][nextState]
    return reward, nextState
    
Q = np.zeros((2, 2))  # action-value estimates Q[s][a]
epsilon = 1.0         # explore uniformly at random (Q-learning is off-policy, so it can still learn)
alpha = 0.01          # learning rate

# Tabular Q-learning from sampled transitions
maxSteps = 50000
s = 0
for t in range(maxSteps):
    a = selectAction(Q, s, epsilon)
    reward, nextState = getNext(s, a)
    #print("(" + str(s) + ", " + str(a) + ", " + str(reward) + ")")
    #Q-learning
    target = reward + gamma * max(Q[nextState][0], Q[nextState][1])

    Q[s][a] = Q[s][a] * (1 - alpha) + alpha * target
    
    s = nextState

print(Q)    
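
# Rough check (illustrative): with enough samples, the greedy values max_a Q[s][a]
# learned by Q-learning should be close to V from value iteration above.
for s in range(0, 2):
    print("state", s, ": Q-learning", max(Q[s][0], Q[s][1]), "vs value iteration", V[s])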
