# coding: utf-8

# In[ ]:

import math

def load_dataset():
    dataset = []
    with open("dataset.txt", "r") as f:
        for line in f:
            try:
                label, email = line.split('\t')
                email = email.replace("\n", "").replace(".", " ").replace(",", " ").replace("?", " ").replace("!", " ").replace("`", " ").replace("'", " ").lower().split(" ")
                dataset.append([email, label])
            except ValueError:
                # Skip malformed lines that do not contain exactly one tab separator.
                continue
    return dataset

dataset = load_dataset()
print(dataset[0])


# In[ ]:

def compute_class_probabilities(dataset):
    class_probabilities = {}
    class_counts = {}
    # Insert code here
    # class_counts[y] should be the number of times class y occurs in the dataset
    # class_probabilities[y] should be the MLE for p(y)
    return class_counts, class_probabilities

class_counts, class_probabilities = compute_class_probabilities(dataset[:4000])
print(class_probabilities)


# In[ ]:

def build_vocabulary(dataset):
    vocabulary = []
    # Insert code here
    # vocabulary should be a list of all words occurring in the dataset
    return vocabulary

vocabulary = build_vocabulary(dataset[:4000])


# In[ ]:

def compute_word_probabilities(dataset, vocabulary, class_counts):
    word_counts = {}         # e.g. {"spam": {}, "ham": {}}
    word_probabilities = {}  # e.g. {"spam": {}, "ham": {}}
    # Insert code here
    # word_counts[y][word] should be the number of times word occurs in e-mails of class y
    # word_probabilities[y][word] should be the MLE for p(word|y)
    return word_counts, word_probabilities

word_counts, word_probabilities = compute_word_probabilities(dataset[:4000], vocabulary, class_counts)
print(word_probabilities)


# In[ ]:

def classify(email, vocabulary, class_probabilities, word_probabilities):
    posterior_logprobs = {}
    # Insert code here
    # posterior_logprobs[y] should be log p(email, y) = log p(y) + log p(email|y)
    # prediction should be argmax_y log p(email, y)
    return posterior_logprobs, prediction

posterior_logprobs, prediction = classify(dataset[0][0], vocabulary, class_probabilities, word_probabilities)
print(dataset[0][0], posterior_logprobs, prediction)


# In[ ]:

def evaluate_model(dataset, vocabulary, class_probabilities, word_probabilities):
    num_correct = 0
    # Insert code here
    # num_correct should be the number of e-mails whose prediction matches their class
    return num_correct / len(dataset)

accuracy = evaluate_model(dataset[:4000], vocabulary, class_probabilities, word_probabilities)
print(accuracy)


# In[ ]:

accuracy = evaluate_model(dataset[4000:], vocabulary, class_probabilities, word_probabilities)
print(accuracy)


# In[ ]:

# Insert code here
# most_spam should be the list of words sorted in ascending order by p(word|spam),
# so the last 50 entries are the most probable words under the spam class.
print(most_spam[-50:])


# In[ ]:

# Insert code here
# most_ham should be the list of words sorted in ascending order by p(word|not spam)
print(most_ham[-50:])


# In[ ]:

# Insert code here
# most_spam_ratio should be the list of words sorted in ascending order by p(word|spam) / p(word|not spam)
print(most_spam_ratio[-50:])
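
# In[ ]:

# --- Reference sketch (not part of the exercise skeleton above) ---
# A minimal sketch of one way the blanks could be filled in, kept in separate
# *_ref functions so the skeleton stays untouched. The *_ref names are
# illustrative, not prescribed by the assignment. Two details are assumptions:
# add-one (Laplace) smoothing for p(word|y), and skipping words outside the
# training vocabulary at classification time; the skeleton only asks for plain
# MLE estimates. The class labels "spam" and "ham" are taken from the comments
# in compute_word_probabilities.

import math
from collections import Counter

def train_naive_bayes_ref(train_set):
    # Count classes and per-class word occurrences in one pass over the training e-mails.
    class_counts_ref = Counter(label for _, label in train_set)
    total = sum(class_counts_ref.values())
    class_probabilities_ref = {y: n / total for y, n in class_counts_ref.items()}

    word_counts_ref = {y: Counter() for y in class_counts_ref}
    for email, label in train_set:
        word_counts_ref[label].update(w for w in email if w)  # drop empty tokens from split(" ")

    vocabulary_ref = sorted(set(w for counts in word_counts_ref.values() for w in counts))

    # Estimate p(word|y) with add-one smoothing over the vocabulary (assumption),
    # so no word gets probability exactly zero under either class.
    word_probabilities_ref = {}
    for y, counts in word_counts_ref.items():
        denom = sum(counts.values()) + len(vocabulary_ref)
        word_probabilities_ref[y] = {w: (counts[w] + 1) / denom for w in vocabulary_ref}

    return vocabulary_ref, class_probabilities_ref, word_probabilities_ref

def classify_ref(email, class_probabilities_ref, word_probabilities_ref):
    # log p(email, y) = log p(y) + sum_w log p(w|y); words unseen in training are skipped.
    posterior_logprobs_ref = {}
    for y, p_y in class_probabilities_ref.items():
        logprob = math.log(p_y)
        for w in email:
            if w in word_probabilities_ref[y]:
                logprob += math.log(word_probabilities_ref[y][w])
        posterior_logprobs_ref[y] = logprob
    prediction_ref = max(posterior_logprobs_ref, key=posterior_logprobs_ref.get)
    return posterior_logprobs_ref, prediction_ref

def evaluate_ref(eval_set, class_probabilities_ref, word_probabilities_ref):
    # Fraction of e-mails whose predicted class matches the true label.
    num_correct = sum(
        1 for email, label in eval_set
        if classify_ref(email, class_probabilities_ref, word_probabilities_ref)[1] == label
    )
    return num_correct / len(eval_set)

vocabulary_ref, class_probabilities_ref, word_probabilities_ref = train_naive_bayes_ref(dataset[:4000])
print(evaluate_ref(dataset[:4000], class_probabilities_ref, word_probabilities_ref))
print(evaluate_ref(dataset[4000:], class_probabilities_ref, word_probabilities_ref))

# Words most indicative of spam, sorted by the ratio p(word|spam) / p(word|ham).
if "spam" in word_probabilities_ref and "ham" in word_probabilities_ref:
    most_spam_ratio_ref = sorted(
        vocabulary_ref,
        key=lambda w: word_probabilities_ref["spam"][w] / word_probabilities_ref["ham"][w],
    )
    print(most_spam_ratio_ref[-50:])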