# coding: utf-8

# In[ ]:

import math

def load_dataset():
    dataset = []
    with open("dataset.txt", "r") as f:
        for line in f:
            try:
                label, email = line.split('\t')
                email = email.replace("\n", "").replace(".", " ").replace(",", " ").replace("?", " ").replace("!", " ").replace("`", " ").replace("'", " ").lower().split(" ")
                dataset.append([email, label])
            except ValueError:
                # Skip malformed lines that do not contain exactly one tab separator.
                continue
    return dataset

dataset = load_dataset()
print(dataset[0])


# In[ ]:

def compute_class_probabilities(dataset):
    class_probabilities = {}
    class_counts = {}
    # Insert code here
    # class_counts[y] should be the number of times class y occurs in the dataset
    # class_probabilities[y] should be the MLE for p(y)
    return class_counts, class_probabilities

class_counts, class_probabilities = compute_class_probabilities(dataset[:4000])
print(class_probabilities)


# In[ ]:

def build_vocabulary(dataset):
    vocabulary = []
    # Insert code here
    # vocabulary should be a list of all words occurring in the dataset
    return vocabulary

vocabulary = build_vocabulary(dataset[:4000])


# In[ ]:

def compute_word_probabilities(dataset, vocabulary, class_counts):
    word_counts = {}         # e.g. {"spam": {}, "ham": {}}
    word_probabilities = {}  # e.g. {"spam": {}, "ham": {}}
    # Insert code here
    # word_counts[y][word] should be the number of times word occurs in e-mails of class y
    # word_probabilities[y][word] should be the MLE for p(word|y)
    return word_counts, word_probabilities

word_counts, word_probabilities = compute_word_probabilities(dataset[:4000], vocabulary, class_counts)
print(word_probabilities)


# In[ ]:

def classify(email, vocabulary, class_probabilities, word_probabilities):
    posterior_logprobs = {}
    # Insert code here
    # posterior_logprobs[y] should be log p(email, y) = log p(y) + log p(email|y)
    # prediction should be argmax_y log p(email, y)
    return posterior_logprobs, prediction

posterior_logprobs, prediction = classify(dataset[0][0], vocabulary, class_probabilities, word_probabilities)
print(dataset[0][0], posterior_logprobs, prediction)


# In[ ]:

def evaluate_model(dataset, vocabulary, class_probabilities, word_probabilities):
    num_correct = 0
    # Insert code here
    # num_correct should be the number of e-mails whose prediction matches their class
    return num_correct / len(dataset)

accuracy = evaluate_model(dataset[:4000], vocabulary, class_probabilities, word_probabilities)
print(accuracy)


# In[ ]:

accuracy = evaluate_model(dataset[4000:], vocabulary, class_probabilities, word_probabilities)
print(accuracy)


# In[ ]:

# Insert code here
# most_spam should be the list of words sorted in ascending order by p(word|spam),
# so the last 50 entries are the most probable words under the spam class.
print(most_spam[-50:])


# In[ ]:

# Insert code here
# most_ham should be the list of words sorted in ascending order by p(word|not spam)
print(most_ham[-50:])


# In[ ]:

# Insert code here
# most_spam_ratio should be the list of words sorted in ascending order by p(word|spam) / p(word|not spam)
print(most_spam_ratio[-50:])
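
# In[ ]:

# --- Reference sketch (not part of the exercise skeleton above) ---
# A minimal sketch of one way the blanks could be filled in, kept in separate
# *_ref functions so the skeleton stays untouched. The *_ref names are
# illustrative, not prescribed by the assignment. Two details are assumptions:
# add-one (Laplace) smoothing for p(word|y), and skipping words outside the
# training vocabulary at classification time; the skeleton only asks for plain
# MLE estimates. The class labels "spam" and "ham" are taken from the comments
# in compute_word_probabilities.

import math
from collections import Counter

def train_naive_bayes_ref(train_set):
    # Count classes and per-class word occurrences in one pass over the training e-mails.
    class_counts_ref = Counter(label for _, label in train_set)
    total = sum(class_counts_ref.values())
    class_probabilities_ref = {y: n / total for y, n in class_counts_ref.items()}

    word_counts_ref = {y: Counter() for y in class_counts_ref}
    for email, label in train_set:
        word_counts_ref[label].update(w for w in email if w)  # drop empty tokens from split(" ")

    vocabulary_ref = sorted(set(w for counts in word_counts_ref.values() for w in counts))

    # Estimate p(word|y) with add-one smoothing over the vocabulary (assumption),
    # so no word gets probability exactly zero under either class.
    word_probabilities_ref = {}
    for y, counts in word_counts_ref.items():
        denom = sum(counts.values()) + len(vocabulary_ref)
        word_probabilities_ref[y] = {w: (counts[w] + 1) / denom for w in vocabulary_ref}

    return vocabulary_ref, class_probabilities_ref, word_probabilities_ref

def classify_ref(email, class_probabilities_ref, word_probabilities_ref):
    # log p(email, y) = log p(y) + sum_w log p(w|y); words unseen in training are skipped.
    posterior_logprobs_ref = {}
    for y, p_y in class_probabilities_ref.items():
        logprob = math.log(p_y)
        for w in email:
            if w in word_probabilities_ref[y]:
                logprob += math.log(word_probabilities_ref[y][w])
        posterior_logprobs_ref[y] = logprob
    prediction_ref = max(posterior_logprobs_ref, key=posterior_logprobs_ref.get)
    return posterior_logprobs_ref, prediction_ref

def evaluate_ref(eval_set, class_probabilities_ref, word_probabilities_ref):
    # Fraction of e-mails whose predicted class matches the true label.
    num_correct = sum(
        1 for email, label in eval_set
        if classify_ref(email, class_probabilities_ref, word_probabilities_ref)[1] == label
    )
    return num_correct / len(eval_set)

vocabulary_ref, class_probabilities_ref, word_probabilities_ref = train_naive_bayes_ref(dataset[:4000])
print(evaluate_ref(dataset[:4000], class_probabilities_ref, word_probabilities_ref))
print(evaluate_ref(dataset[4000:], class_probabilities_ref, word_probabilities_ref))

# Words most indicative of spam, sorted by the ratio p(word|spam) / p(word|ham).
if "spam" in word_probabilities_ref and "ham" in word_probabilities_ref:
    most_spam_ratio_ref = sorted(
        vocabulary_ref,
        key=lambda w: word_probabilities_ref["spam"][w] / word_probabilities_ref["ham"][w],
    )
    print(most_spam_ratio_ref[-50:])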