def count_tweets(result, tweets, ys):
    '''
    Count how often each (word, label) pair occurs across the given tweets.

    Input:
        result: a dictionary that will be used to map each pair to its frequency.
            It is updated in place; counts already present are incremented.
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: a dictionary mapping each (word, label) pair to its frequency
    '''
    # zip() pairs each tweet with its label; it stops at the shorter of the two.
    for y, tweet in zip(ys, tweets):
        # process_tweet is defined outside this block; it is iterated here as
        # a sequence of words for the tweet.
        for word in process_tweet(tweet):
            # The key is the (word, label) tuple.
            pair = (word, y)
            # A pair seen for the first time starts from an implicit count of 0.
            result[pair] = result.get(pair, 0) + 1

    return result
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Train a binary Naive Bayes classifier from (word, label) frequencies.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not read by the computation; kept so the
            signature stays unchanged)
        train_y: a list of labels corresponding to the tweets (0,1)
    Output:
        logprior: the log prior, log(D_pos) - log(D_neg), where D_pos / D_neg
            are the numbers of positive / negative documents. If one class has
            no documents, np.log(0) makes this infinite.
        loglikelihood: dictionary mapping each vocabulary word to
            log(P(word | positive) / P(word | negative)), with add-one
            (Laplace) smoothing.
    '''
    loglikelihood = {}

    # V: the number of unique words in the vocabulary.
    vocab = {word for word, _ in freqs}
    V = len(vocab)

    # N_pos / N_neg: total word counts for the positive / negative class.
    N_pos = N_neg = 0
    for (_, label), count in freqs.items():
        # Any label greater than zero is treated as positive.
        if label > 0:
            N_pos += count
        else:
            N_neg += count

    # D: number of documents; D_pos / D_neg: positive / negative documents.
    D = len(train_y)
    D_pos = sum(1 for label in train_y if label == 1)
    D_neg = D - D_pos

    # Log prior: log of the ratio of positive to negative documents.
    logprior = np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        # A word that never occurred with a label has frequency 0 for it.
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # Add-one smoothing keeps both probabilities strictly positive, so
        # the ratio and its logarithm are always defined.
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Score a tweet with a trained Naive Bayes model.

    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the loglikelihoods of each word in the tweet
            (if found in the dictionary) + logprior (a number)
    '''
    # process_tweet is defined outside this block; it is iterated here as a
    # list of words for the tweet.
    word_l = process_tweet(tweet)

    # Start from the log prior, then add the evidence from each known word.
    p = logprior
    for word in word_l:
        # Words missing from the dictionary contribute nothing to the score.
        if word in loglikelihood:
            p += loglikelihood[word]

    return p
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Measure the accuracy of the Naive Bayes classifier on labelled tweets.

    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
            (a list or a NumPy array of 0/1 values)
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    # A score greater than 0 is classified as 1, anything else as 0.
    y_hats = [
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ]

    # Convert both sides to flat arrays before subtracting: a plain list does
    # not support "-", and a column-shaped label array would otherwise
    # broadcast against the flat predictions.
    predictions = np.asarray(y_hats)
    labels = np.ravel(test_y)

    # Error is the mean of the absolute differences between the predictions
    # and the labels; accuracy is 1 minus the error.
    error = np.mean(np.absolute(predictions - labels))
    accuracy = 1 - error

    return accuracy
def get_ratio(freqs, word):
    '''
    Build the positive/negative count summary for a single word.

    Input:
        freqs: dictionary containing the words
        word: string to lookup
    Output:
        a dictionary with keys 'positive', 'negative', and 'ratio'.
        Example: {'positive': 10, 'negative': 20, 'ratio': 0.5}
    '''
    # lookup() is defined outside this block; it is called with label 1 for
    # the positive count and label 0 for the negative count.
    positive = lookup(freqs, word, 1)
    negative = lookup(freqs, word, 0)

    # Adding 1 to both counts avoids dividing by zero when the word has no
    # negative occurrences.
    return {
        'positive': positive,
        'negative': negative,
        'ratio': (positive + 1) / (negative + 1),
    }
def get_words_by_threshold(freqs, label, threshold):
    '''
    Select the words whose positive/negative ratio passes a threshold.

    Input:
        freqs: dictionary of words, keyed by (word, label) pairs
        label: 1 for positive, 0 for negative
        threshold: ratio that will be used as the cutoff for including a word
            in the returned dictionary
    Output:
        word_list: dictionary containing the word and information on its
            positive count, negative count, and ratio of positive to negative
            counts. For label 1 a word is kept when ratio >= threshold; for
            label 0 when ratio <= threshold; for any other label nothing is
            kept.
            example of a key value pair:
            {'happi': {'positive': 10, 'negative': 20, 'ratio': 0.5}}
    '''
    word_list = {}
    seen = set()

    for word, _ in freqs:
        # A word can appear under both labels; its ratio does not depend on
        # which key we reached it from, so compute it only once.
        if word in seen:
            continue
        seen.add(word)

        # Get the positive/negative ratio for the word.
        pos_neg_ratio = get_ratio(freqs, word)
        ratio = pos_neg_ratio["ratio"]

        if label == 1 and ratio >= threshold:
            word_list[word] = pos_neg_ratio
        elif label == 0 and ratio <= threshold:
            word_list[word] = pos_neg_ratio
        # Otherwise the word is not included.

    return word_list
Part 5: 错误分析
可以观察一下模型判断错误的tweet都是什么样的:
Truth
Predicted
Tweet
1
0
b’’
1
0
b’truli later move know queen bee upward bound movingonup’