import numpy as np

def cosine_similarity(A, B):
    '''
    Input:
        A: a numpy array which corresponds to a word vector
        B: a numpy array which corresponds to a word vector
    Output:
        cos: a number representing the cosine similarity between A and B
    '''
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    cos = dot / (norma * normb)
    ### END CODE HERE ###
    return cos
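A quick sanity check (a minimal sketch; the vectors below are made up for illustration, not real word embeddings):

# Hypothetical 3-dimensional "embeddings", for illustration only
king = np.array([1.0, 2.0, 3.0])
queen = np.array([1.0, 2.1, 2.9])
print(cosine_similarity(king, queen))  # near 1.0, since the vectors are almost parallel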
def euclidean(A, B):
    """
    Input:
        A: a numpy array which corresponds to a word vector
        B: a numpy array which corresponds to a word vector
    Output:
        d: the Euclidean distance between A and B
    """
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # euclidean distance
    d = np.sum((A - B) * (A - B))
    d = np.sqrt(d)
    ### END CODE HERE ###
    return d
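Again a minimal sketch with made-up vectors:

# The distance between identical vectors is 0; it grows as the vectors diverge
print(euclidean(np.array([1.0, 2.0]), np.array([1.0, 2.0])))  # 0.0
print(euclidean(np.array([1.0, 2.0]), np.array([4.0, 6.0])))  # 5.0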
1.2 Finding the Capital of Each Country
Use the similarity measure above to compare word vectors and look up each country's capital, following the analogy King - Man + Woman = Queen.
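In vector terms, the unknown country's embedding is approximated by country1 - city1 + city2. For instance (a sketch, assuming a loaded `embeddings` dictionary mapping words to numpy arrays):

# Hypothetical example: the word whose embedding is closest to vec should be 'Egypt'
vec = embeddings['Greece'] - embeddings['Athens'] + embeddings['Cairo']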
def get_country(city1, country1, city2, embeddings):
    """
    Input:
        city1: a string (the capital city of country1)
        country1: a string (the country of capital1)
        city2: a string (the capital city of country2)
        embeddings: a dictionary where the keys are words and values are their embeddings
    Output:
        country: a tuple with the most likely country and its similarity score
    """
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # store city1, country1, and city2 in a set called group
    group = set((city1, country1, city2))

    # get embedding of city 1
    city1_emb = embeddings[city1]

    # get embedding of country 1
    country1_emb = embeddings[country1]

    # get embedding of city 2
    city2_emb = embeddings[city2]

    # get embedding of country 2 (a combination of the embeddings of country 1, city 1 and city 2)
    # Remember: King - Man + Woman = Queen
    vec = country1_emb - city1_emb + city2_emb

    # initialize the similarity to -1 (it will be replaced by similarities that are closer to +1)
    similarity = -1

    # initialize country to an empty string
    country = ''

    # loop through all words in the embeddings dictionary
    for word in embeddings.keys():

        # first check that the word is not already in the 'group'
        if word not in group:

            # get the word embedding
            word_emb = embeddings[word]

            # calculate cosine similarity between the embedding of country 2 and the current word
            cur_similarity = cosine_similarity(vec, word_emb)

            # if the cosine similarity is higher than the previous best...
            if cur_similarity > similarity:

                # update the similarity to the new, better similarity
                similarity = cur_similarity

                # store the country as a tuple, which contains the word and the similarity
                country = (word, similarity)

    ### END CODE HERE ###
    return country
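A quick check (a sketch; it assumes the `word_embeddings` dictionary from the assignment is loaded, and the exact score depends on the embeddings used):

# With these inputs the most likely answer should be 'Egypt'
print(get_country('Athens', 'Greece', 'Cairo', word_embeddings))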
def get_accuracy(word_embeddings, data):
    '''
    Input:
        word_embeddings: a dictionary where the key is a word and the value is its embedding
        data: a pandas dataframe containing all the country and capital city pairs
    Output:
        accuracy: the accuracy of the model
    '''
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # initialize num correct to zero
    num_correct = 0

    # loop through the rows of the dataframe
    for i, row in data.iterrows():

        # get city1
        city1 = row["city1"]

        # get country1
        country1 = row["country1"]

        # get city2
        city2 = row["city2"]

        # get country2
        country2 = row["country2"]

        # use get_country to find the predicted country2
        predicted_country2, _ = get_country(city1, country1, city2, word_embeddings)

        # if the predicted country2 is the same as the actual country2...
        if predicted_country2 == country2:
            # increment the number of correct predictions by 1
            num_correct += 1

    # get the number of rows in the data dataframe (length of dataframe)
    m = len(data)

    # calculate the accuracy by dividing the number correct by m
    accuracy = num_correct / m

    ### END CODE HERE ###
    return accuracy
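Assuming `word_embeddings` and the country/capital dataframe `data` from earlier in the assignment are loaded (a sketch):

accuracy = get_accuracy(word_embeddings, data)
print(f"Accuracy: {accuracy:.2f}")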
def compute_pca(X, n_components=2):
    """
    Input:
        X: of dimension (m, n) where each row corresponds to a word vector
        n_components: number of components you want to keep
    Output:
        X_reduced: the data transformed into n_components dims/columns
    """
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # mean center the data (the mean must be taken per column, axis=0)
    X_demeaned = X - np.mean(X, axis=0)

    # calculate the covariance matrix of the de-meaned data
    covariance_matrix = np.cov(X_demeaned, rowvar=False)

    # calculate eigenvectors & eigenvalues of the covariance matrix
    eigen_vals, eigen_vecs = np.linalg.eigh(covariance_matrix)

    # get the indices that would sort the eigenvalues from smallest to largest
    idx_sorted = np.argsort(eigen_vals)

    # reverse the order so that it's from highest to lowest
    idx_sorted_decreasing = idx_sorted[::-1]

    # sort the eigenvalues by idx_sorted_decreasing
    eigen_vals_sorted = eigen_vals[idx_sorted_decreasing]

    # sort the eigenvectors (columns) using the idx_sorted_decreasing indices
    eigen_vecs_sorted = eigen_vecs[:, idx_sorted_decreasing]

    # select the first n eigenvectors (n is the desired dimension
    # of the rescaled data array, or dims_rescaled_data)
    eigen_vecs_subset = eigen_vecs_sorted[:, 0:n_components]

    # transform the data by multiplying the transpose of the eigenvectors
    # with the transpose of the de-meaned data,
    # then take the transpose of that product
    X_reduced = np.dot(eigen_vecs_subset.transpose(), X_demeaned.transpose()).transpose()

    ### END CODE HERE ###
    return X_reduced
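A quick shape check (a sketch with random data, just to verify the reduction):

np.random.seed(1)
X = np.random.rand(3, 10)                    # 3 "word vectors" of dimension 10
X_reduced = compute_pca(X, n_components=2)
print(X_reduced.shape)                       # (3, 2)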