def get_matrices(en_fr, french_vecs, english_vecs):
    """
    Input:
        en_fr: English to French dictionary
        french_vecs: French words to their corresponding word embeddings.
        english_vecs: English words to their corresponding word embeddings.
    Output:
        X: a matrix where the rows are the English embeddings.
        Y: a matrix where the rows correspond to the French embeddings.
    """
    # X_l and Y_l are lists of the English and French word embeddings
    X_l = list()
    Y_l = list()

    # the English and French words that actually have an embedding
    english_set = set(english_vecs.keys())
    french_set = set(french_vecs.keys())

    # keep only the pairs where both words have an embedding
    for en_word, fr_word in en_fr.items():
        if en_word in english_set and fr_word in french_set:
            X_l.append(english_vecs[en_word])
            Y_l.append(french_vecs[fr_word])

    # stack the lists of vectors into matrices, one embedding per row
    X = np.vstack(X_l)
    Y = np.vstack(Y_l)

    return X, Y
def compute_loss(X, Y, R):
    '''
    Inputs:
        X: a matrix of dimension (m,n) where the rows are the English embeddings.
        Y: a matrix of dimension (m,n) where the rows correspond to the French embeddings.
        R: a matrix of dimension (n,n) - transformation matrix from
           English to French vector space embeddings.
    Outputs:
        L: a scalar value - the value of the loss function for given X, Y and R.
    '''
    m = X.shape[0]            # number of rows (word pairs)
    diff = np.dot(X, R) - Y   # XR - Y

    # square every entry of the difference and sum them up
    diff_squared = np.square(diff)
    sum_diff_squared = np.sum(diff_squared)

    # average over the number of examples
    loss = sum_diff_squared / m
    return loss
1.3 Computing the gradient of the loss with respect to the transformation matrix R
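For reference, compute_loss above evaluates L = (1/m) ||XR - Y||_F^2; differentiating with respect to R gives the gradient that compute_gradient implements:

\[
\frac{\partial L}{\partial R} = \frac{2}{m} X^\top (XR - Y)
\]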
def compute_gradient(X, Y, R):
    '''
    Inputs:
        X: a matrix of dimension (m,n) where the rows are the English embeddings.
        Y: a matrix of dimension (m,n) where the rows correspond to the French embeddings.
        R: a matrix of dimension (n,n) - transformation matrix from
           English to French vector space embeddings.
    Outputs:
        g: a matrix of dimension (n,n) - gradient of the loss function L for given X, Y and R.
    '''
    m = X.shape[0]

    # gradient of the loss with respect to R: (2/m) * X^T (XR - Y)
    gradient = np.dot(X.T, np.dot(X, R) - Y) * (2 / m)
    return gradient
def align_embeddings(X, Y, train_steps=100, learning_rate=0.0003):
    '''
    Inputs:
        X: a matrix of dimension (m,n) where the rows are the English embeddings.
        Y: a matrix of dimension (m,n) where the rows correspond to the French embeddings.
        train_steps: positive int - number of gradient descent steps to take.
        learning_rate: positive float - the step size used by gradient descent.
    Outputs:
        R: a matrix of dimension (n,n) - the projection matrix that minimizes the F norm ||X R - Y||^2
    '''
    np.random.seed(129)
    # initialize R randomly with dimension (n,n)
    R = np.random.rand(X.shape[1], X.shape[1])
    for i in range(train_steps):
        if i % 25 == 0:
            print(f"loss at iteration {i} is: {compute_loss(X, Y, R):.4f}")
        gradient = compute_gradient(X, Y, R)
        R -= learning_rate * gradient

    return R
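A minimal usage sketch of the training step; the dictionary names (en_fr_train, en_embeddings_subset, fr_embeddings_subset) and the hyperparameter values are assumptions, not part of the code above:

X_train, Y_train = get_matrices(en_fr_train, fr_embeddings_subset, en_embeddings_subset)  # hypothetical inputs
R_train = align_embeddings(X_train, Y_train, train_steps=400, learning_rate=0.8)          # illustrative hyperparameters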
def nearest_neighbor(v, candidates, k=1):
    """
    Input:
        - v, the vector you are going to find the nearest neighbor for
        - candidates: a set of vectors where we will find the neighbors
        - k: top k nearest neighbors to find
    Output:
        - k_idx: the indices of the top k closest vectors in sorted form
    """
    similarity_l = []

    # compute the cosine similarity between v and every candidate vector
    for row in candidates:
        cos_similarity = cosine_similarity(row, v)
        similarity_l.append(cos_similarity)

    # sort the similarities in ascending order and keep the indices of the k largest
    sorted_ids = np.argsort(similarity_l)
    k_idx = sorted_ids[-k:]

    return k_idx
def test_vocabulary(X, Y, R):
    '''
    Input:
        X: a matrix where the rows are the English embeddings.
        Y: a matrix where the rows correspond to the French embeddings.
        R: the transform matrix which translates word embeddings from
           English to French word vector space.
    Output:
        accuracy: the fraction of English words whose projected embedding
        is closest to the correct French embedding.
    '''
    # project every English embedding into the French vector space
    pred = np.dot(X, R)

    num_correct = 0
    for i in range(len(pred)):
        # index of the French embedding closest to the i-th prediction
        pred_idx = nearest_neighbor(pred[i], Y)

        # the translation is correct when the closest vector sits at the same row index
        if pred_idx == i:
            num_correct += 1

    accuracy = num_correct / pred.shape[0]
    return accuracy
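The transformation can then be scored on a held-out dictionary; en_fr_test is a hypothetical test dictionary and R_train comes from the sketch above:

X_val, Y_val = get_matrices(en_fr_test, fr_embeddings_subset, en_embeddings_subset)  # hypothetical test split
acc = test_vocabulary(X_val, Y_val, R_train)
print(f"accuracy on test set: {acc:.3f}")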
Part 2: LSH and document search
2.1 Computing document embeddings
Preprocess each document to remove noise.
Then the document embedding is simply the sum of the embeddings of the words in the document.
def get_document_embedding(tweet, en_embeddings):
    '''
    Input:
        - tweet: a string
        - en_embeddings: a dictionary of word embeddings
    Output:
        - doc_embedding: sum of all word embeddings in the tweet
    '''
    # every word embedding is a 300-dimensional vector
    doc_embedding = np.zeros(300)

    # clean the tweet and sum the embeddings of the remaining words;
    # words without an embedding contribute 0
    processed_doc = process_tweet(tweet)
    for word in processed_doc:
        doc_embedding += en_embeddings.get(word, 0)

    return doc_embedding
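A quick sanity check on a sample tweet; the tweet text and the dictionary name en_embeddings_subset are placeholders:

custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning"  # example text
tweet_embedding = get_document_embedding(custom_tweet, en_embeddings_subset)
print(tweet_embedding.shape)  # expected: (300,)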
def get_document_vecs(all_docs, en_embeddings):
    '''
    Input:
        - all_docs: list of strings - all tweets in our dataset.
        - en_embeddings: dictionary with words as the keys and their embeddings as the values.
    Output:
        - document_vec_matrix: matrix of tweet embeddings.
        - ind2Doc_dict: dictionary with indices of tweets in vecs as keys and their embeddings as the values.
    '''
    # the dictionary's key is an index (integer) that identifies a specific tweet
    # the value is the document embedding for that document
    ind2Doc_dict = {}

    # this is the list that will store the document vectors
    document_vec_l = []

    for i, doc in enumerate(all_docs):
        # get the document embedding of the tweet and store it in both containers
        doc_embedding = get_document_embedding(doc, en_embeddings)
        ind2Doc_dict[i] = doc_embedding
        document_vec_l.append(doc_embedding)

    # stack the list of document vectors into a 2D array, one tweet per row
    document_vec_matrix = np.vstack(document_vec_l)

    return document_vec_matrix, ind2Doc_dict
def hash_value_of_vector(v, planes):
    """Create a hash for a vector; hash_id says which random hash to use.
    Input:
        - v: vector of tweet. Its dimension is (1, N_DIMS)
        - planes: matrix of dimension (N_DIMS, N_PLANES) - the set of planes that divide up the region
    Output:
        - res: a number which is used as a hash for your vector
    """
    # for the set of planes,
    # calculate the dot product between the vector and the matrix containing the planes
    # remember that planes has shape (300, 10)
    # the dot product will have the shape (1, 10)
    dot_product = np.dot(v, planes)

    # the sign of each dot product tells on which side of each plane the vector lies;
    # turn it into a 0/1 indicator per plane
    sign_of_dot_product = np.sign(dot_product)
    h = sign_of_dot_product >= 0
    h = np.squeeze(h)

    # combine the indicators into a single integer: hash = sum_i 2^i * h_i
    hash_value = 0
    n_planes = planes.shape[1]
    for i in range(n_planes):
        hash_value += np.power(2, i) * h[i]

    return int(hash_value)
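A toy illustration of the hashing rule with 3 planes in 3 dimensions (all values are made up):

np.random.seed(0)
toy_planes = np.random.normal(size=(3, 3))  # 3-dimensional space, 3 random planes
toy_v = np.array([[1.0, 2.0, 3.0]])
print(hash_value_of_vector(toy_v, toy_planes))  # prints an integer in [0, 2**3)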
def make_hash_table(vecs, planes):
    """
    Input:
        - vecs: list of vectors to be hashed.
        - planes: the matrix of planes in a single "universe", with shape
          (embedding dimensions, number of planes).
    Output:
        - hash_table: dictionary - keys are hashes, values are lists of vectors (hash buckets)
        - id_table: dictionary - keys are hashes, values are list of vectors id's
          (it's used to know which tweet corresponds to the hashed vector)
    """
    # number of planes determines the number of buckets: 2^num_of_planes
    num_of_planes = planes.shape[1]
    num_buckets = 2 ** num_of_planes

    # create an empty bucket for every possible hash value
    hash_table = {}
    for i in range(num_buckets):
        hash_table[i] = []

    id_table = {}
    for i in range(num_buckets):
        id_table[i] = []

    for i, v in enumerate(vecs):
        # compute the hash value of the vector and place it in the matching bucket
        h = hash_value_of_vector(v, planes)
        hash_table[h].append(v)
        id_table[h].append(i)

    return hash_table, id_table
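approximate_knn below assumes that planes_l, hash_tables and id_tables have already been built. A minimal sketch of that setup, where the constant values and the document_vecs name are assumptions:

N_DIMS = 300      # embedding dimension (assumed)
N_PLANES = 10     # planes per universe (assumed)
N_UNIVERSES = 25  # number of independent sets of planes (assumed)

np.random.seed(0)
planes_l = [np.random.normal(size=(N_DIMS, N_PLANES)) for _ in range(N_UNIVERSES)]

hash_tables = []
id_tables = []
for universe_id in range(N_UNIVERSES):
    planes = planes_l[universe_id]
    hash_table, id_table = make_hash_table(document_vecs, planes)  # document_vecs from get_document_vecs
    hash_tables.append(hash_table)
    id_tables.append(id_table)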
def approximate_knn(doc_id, v, planes_l, k=1, num_universes_to_use=N_UNIVERSES):
    """Search for k-NN using hashes."""
    assert num_universes_to_use <= N_UNIVERSES

    # vectors that will be checked as possible nearest neighbors
    vecs_to_consider_l = list()

    # list of document IDs
    ids_to_consider_l = list()

    # create a set for ids to consider, for faster checking if a document ID already exists in the set
    ids_to_consider_set = set()

    # loop through the universes of planes
    for universe_id in range(num_universes_to_use):
        # get the set of planes for this particular universe_id
        planes = planes_l[universe_id]

        # get the hash value of the vector for this set of planes
        hash_value = hash_value_of_vector(v, planes)

        # get the hash table for this particular universe_id
        hash_table = hash_tables[universe_id]

        # get the list of document vectors stored under this hash value
        document_vectors_l = hash_table[hash_value]

        # get the id_table for this particular universe_id
        id_table = id_tables[universe_id]

        # get the subset of document ids to consider as nearest neighbors
        new_ids_to_consider = id_table[hash_value]

        # remove the doc_id we are searching for, so the document is not returned as its own neighbor
        if doc_id in new_ids_to_consider:
            new_ids_to_consider.remove(doc_id)
            print(f"removed doc_id {doc_id} of input vector from new_ids_to_search")

        # loop through the subset of document vectors to consider
        for i, new_id in enumerate(new_ids_to_consider):
            # only add a document once, even if it appears in several universes
            if new_id not in ids_to_consider_set:
                # get the document vector and add it to the list of candidate vectors
                document_vector_at_i = document_vectors_l[i]
                vecs_to_consider_l.append(document_vector_at_i)

                # append the new_id (the index of the document) to the list of ids to consider
                ids_to_consider_l.append(new_id)

                # also add the new_id to the set
                # (use this to check if new_id is not already in the IDs to consider)
                ids_to_consider_set.add(new_id)

    # now run k-NN on the smaller set of vecs-to-consider
    print("Fast considering %d vecs" % len(vecs_to_consider_l))

    # convert the list of candidate vectors to a numpy array
    vecs_to_consider_arr = np.array(vecs_to_consider_l)

    # call nearest neighbors on the reduced list of candidate vectors
    nearest_neighbor_idx_l = nearest_neighbor(v, vecs_to_consider_arr, k=k)

    # use the nearest neighbor index list as indices into the ids to consider
    # to build the list of nearest-neighbor document ids
    nearest_neighbor_ids = [ids_to_consider_l[idx] for idx in nearest_neighbor_idx_l]

    return nearest_neighbor_ids
print(f"Nearest neighbors for document {doc_id}") print(f"Document contents: {doc_to_search}") print("")
for neighbor_id in nearest_neighbor_ids: print(f"Nearest neighbor at document id {neighbor_id}") print(f"document contents: {all_tweets[neighbor_id]}")
result:
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
Fast considering 77 vecs

Nearest neighbors for document 0
Document contents: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Nearest neighbor at document id 2140
document contents: @PopsRamjet come one, every now and then is not so bad :)
Nearest neighbor at document id 701
document contents: With the top cutie of Bohol :) https://t.co/Jh7F6U46UB
Nearest neighbor at document id 51
document contents: #FollowFriday @France_Espana @reglisse_menthe @CCI_inter for being top engaged members in my community this week :)