1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
    """Yield balanced batches of padded tweet tensors with labels and weights.

    Every batch contains ``batch_size // 2`` positive examples followed by
    the same number of negative examples. Each example is converted with
    ``tweet_to_tensor(tweet, vocab_dict)`` and right-padded with zeros to the
    length of the longest tensor in that batch.

    Input:
        data_pos - Sequence of positive examples
        data_neg - Sequence of negative examples
        batch_size - Number of samples per batch. Must be even
        loop - If True, wrap around to the start of a dataset when it is
            exhausted (reshuffling it when ``shuffle`` is set); if False,
            stop as soon as either dataset cannot fill its half of a batch
            (the incomplete batch is discarded)
        vocab_dict - The words dictionary passed to ``tweet_to_tensor``
        shuffle - Shuffle the order in which examples are visited
    Yield:
        inputs - Array built from the padded positive and negative tensors
        targets - Array of labels: 1 for each positive, then 0 for each
            negative example
        example_weights - Array of ones with the same shape as ``targets``
    Raises:
        AssertionError - if ``batch_size`` is odd (raised on first iteration)
    """
    assert batch_size % 2 == 0, "batch_size must be even"

    # Half of every batch comes from each class.
    n_to_take = batch_size // 2

    # Cursors into the (possibly shuffled) visiting orders below.
    pos_index = 0
    neg_index = 0

    len_data_pos = len(data_pos)
    len_data_neg = len(data_neg)

    # Indirection lists: shuffling these leaves the caller's data untouched.
    pos_index_lines = list(range(len_data_pos))
    neg_index_lines = list(range(len_data_neg))

    if shuffle:
        rnd.shuffle(pos_index_lines)
        rnd.shuffle(neg_index_lines)

    stop = False

    while not stop:
        batch = []

        # Positive half of the batch.
        for _ in range(n_to_take):
            if pos_index >= len_data_pos:
                if not loop:
                    stop = True
                    break
                # Wrap around and start a fresh pass over the positives.
                pos_index = 0
                if shuffle:
                    rnd.shuffle(pos_index_lines)
            tweet = data_pos[pos_index_lines[pos_index]]
            batch.append(tweet_to_tensor(tweet, vocab_dict))
            pos_index += 1

        # Negative half of the batch; pointless once we already know the
        # batch will be discarded.
        if not stop:
            for _ in range(n_to_take):
                if neg_index >= len_data_neg:
                    if not loop:
                        stop = True
                        break
                    # Wrap around and start a fresh pass over the negatives.
                    neg_index = 0
                    if shuffle:
                        rnd.shuffle(neg_index_lines)
                tweet = data_neg[neg_index_lines[neg_index]]
                batch.append(tweet_to_tensor(tweet, vocab_dict))
                neg_index += 1

        if stop:
            break

        # NOTE: the cursors were already advanced once per example inside
        # the loops above; advancing them by n_to_take again here would skip
        # every other half-batch of data.

        # Right-pad every tensor with zeros to the longest one in the batch.
        max_len = max(len(t) for t in batch)
        tensor_pad_l = [
            tensor + [0] * (max_len - len(tensor)) for tensor in batch
        ]

        inputs = np.array(tensor_pad_l)

        # Labels follow the batch layout: positives first, then negatives.
        targets = np.array([1] * n_to_take + [0] * n_to_take)

        # All examples are weighted equally.
        example_weights = np.ones_like(targets)

        yield inputs, targets, example_weights
|