import random as rnd
import numpy as np


def data_generator(Q1, Q2, batch_size, pad=1, shuffle=True):
    """Generator function that yields batches of data.

    Args:
        Q1 (list): List of transformed (to tensor) questions.
        Q2 (list): List of transformed (to tensor) questions.
        batch_size (int): Number of elements per batch.
        pad (int, optional): Pad character from the vocab. Defaults to 1.
        shuffle (bool, optional): Whether the batches should be randomized. Defaults to True.

    Yields:
        tuple: Of the form (input1, input2) with types (numpy.ndarray, numpy.ndarray).
            input1: inputs to your model [q1a, q2a, q3a, ...]; (q1a, q1b) are duplicates.
            input2: targets to your model [q1b, q2b, q3b, ...]; (q1a, q2i) with i != a are not duplicates.
    """
    input1 = []
    input2 = []
    idx = 0
    len_q = len(Q1)
    question_indexes = [*range(len_q)]

    if shuffle:
        rnd.shuffle(question_indexes)

    while True:
        # Wrap around once the whole dataset has been consumed,
        # reshuffling the order if requested.
        if idx >= len_q:
            idx = 0
            if shuffle:
                rnd.shuffle(question_indexes)

        # Pick the paired questions at the current (possibly shuffled) index.
        q1 = Q1[question_indexes[idx]]
        q2 = Q2[question_indexes[idx]]
        idx += 1

        input1.append(q1)
        input2.append(q2)

        if len(input1) == batch_size:
            # Pad every question to the longest one in the batch,
            # rounded up to the next power of 2.
            max_len = max(max([len(q) for q in input1]),
                          max([len(q) for q in input2]))
            max_len = 2 ** int(np.ceil(np.log2(max_len)))

            b1 = []
            b2 = []
            for q1, q2 in zip(input1, input2):
                q1 = q1 + [pad] * (max_len - len(q1))
                q2 = q2 + [pad] * (max_len - len(q2))
                b1.append(q1)
                b2.append(q2)

            yield np.array(b1), np.array(b2)

            # Reset the accumulators for the next batch.
            input1, input2 = [], []
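

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): the token-id lists and pad value
# below are made up for this example and are not taken from the original data.
# It shows that each yielded batch is padded to the next power of 2 of the
# longest question in that batch.
# ---------------------------------------------------------------------------
Q1_sample = [[32, 15, 42, 16, 4], [12, 7, 9], [55, 3, 21, 8]]
Q2_sample = [[32, 15, 42, 16, 7], [12, 7, 10], [55, 3, 22]]

batch_gen = data_generator(Q1_sample, Q2_sample, batch_size=2, pad=1, shuffle=False)
b1, b2 = next(batch_gen)
print(b1.shape, b2.shape)  # (2, 8) (2, 8): longest question has 5 tokens, padded up to 8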