鱼C论坛

 找回密码
 立即注册
查看: 1498|回复: 1

[技术交流] Python实现seq2seq【tensorflow2.3】

[复制链接]
发表于 2021-4-21 20:43:26 | 显示全部楼层 |阅读模式

马上注册,结交更多好友,享用更多功能^_^

您需要 登录 才可以下载或查看,没有账号?立即注册

x
本帖最后由 糖逗 于 2021-4-22 11:12 编辑

一、导入库
  1. import numpy as np
  2. import time
  3. import tensorflow as tf
  4. from tensorflow import keras
  5. from tensorflow.keras import layers, optimizers
  6. import tensorflow_addons as tfa
  7. import datetime
  8. from sklearn.model_selection import train_test_split
复制代码




二、日期数据生成
  1. PAD_ID = 0
  2. class DateData:
  3.     def __init__(self, n):
  4.         np.random.seed(1)
  5.         self.date_cn = []
  6.         self.date_en = []
  7.         for timestamp in np.random.randint(143835585, 2043835585, n):
  8.             date = datetime.datetime.fromtimestamp(timestamp)
  9.             self.date_cn.append(date.strftime("%y-%m-%d"))
  10.             self.date_en.append(date.strftime("%d/%b/%Y"))
  11.         self.vocab = set(
  12.             [str(i) for i in range(0, 10)] + ["-", "/", "<GO>", "<EOS>"] + [
  13.                 i.split("/")[1] for i in self.date_en])
  14.         self.v2i = {v: i for i, v in enumerate(sorted(list(self.vocab)), start=1)}
  15.         self.v2i["<PAD>"] = PAD_ID
  16.         self.vocab.add("<PAD>")
  17.         self.i2v = {i: v for v, i in self.v2i.items()}
  18.         self.x, self.y = [], []
  19.         for cn, en in zip(self.date_cn, self.date_en):
  20.             self.x.append([self.v2i[v] for v in cn])
  21.             self.y.append(
  22.                 [self.v2i["<GO>"], ] + [self.v2i[v] for v in en[:3]] + [
  23.                     self.v2i[en[3:6]], ] + [self.v2i[v] for v in en[6:]] + [
  24.                     self.v2i["<EOS>"], ])
  25.         self.x, self.y = np.array(self.x), np.array(self.y)
  26.         self.start_token = self.v2i["<GO>"]
  27.         self.end_token = self.v2i["<EOS>"]

  28.     def sample(self, n=64):
  29.         bi = np.random.randint(0, len(self.x), size=n)
  30.         bx, by = self.x[bi], self.y[bi]
  31.         decoder_len = np.full((len(bx),), by.shape[1] - 1, dtype=np.int32)
  32.         return bx, by, decoder_len

  33.     def idx2str(self, idx):
  34.         x = []
  35.         for i in idx:
  36.             x.append(self.i2v[i])
  37.             if i == self.end_token:
  38.                 break
  39.         return "".join(x)

  40.     @property
  41.     def num_word(self):
  42.         return len(self.vocab)
复制代码


三、模型构建
  1. class seq2seq(keras.Model):
  2.     def __init__(self, source_dict_total_words, source_embedding_size, encoder_num_layers, encoder_rnn_size,
  3.                 target_dict_total_words, target_embedding_size, decoder_rnn_size, start_token, batch_size,
  4.                 end_token, target_size):
  5.         super(seq2seq, self).__init__()
  6.         '''
  7.         encoder参数说明
  8.         --source_dict_total_words:source字典的总单词个数
  9.         --source_embedding_size:souce压缩的长度
  10.         --encoder_num_layers:encoder堆叠的rnn cell数量
  11.         --encoder_rnn_size:encoder中RNN单元的隐层结点数量
  12.         decoder参数说明
  13.         --target_dict_total_words:target字典的总单词个数
  14.         --target_embedding_size:target压缩的长度:
  15.         --decoder_num_layers:decoder堆叠的rnn cell数量
  16.         --decoder_rnn_size:decoder中RNN单元的隐层结点数量
  17.         --target_size:target中句子的长度
  18.         其他参数说明
  19.         --start_token:decoder输入的开始标志<GO>在target字典中的对应数字编号
  20.         --end_token:decoder输入的结束标志<EOS>在target字典中的对应数字编号
  21.         --batch_size:数据的batch_size
  22.         '''
  23.         
  24.         self.source_dict_total_words = source_dict_total_words
  25.         self.source_embedding_size = source_embedding_size
  26.         self.encoder_num_layers = encoder_num_layers
  27.         self.encoder_rnn_size = encoder_rnn_size
  28.         self.target_dict_total_words = target_dict_total_words
  29.         self.target_embedding_size = target_embedding_size
  30.         self.decoder_rnn_size = decoder_rnn_size
  31.         self.start_token = start_token
  32.         self.batch_size = batch_size
  33.         self.end_token = end_token
  34.         self.target_size = target_size
  35.         
  36.         self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits = True)
  37.         self.optimzer = optimizers.Adam(lr = 1e-2)
  38.         self.seq_len = tf.fill([self.batch_size], self.target_size-1)
  39.         
  40.    
  41.    
  42.    
  43.         #######################Encoder##################################
  44.         #1.embedding
  45.         self.encoder_embedding = layers.Embedding(self.source_dict_total_words, self.source_embedding_size,
  46.                                                   embeddings_initializer = tf.initializers.RandomNormal(0., 0.1))
  47.         #2.单层或多层rnn
  48.         self.encoder_rnn_cells = [layers.LSTMCell(self.encoder_rnn_size, dropout = 0.5) for _ in range(self.encoder_num_layers)]
  49.         self.encoder_stacked_lstm = layers.StackedRNNCells(self.encoder_rnn_cells)
  50.         self.encoder_rnn = layers.RNN(self.encoder_stacked_lstm, return_state = True, return_sequences = True)
  51.         #######################Decoder##################################  
  52.         #1.embedding
  53.         self.decoder_embedding = layers.Embedding(self.target_dict_total_words, self.target_embedding_size,
  54.                                                   embeddings_initializer = tf.initializers.RandomNormal(0., 0.1))
  55.         #2.构造Decoder中的rnn单元
  56.         self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.decoder_rnn_size)
  57.         #3.构造Decoder中的dense单元
  58.         self.decoder_dense_layer = layers.Dense(self.target_dict_total_words,
  59.                                                 kernel_initializer = tf.compat.v1.truncated_normal_initializer(mean = 0.0, stddev = 0.1))
  60.         #3.train
  61.         self.decoder_sampler = tfa.seq2seq.TrainingSampler()
  62.         self.training_decoder = tfa.seq2seq.BasicDecoder(cell = self.decoder_rnn_cell, sampler = self.decoder_sampler,
  63.                                                          output_layer = self.decoder_dense_layer)
  64.         #4.predict     
  65.         self.sampler = tfa.seq2seq.GreedyEmbeddingSampler()
  66.         self.predicting_decoder = tfa.seq2seq.BasicDecoder(cell = self.decoder_rnn_cell, sampler = self.sampler,
  67.                                                            output_layer = self.decoder_dense_layer)
  68.         
  69.         
  70.         
  71.     def encode(self, source):
  72.         embedded = self.encoder_embedding(source)
  73.         #init_s = [tf.zeros((source.shape[0], self.encoder_rnn_size)), tf.zeros((source.shape[0], self.encoder_rnn_size))]
  74.         res_list = self.encoder_rnn(embedded)
  75.         encoder_hidden = res_list[1][0]
  76.         encoder_state = res_list[1][1]
  77.         #print(encoder_hidden.shape, encoder_state.shape)
  78.         return [encoder_hidden, encoder_state]
  79.    
  80.     def train(self, source, target):
  81.         state = self.encode(source)
  82.         decoder_input = target[:, :-1]   #ignore <EOS>
  83.         decoder_embeding_input = self.decoder_embedding(decoder_input)
  84.         
  85.         output, _, _ = self.training_decoder(decoder_embeding_input, initial_state = state, sequence_length = self.seq_len)
  86.         return output.rnn_output
  87.    
  88.    
  89.     def predict(self, source):
  90.         initial_state = self.encode(source)
  91.         done, inputs, state = self.predicting_decoder.initialize(
  92.             self.decoder_embedding.variables[0],
  93.             start_tokens = tf.fill([self.batch_size, ], self.start_token),
  94.             end_token = self.end_token,
  95.             initial_state = initial_state,
  96.         )

  97.         pred_id = np.zeros((self.batch_size, self.target_size), dtype = np.int32)
  98.         for time in range(self.target_size):
  99.             output, state, inputs, done = self.predicting_decoder.step(
  100.                 time = time, inputs = inputs, state = state, training = False)
  101.             pred_id[:, time] = output.sample_id
  102.         return pred_id
  103.    
  104.    
  105.     def step(self, source, target):
  106.         with tf.GradientTape() as tape:
  107.             logits = self.train(source, target)
  108.             dec_out = target[:, 1:]  # ignore <GO>
  109.             loss = self.cross_entropy(dec_out, logits)
  110.         grads = tape.gradient(loss, self.trainable_variables)
  111.         self.optimzer.apply_gradients(zip(grads, self.trainable_variables))
  112.         return loss.numpy()
复制代码


四、数据实验
  1. epochs = 200
  2. batch_size = 248

  3. data = DateData(4000)
  4. print("1.Chinese time order: yy/mm/dd ", data.date_cn[:3], "\n2.English time order: dd/M/yyyy ", data.date_en[:3])
  5. print("3.vocabularies: \n", data.vocab)
  6. print("4.x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]),
  7.       "\n5.y index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0]))

  8. train_db = tf.data.Dataset.from_tensor_slices((np.array(data.x), np.array(data.y)))
  9. train_db = train_db.batch(batch_size, drop_remainder=True)

  10. optimizer = optimizers.Adam(lr = 1e-2)
  11. model = seq2seq(source_dict_total_words = data.num_word, source_embedding_size = 16, encoder_num_layers = 1, encoder_rnn_size = 32,
  12.                 target_dict_total_words = data.num_word, target_embedding_size = 16, decoder_rnn_size = 32,
  13.                 start_token = data.start_token, batch_size = batch_size,
  14.                 end_token = data.end_token, target_size = 11)#target_size是target单个句子的长度,包括<GO>和<EOS>

  15. for epoch in range(epochs):
  16.     for step, (source, target) in enumerate(train_db):
  17.         loss = model.step(source, target)
  18.    
  19.         if step % 5 == 0:
  20.             target = data.idx2str(np.array(target[0, 1:-1]))
  21.             pred = model.predict(source[0:1])
  22.             res = data.idx2str(pred[0])
  23.             src = data.idx2str(np.array(source[0]))
  24.             print(
  25.                 "epoch: ", epoch,
  26.                 "step:", step,
  27.                 "| loss: %.3f" % loss,
  28.                 "| input: ", src,
  29.                 "| target: ", target,
  30.                 "| inference: ", res,
  31.             )
复制代码




代码参考:https://mofanpy.com/tutorials/machine-learning/nlp/seq2seq/
代码参考自莫烦的seq2seq项目。在原有的基础上加上自己的理解。

本帖被以下淘专辑推荐:

小甲鱼最新课程 -> https://ilovefishc.com
回复

使用道具 举报

 楼主| 发表于 2021-4-21 20:43:57 | 显示全部楼层
小甲鱼最新课程 -> https://ilovefishc.com
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2025-5-20 05:45

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表