鱼C论坛

 找回密码
 立即注册
查看: 180|回复: 1

[学习笔记] python实现BERT【tensorflow2.4.0】【命名实体识别】

[复制链接]
发表于 2021-11-14 15:11:27 | 显示全部楼层 |阅读模式

马上注册,结交更多好友,享用更多功能^_^

您需要 登录 才可以下载或查看,没有账号?立即注册

x
本帖最后由 糖逗 于 2021-11-14 15:18 编辑

一、数据预处理+模型训练
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # uncomment to force CPU-only TensorFlow
train_data_path = "E:/.../1.NLP/地址识别项目/data/train.conll" # training data
test_data_path = "E:/.../1.NLP/地址识别项目/data/dev.conll" # test data


# Build the tag index: mytag.dic holds one NER label string per row.
import pandas as pd
store = pd.read_table(r"E:\...\1.NLP\地址识别项目\data\mytag.dic", header = None)
# Manual fixes for two rows of the dictionary file
# (presumably garbled/missing entries — confirm against mytag.dic).
store.loc[0, 0] = "O"
store.loc[24, 0] = "B-prov"
store1 = store.to_dict()
idx2label = store1[0]  # {row index -> label string}
label2idx = {idx: label for label, idx in idx2label.items()}  # inverse map: {label -> id}
CLASS_NUM = len(label2idx)  # number of NER classes fed to the model head


# Read the text and its annotations.
from tensorflow.keras.preprocessing import sequence  # NOTE(review): imported but unused in this file
import numpy as np
MAX_LEN = 55  # fixed sequence length used for padding/truncation everywhere below
  18. # 读取训练语料
  19. def read_corpus(corpus_path, label2idx):
  20.     datas, labels = [], []
  21.     with open(corpus_path, encoding='utf-8') as fr:
  22.         lines = fr.readlines()
  23.     sent_, tag_ = [], []
  24.     for line in lines:
  25.         if line != '\n':
  26.             char, label = line.strip().split()
  27.             sent_.append(char)
  28.             tag_.append(label)
  29.         else:
  30.             sent_ids = [char for char in sent_]
  31.             tag_ids = [label2idx[label] if label in label2idx else 0 for label in tag_]
  32.             datas.append(sent_ids)
  33.             labels.append(tag_ids)
  34.             sent_, tag_ = [], []
  35.     return datas, labels

# Load the training set.
train_datas, train_labels = read_corpus(train_data_path, label2idx)
# Load the test set.
test_datas, test_labels = read_corpus(test_data_path, label2idx)

# In[1]
from transformers import BertTokenizer

# Use BERT's tokenizer to turn characters into vocabulary ids.
PRETRAINED_MODEL_NAME = r"D:\bert_model\bert-base-chinese"  # local path to the Chinese BERT checkpoint
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

  45. def pad(train_datas):
  46.     for i in range(len(train_datas)):
  47.         #train_datas[i].insert(0, "[CLS]")
  48.         #train_datas[i].append('[SEP]')
  49.         if len(train_datas[i]) < MAX_LEN:
  50.             train_datas[i].extend(['[PAD]'] * (MAX_LEN - len(train_datas[i])))
  51.         elif len(train_datas[i]) > MAX_LEN:
  52.             train_datas[i] = train_datas[i][0:MAX_LEN]
  53.     return train_datas

# Pad/truncate both splits to MAX_LEN tokens and stack into arrays.
train_datas_pad = np.array(pad(train_datas))
test_datas_pad = np.array(pad(test_datas))

  56. def txt2bert(train_datas_pad):
  57.     train_data_bert = []
  58.     for i in range(len(train_datas_pad)):
  59.         store = []
  60.         for j in range(len(train_datas_pad[i])):
  61.             store.append(tokenizer.convert_tokens_to_ids(train_datas_pad[i][j]))
  62.         train_data_bert.append(store)
  63.     return train_data_bert

# Convert both splits to BERT vocabulary ids and stack into arrays.
train_data_bert = np.array(txt2bert(train_datas_pad))
test_data_bert = np.array(txt2bert(test_datas_pad))
print(train_data_bert.shape)
print(test_data_bert.shape)
# Sanity check: round-trip the first sample's ids back to tokens.
print(tokenizer.convert_ids_to_tokens(train_data_bert[0]))
# In[1]
  70. def pad_test(train_labels):
  71.     for i in range(len(train_labels)):
  72.         #train_labels[i].insert(0, 57)
  73.         #train_labels[i].append(58)
  74.         if len(train_labels[i]) < MAX_LEN:
  75.             train_labels[i].extend([0] * (MAX_LEN - len(train_labels[i])))
  76.         elif len(train_labels[i]) > MAX_LEN:
  77.             train_labels[i] = train_labels[i][0:MAX_LEN]
  78.     return train_labels
  79.    
# Pad/truncate the label sequences to MAX_LEN and stack into arrays.
train_labels_pad = np.array(pad_test(train_labels))
test_labels_pad = np.array(pad_test(test_labels))
print(train_labels_pad.shape)
print(test_labels_pad.shape)
# Sanity check: decode the first sample's label ids back to tag strings.
print([idx2label[char] for char in train_labels_pad[0]])


  85. # In[3]
  86. import tensorflow as tf

  87. def preprocess(train_ids, train_labels):
  88.     x = tf.cast(train_ids, dtype = tf.int64)
  89.     y = tf.cast(train_labels, dtype = tf.int64)
  90.     return x, y



# Wrap the arrays in tf.data pipelines: cast to int64 and batch by 32.
train_db = tf.data.Dataset.from_tensor_slices((train_data_bert, train_labels_pad))
train_db = train_db.map(preprocess).batch(32)


test_db = tf.data.Dataset.from_tensor_slices((test_data_bert, test_labels_pad))
test_db = test_db.map(preprocess).batch(32)
# In[3]
  96. from transformers import TFBertForTokenClassification
  97. from tensorflow import keras

  98. model = TFBertForTokenClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels = CLASS_NUM)



  99. model.layers[-1].activation = tf.keras.activations.softmax

  100. optimizer = tf.keras.optimizers.Adam(learning_rate=1e-6)
  101. loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  102. metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

  103. model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

  104. model.summary()


  105. history = model.fit(x = train_data_bert, y = train_labels_pad, validation_split=0.1,batch_size=32, epochs=10).history

  106. # In[1]

  107. #print(tf.test.is_gpu_available())
复制代码


#x_test, y_test =  next(iter(train_db))
#preds = model(input_ids = x_test, attention_mask = mask_test)

# Pull one training batch and run the fine-tuned model on it.
x_test, y_test =  next(iter(train_db))
# Attention mask: token ids are always >= 0, so this attends to every
# position including [PAD] (id 0).  NOTE(review): probably intended to be
# x_test > 0 to mask padding — confirm.
mask_test = np.int64(x_test) >= 0
mask_test = tf.convert_to_tensor(mask_test)
preds = model(input_ids = x_test, attention_mask = mask_test)

# Per-token class probabilities and hard label predictions.
prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)
#pred = tf.reshape(pred, (1, -1))
# Count of real (non-[PAD]) tokens in sample 1 of the batch.
store = tf.reduce_sum(tf.cast(x_test[1] >0 , dtype = tf.int32))
# Show sample 1's predicted tag strings, then the matching tokens.
print([idx2label[char] for char in np.array(pred[1])][:int(store)])

print(tokenizer.convert_ids_to_tokens(x_test[1])[:int(store)])
复制代码



二、模型效果查看
#x_test, y_test =  next(iter(train_db))
#preds = model(input_ids = x_test, attention_mask = mask_test)

# (Verbatim duplicate of the previous snippet.)
# Pull one training batch and run the fine-tuned model on it.
x_test, y_test =  next(iter(train_db))
# Attention mask: token ids are always >= 0, so this attends to every
# position including [PAD] (id 0).  NOTE(review): probably intended to be
# x_test > 0 to mask padding — confirm.
mask_test = np.int64(x_test) >= 0
mask_test = tf.convert_to_tensor(mask_test)
preds = model(input_ids = x_test, attention_mask = mask_test)

# Per-token class probabilities and hard label predictions.
prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)
#pred = tf.reshape(pred, (1, -1))
# Count of real (non-[PAD]) tokens in sample 1 of the batch.
store = tf.reduce_sum(tf.cast(x_test[1] >0 , dtype = tf.int32))
# Show sample 1's predicted tag strings, then the matching tokens.
print([idx2label[char] for char in np.array(pred[1])][:int(store)])

print(tokenizer.convert_ids_to_tokens(x_test[1])[:int(store)])
复制代码



三、模型结果评价指标
  1. test_data_bert
  2. mask_test = np.int64(test_data_bert) >= 0
  3. mask_test = tf.convert_to_tensor(mask_test)
  4. preds = model(input_ids = test_data_bert, attention_mask = mask_test)

  5. prob = tf.nn.softmax(preds.logits, axis = -1)
  6. pred = tf.argmax(prob, axis = -1)
  7. pred = tf.cast(pred, dtype = tf.int64)



  8. mark_labels = test_labels_pad > 0
  9. total_num = tf.reduce_sum(tf.cast(mark_labels, dtype = tf.int32))
  10. print(total_num)
  11. equal_labels = tf.equal(pred, test_labels_pad)
  12. false_labels = test_labels_pad < 0
  13. true_pred = tf.reduce_sum(tf.cast(tf.where(mark_labels, equal_labels, false_labels), dtype = tf.int32))
  14. print(true_pred)
  15. print(true_pred/total_num)
  16. store = tf.reduce_sum(tf.cast(tf.equal(pred, test_labels_pad),  dtype=tf.int32))
  17. print(store / (1970*55))
  18. #x_test, y_test =  next(iter(train_db))
  19. #preds = model(input_ids = x_test, attention_mask = mask_test)

  20. x_test, y_test =  next(iter(train_db))
  21. mask_test = np.int64(x_test) >= 0
  22. mask_test = tf.convert_to_tensor(mask_test)
  23. preds = model(input_ids = x_test, attention_mask = mask_test)
  24.         
  25. prob = tf.nn.softmax(preds.logits, axis=-1)
  26. pred = tf.argmax(prob, axis=-1)
  27. pred = tf.cast(pred, dtype=tf.int64)
  28. #pred = tf.reshape(pred, (1, -1))
  29. store = tf.reduce_sum(tf.cast(x_test[1] >0 , dtype = tf.int32))
  30. print([idx2label[char] for char in np.array(pred[1])][:int(store)])

  31. print(tokenizer.convert_ids_to_tokens(x_test[1])[:int(store)])      
  32. pred_labels = []
  33. mask_test = np.int64(test_data_bert) >= 0
  34. mask_test = tf.convert_to_tensor(mask_test)
  35. pred = model(input_ids = test_data_bert, attention_mask = mask_test)
  36. prob = tf.nn.softmax(pred.logits, axis=-1)
  37. pred = tf.argmax(prob, axis=-1)
  38. pred = tf.cast(pred, dtype=tf.int64)
  39. print(pred)

  40. from sklearn.metrics import f1_score
  41. f1_score(test_labels_pad.reshape(-1), np.array(pred).reshape(-1), average='macro')

  42. from sklearn.metrics import f1_score
  43. f1_score(test_labels_pad.reshape(-1), y_predict.reshape(-1), average='micro')

  44. from sklearn.metrics import classification_report
  45. temp = classification_report(test_labels_pad.reshape(-1), y_predict.reshape(-1))
  46. print(temp)
  47.                         
复制代码

本帖被以下淘专辑推荐:

想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
 楼主| 发表于 2021-11-14 15:12:44 | 显示全部楼层
任务背景+数据获取看上一篇:https://fishc.com.cn/forum.php?m ... 05523&ctid=1732
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1

GMT+8, 2022-5-24 16:42

Powered by Discuz! X3.4

Copyright © 2001-2021, Tencent Cloud.

快速回复 返回顶部 返回列表