1. Data preprocessing and model training
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
train_data_path = "E:/.../1.NLP/地址识别项目/data/train.conll"  # training data
test_data_path = "E:/.../1.NLP/地址识别项目/data/dev.conll"    # test data

# Build the label index used for tagging
import pandas as pd
store = pd.read_table(r"E:\...\1.NLP\地址识别项目\data\mytag.dic", header=None)
store.loc[0, 0] = "O"
store.loc[24, 0] = "B-prov"
idx2label = store.to_dict()[0]                                # index -> label
label2idx = {label: idx for idx, label in idx2label.items()}  # label -> index
CLASS_NUM = len(label2idx)

# Read the text and its annotations
import numpy as np
MAX_LEN = 55

# Load a corpus: one "char tag" pair per line, blank line between sentences
def read_corpus(corpus_path, label2idx):
    datas, labels = [], []
    with open(corpus_path, encoding='utf-8') as fr:
        lines = fr.readlines()
    sent_, tag_ = [], []
    for line in lines:
        if line != '\n':
            char, label = line.strip().split()
            sent_.append(char)
            tag_.append(label)
        else:
            tag_ids = [label2idx[label] if label in label2idx else 0 for label in tag_]
            datas.append(sent_)
            labels.append(tag_ids)
            sent_, tag_ = [], []
    if sent_:  # keep the last sentence when the file has no trailing blank line
        datas.append(sent_)
        labels.append([label2idx[label] if label in label2idx else 0 for label in tag_])
    return datas, labels

# Load the training set
train_datas, train_labels = read_corpus(train_data_path, label2idx)
# Load the test set
test_datas, test_labels = read_corpus(test_data_path, label2idx)
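For orientation, read_corpus assumes the usual character-level CoNLL layout: one "character tag" pair per line separated by whitespace, with a blank line between sentences. A toy illustration of that layout (the I-prov/B-city tags here are assumed BIO companions of the B-prov tag seen above, not taken from the actual data):

浙 B-prov
江 I-prov
省 I-prov
杭 B-city
州 I-city
市 I-city

北 B-prov
...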
# In[1]
from transformers import BertTokenizer
# Use BERT's tokenizer to map characters to vocabulary ids
PRETRAINED_MODEL_NAME = r"D:\bert_model\bert-base-chinese"  # Chinese model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Pad/truncate every sentence to MAX_LEN tokens
def pad(train_datas):
    for i in range(len(train_datas)):
        #train_datas[i].insert(0, "[CLS]")
        #train_datas[i].append('[SEP]')
        if len(train_datas[i]) < MAX_LEN:
            train_datas[i].extend(['[PAD]'] * (MAX_LEN - len(train_datas[i])))
        elif len(train_datas[i]) > MAX_LEN:
            train_datas[i] = train_datas[i][0:MAX_LEN]
    return train_datas

train_datas_pad = np.array(pad(train_datas))
test_datas_pad = np.array(pad(test_datas))

# Convert every token to its vocabulary id
def txt2bert(train_datas_pad):
    train_data_bert = []
    for i in range(len(train_datas_pad)):
        store = []
        for j in range(len(train_datas_pad[i])):
            store.append(tokenizer.convert_tokens_to_ids(train_datas_pad[i][j]))
        train_data_bert.append(store)
    return train_data_bert

train_data_bert = np.array(txt2bert(train_datas_pad))
test_data_bert = np.array(txt2bert(test_datas_pad))
print(train_data_bert.shape)
print(test_data_bert.shape)
print(tokenizer.convert_ids_to_tokens(train_data_bert[0]))
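As an aside, convert_tokens_to_ids also accepts a whole list of tokens, so the inner loop can be dropped; a minimal equivalent sketch:

train_data_bert = np.array([tokenizer.convert_tokens_to_ids(list(sent)) for sent in train_datas_pad])
test_data_bert = np.array([tokenizer.convert_tokens_to_ids(list(sent)) for sent in test_datas_pad])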
# In[1]
# Pad/truncate the label sequences the same way (index 0 = "O" doubles as padding)
def pad_test(train_labels):
    for i in range(len(train_labels)):
        #train_labels[i].insert(0, 57)
        #train_labels[i].append(58)
        if len(train_labels[i]) < MAX_LEN:
            train_labels[i].extend([0] * (MAX_LEN - len(train_labels[i])))
        elif len(train_labels[i]) > MAX_LEN:
            train_labels[i] = train_labels[i][0:MAX_LEN]
    return train_labels

train_labels_pad = np.array(pad_test(train_labels))
test_labels_pad = np.array(pad_test(test_labels))
print(train_labels_pad.shape)
print(test_labels_pad.shape)
print([idx2label[char] for char in train_labels_pad[0]])

# In[3]
import tensorflow as tf
# Wrap the arrays in tf.data pipelines
def preprocess(train_ids, train_labels):
    x = tf.cast(train_ids, dtype=tf.int64)
    y = tf.cast(train_labels, dtype=tf.int64)
    return x, y

train_db = tf.data.Dataset.from_tensor_slices((train_data_bert, train_labels_pad))
train_db = train_db.map(preprocess).batch(32)
test_db = tf.data.Dataset.from_tensor_slices((test_data_bert, test_labels_pad))
test_db = test_db.map(preprocess).batch(32)
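For training it usually helps to shuffle and prefetch as well; a minimal sketch of the same pipeline with both added (tf.data.AUTOTUNE needs TF 2.4+, older versions use tf.data.experimental.AUTOTUNE):

train_db = (tf.data.Dataset.from_tensor_slices((train_data_bert, train_labels_pad))
            .map(preprocess)
            .shuffle(buffer_size=1000)
            .batch(32)
            .prefetch(tf.data.AUTOTUNE))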
# In[3]
from transformers import TFBertForTokenClassification
model = TFBertForTokenClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=CLASS_NUM)
# Leave the classification head producing raw logits: the loss below is built
# with from_logits=True, so adding a softmax to the head would be inconsistent
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model.summary()
history = model.fit(x=train_data_bert, y=train_labels_pad, validation_split=0.1, batch_size=32, epochs=10).history
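As written, fit never receives an attention mask, so the model also attends to the [PAD] positions. A hedged sketch of how one could pass a mask during training (in bert-base-chinese the [PAD] id is 0, so input_ids > 0 marks real tokens; Hugging Face TF models accept a dict of named inputs):

train_mask = (train_data_bert > 0).astype("int32")  # 1 = real token, 0 = [PAD]
history = model.fit(
    x={"input_ids": train_data_bert, "attention_mask": train_mask},
    y=train_labels_pad,
    validation_split=0.1, batch_size=32, epochs=10,
).history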
# In[1]
#print(tf.test.is_gpu_available())
2. Inspecting the model's predictions
# Pull one batch from the training pipeline and eyeball the predictions
x_test, y_test = next(iter(train_db))
mask_test = tf.cast(x_test > 0, dtype=tf.int64)  # 1 = real token, 0 = [PAD] (id 0)
preds = model(input_ids=x_test, attention_mask=mask_test)

prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)
#pred = tf.reshape(pred, (1, -1))
store = tf.reduce_sum(tf.cast(x_test[1] > 0, dtype=tf.int32))  # non-[PAD] length of sample 1
print([idx2label[char] for char in np.array(pred[1])][:int(store)])
print(tokenizer.convert_ids_to_tokens(x_test[1])[:int(store)])
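The same steps can be wrapped into a small helper that tags a raw address string end to end. A minimal sketch (tag_address is a hypothetical name, and the example address is purely illustrative):

def tag_address(text):
    tokens = list(text)[:MAX_LEN]
    padded = tokens + ['[PAD]'] * (MAX_LEN - len(tokens))
    ids = np.array([tokenizer.convert_tokens_to_ids(padded)])
    mask = tf.cast(ids > 0, dtype=tf.int64)
    logits = model(input_ids=ids, attention_mask=mask).logits
    tags = tf.argmax(logits, axis=-1)[0][:len(tokens)]
    return list(zip(tokens, [idx2label[int(t)] for t in np.array(tags)]))

print(tag_address("浙江省杭州市余杭区文一西路"))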
3. Evaluation metrics
# Predict over the whole test set (batch via test_db instead if memory is tight)
mask_test = tf.cast(test_data_bert > 0, dtype=tf.int64)  # 1 = real token, 0 = [PAD]
preds = model(input_ids=test_data_bert, attention_mask=mask_test)
prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)

# Accuracy over the labeled positions only (index 0 covers both "O" and padding)
mark_labels = test_labels_pad > 0
total_num = tf.reduce_sum(tf.cast(mark_labels, dtype=tf.int32))
print(total_num)
equal_labels = tf.equal(pred, test_labels_pad)
false_labels = test_labels_pad < 0  # all-False tensor for the unmarked positions
true_pred = tf.reduce_sum(tf.cast(tf.where(mark_labels, equal_labels, false_labels), dtype=tf.int32))
print(true_pred)
print(true_pred / total_num)

# Accuracy over every position, padding included
store = tf.reduce_sum(tf.cast(tf.equal(pred, test_labels_pad), dtype=tf.int32))
print(store / (test_labels_pad.shape[0] * MAX_LEN))  # = 1970 * 55 for this test set
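The tf.where trick above works, but tf.boolean_mask expresses the same masked accuracy more directly; an equivalent sketch:

masked_acc = tf.reduce_mean(tf.cast(tf.boolean_mask(equal_labels, mark_labels), tf.float32))
print(masked_acc)  # same value as true_pred / total_num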
# Score the predictions from above with sklearn's token-level metrics
y_predict = np.array(pred)
print(y_predict)
from sklearn.metrics import f1_score, classification_report
print(f1_score(test_labels_pad.reshape(-1), y_predict.reshape(-1), average='macro'))
print(f1_score(test_labels_pad.reshape(-1), y_predict.reshape(-1), average='micro'))
print(classification_report(test_labels_pad.reshape(-1), y_predict.reshape(-1)))
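Token-level F1 tends to overstate NER quality because it gives credit for partially matched entities; entity-level scores are the more common yardstick. A sketch using the third-party seqeval package (pip install seqeval), assuming the tags in mytag.dic follow the BIO scheme:

from seqeval.metrics import classification_report as entity_report
true_seqs, pred_seqs = [], []
for i in range(len(test_labels_pad)):
    n = int((test_data_bert[i] > 0).sum())  # number of non-[PAD] tokens
    true_seqs.append([idx2label[int(t)] for t in test_labels_pad[i][:n]])
    pred_seqs.append([idx2label[int(t)] for t in y_predict[i][:n]])
print(entity_report(true_seqs, pred_seqs))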