|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 糖逗 于 2021-11-14 15:18 编辑
# 一、数据预处理+模型训练 (Part 1: data preprocessing and model training)
import os
# Uncomment to hide every CUDA device from TensorFlow.
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
train_data_path = "E:/.../1.NLP/地址识别项目/data/train.conll"  # training data
test_data_path = "E:/.../1.NLP/地址识别项目/data/dev.conll"  # test data
# Build the tag <-> index lookup tables from the tag dictionary file.
import pandas as pd
store = pd.read_table(r"E:\...\1.NLP\地址识别项目\data\mytag.dic", header = None)
# Rows 0 and 24 of the tag column are overwritten by hand.
# NOTE(review): presumably these patch entries that were read incorrectly
# from the dictionary file -- confirm against mytag.dic.
store.loc[0, 0] = "O"
store.loc[24, 0] = "B-prov"
store1 = store.to_dict()
idx2label = store1[0]  # column 0 as {row index: tag string}
# Invert the mapping so that a tag string looks up its integer index.
label2idx = {label: idx for idx, label in idx2label.items()}
CLASS_NUM = len(label2idx)
# Read the text data and the annotation data.
from tensorflow.keras.preprocessing import sequence
import numpy as np
# Fixed sequence length used when padding/truncating sentences and labels.
MAX_LEN = 55
# 读取训练语料
def read_corpus(corpus_path, label2idx):
    """Read a CoNLL-style corpus made of ``token label`` lines.

    Every non-blank line holds one token and its tag separated by
    whitespace; blank (or whitespace-only) lines separate sentences.

    Args:
        corpus_path: Path of the UTF-8 encoded corpus file.
        label2idx: Mapping from tag string to integer id.  Tags that are
            missing from the mapping are encoded as 0.

    Returns:
        ``(datas, labels)``: two parallel lists with one entry per sentence,
        the list of tokens and the list of tag ids respectively.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            whitespace-separated fields.
    """
    datas, labels = [], []
    sent_, tag_ = [], []
    with open(corpus_path, encoding='utf-8') as fr:
        for line_no, line in enumerate(fr, start=1):
            fields = line.split()
            if not fields:
                # Sentence boundary.  Skip it when nothing was collected so
                # that repeated blank lines do not create empty sentences.
                if sent_:
                    datas.append(sent_)
                    labels.append(tag_)
                    sent_, tag_ = [], []
                continue
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected 'token label', got %r"
                    % (corpus_path, line_no, line)
                )
            char, label = fields
            sent_.append(char)
            tag_.append(label2idx.get(label, 0))
    # Keep the last sentence even when the file lacks a trailing blank line.
    if sent_:
        datas.append(sent_)
        labels.append(tag_)
    return datas, labels
# Load the training set: token lists and the matching tag-id lists.
train_datas, train_labels = read_corpus(train_data_path, label2idx)
# Load the test set.
test_datas, test_labels = read_corpus(test_data_path, label2idx)
# In[1]
from transformers import BertTokenizer
# Use BERT's tokenizer to turn the characters into vocabulary ids.
PRETRAINED_MODEL_NAME = r"D:\bert_model\bert-base-chinese" # local path of the Chinese BERT checkpoint
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
def pad(train_datas, max_len=None, pad_token='[PAD]'):
    """Pad or truncate every token sequence to a fixed length, in place.

    No ``[CLS]`` / ``[SEP]`` markers are added.

    Args:
        train_datas: List of token lists.  It is modified in place: short
            sequences are extended, long ones are replaced by a truncated
            copy.
        max_len: Target length of every sequence.  Defaults to the
            module-level ``MAX_LEN``.
        pad_token: Token appended to sequences shorter than ``max_len``.

    Returns:
        The same ``train_datas`` object, every entry now ``max_len`` long.
    """
    if max_len is None:
        max_len = MAX_LEN
    for i, tokens in enumerate(train_datas):
        shortfall = max_len - len(tokens)
        if shortfall > 0:
            tokens.extend([pad_token] * shortfall)
        elif shortfall < 0:
            train_datas[i] = tokens[:max_len]
    return train_datas
# Pad/truncate the token sequences and convert them to NumPy arrays.
# pad() also modifies train_datas / test_datas themselves.
train_datas_pad = np.array(pad(train_datas))
test_datas_pad = np.array(pad(test_datas))
def txt2bert(train_datas_pad):
    """Convert padded token sequences into sequences of vocabulary ids.

    Each token is looked up individually through the module-level
    ``tokenizer``.

    Args:
        train_datas_pad: Sequence of token sequences.

    Returns:
        A list with one list of ids per input sequence.
    """
    return [
        [tokenizer.convert_tokens_to_ids(token) for token in sentence]
        for sentence in train_datas_pad
    ]
# Convert the padded token arrays into arrays of BERT vocabulary ids.
train_data_bert = np.array(txt2bert(train_datas_pad))
test_data_bert = np.array(txt2bert(test_datas_pad))
print(train_data_bert.shape)
print(test_data_bert.shape)
# Sanity check: map the first training sample's ids back to tokens.
print(tokenizer.convert_ids_to_tokens(train_data_bert[0]))
# In[1]
def pad_test(train_labels, max_len=None, pad_value=0):
    """Pad or truncate every label-id sequence to a fixed length, in place.

    Args:
        train_labels: List of label-id lists.  It is modified in place:
            short sequences are extended, long ones are replaced by a
            truncated copy.
        max_len: Target length of every sequence.  Defaults to the
            module-level ``MAX_LEN``.
        pad_value: Label id appended to sequences shorter than ``max_len``.

    Returns:
        The same ``train_labels`` object, every entry now ``max_len`` long.
    """
    if max_len is None:
        max_len = MAX_LEN
    for i, tag_ids in enumerate(train_labels):
        shortfall = max_len - len(tag_ids)
        if shortfall > 0:
            tag_ids.extend([pad_value] * shortfall)
        elif shortfall < 0:
            train_labels[i] = tag_ids[:max_len]
    return train_labels
# Pad/truncate the label-id sequences and convert them to NumPy arrays.
# pad_test() also modifies train_labels / test_labels themselves.
train_labels_pad = np.array(pad_test(train_labels))
test_labels_pad = np.array(pad_test(test_labels))
print(train_labels_pad.shape)
print(test_labels_pad.shape)
# Sanity check: decode the first training sample's label ids to tag strings.
print([idx2label[char] for char in train_labels_pad[0]])
# In[3]
import tensorflow as tf
def preprocess(train_ids, train_labels):
    """Cast a (token ids, label ids) pair to int64 tensors.

    Used as the ``map`` function of the ``tf.data`` pipelines.

    Returns:
        The tuple ``(ids, labels)``, both cast to ``tf.int64``.
    """
    return (
        tf.cast(train_ids, dtype = tf.int64),
        tf.cast(train_labels, dtype = tf.int64),
    )
# Training pipeline: one (ids, labels) pair per sentence, cast to int64 by
# preprocess() and grouped into batches of 32.
train_db = tf.data.Dataset.from_tensor_slices((train_data_bert, train_labels_pad))
train_db = train_db.map(preprocess).batch(32)
# Same pipeline for the test set.
test_db = tf.data.Dataset.from_tensor_slices((test_data_bert, test_labels_pad))
test_db = test_db.map(preprocess).batch(32)
# In[3]
from transformers import TFBertForTokenClassification
from tensorflow import keras
# BERT with a token-classification head that emits CLASS_NUM scores per token.
model = TFBertForTokenClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels = CLASS_NUM)
# The head is left without an activation on purpose: the loss below is built
# with from_logits=True and the evaluation code applies tf.nn.softmax to
# ``preds.logits`` itself, so a softmax on the last layer would be applied
# twice.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model.summary()
# Train for 10 epochs, holding out 10% of the training data for validation;
# only the per-epoch metric history is kept.
history = model.fit(x = train_data_bert, y = train_labels_pad, validation_split=0.1,batch_size=32, epochs=10).history
# In[1]
#print(tf.test.is_gpu_available())
#x_test, y_test = next(iter(train_db))
#preds = model(input_ids = x_test, attention_mask = mask_test)
# Take the first batch of the training pipeline and predict its tags.
x_test, y_test = next(iter(train_db))
# NOTE(review): the mask is True wherever the id is >= 0; token ids are
# presumably never negative, so padding positions are attended too -- confirm
# this is intended.
mask_test = np.int64(x_test) >= 0
mask_test = tf.convert_to_tensor(mask_test)
preds = model(input_ids = x_test, attention_mask = mask_test)
# Most probable tag id for every token position.
prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)
#pred = tf.reshape(pred, (1, -1))
# Number of ids > 0 in the second sample of the batch (index 1).
# NOTE(review): presumably [PAD] has id 0, making this the unpadded length.
store = tf.reduce_sum(tf.cast(x_test[1] >0 , dtype = tf.int32))
# Show the predicted tags next to the tokens for that sample.
print([idx2label[char] for char in np.array(pred[1])][:int(store)])
print(tokenizer.convert_ids_to_tokens(x_test[1])[:int(store)])
# 二、模型效果查看 (Part 2: inspecting the model's predictions)
#x_test, y_test = next(iter(train_db))
#preds = model(input_ids = x_test, attention_mask = mask_test)
# Take the first batch of the training pipeline and predict its tags.
x_test, y_test = next(iter(train_db))
# NOTE(review): the mask is True wherever the id is >= 0; token ids are
# presumably never negative, so padding positions are attended too -- confirm
# this is intended.
mask_test = np.int64(x_test) >= 0
mask_test = tf.convert_to_tensor(mask_test)
preds = model(input_ids = x_test, attention_mask = mask_test)
# Most probable tag id for every token position.
prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)
#pred = tf.reshape(pred, (1, -1))
# Number of ids > 0 in the second sample of the batch (index 1).
# NOTE(review): presumably [PAD] has id 0, making this the unpadded length.
store = tf.reduce_sum(tf.cast(x_test[1] >0 , dtype = tf.int32))
# Show the predicted tags next to the tokens for that sample.
print([idx2label[char] for char in np.array(pred[1])][:int(store)])
print(tokenizer.convert_ids_to_tokens(x_test[1])[:int(store)])
# 三、模型结果评价指标 (Part 3: evaluation metrics)
test_data_bert
# Predict tags for the whole test set in a single forward pass.
# NOTE(review): the mask is True wherever the id is >= 0; token ids are
# presumably never negative, so padding positions are attended too.
mask_test = np.int64(test_data_bert) >= 0
mask_test = tf.convert_to_tensor(mask_test)
preds = model(input_ids = test_data_bert, attention_mask = mask_test)
prob = tf.nn.softmax(preds.logits, axis = -1)
pred = tf.argmax(prob, axis = -1)
pred = tf.cast(pred, dtype = tf.int64)
# Accuracy restricted to positions whose gold label id is > 0, which leaves
# out both the padding positions and every token labelled with id 0.
mark_labels = test_labels_pad > 0
total_num = tf.reduce_sum(tf.cast(mark_labels, dtype = tf.int32))
print(total_num)
equal_labels = tf.equal(pred, test_labels_pad)
# Comparison used only as the "else" branch of tf.where below, so that
# positions outside mark_labels take its value instead of equal_labels.
false_labels = test_labels_pad < 0
true_pred = tf.reduce_sum(tf.cast(tf.where(mark_labels, equal_labels, false_labels), dtype = tf.int32))
print(true_pred)
print(true_pred/total_num)
# Accuracy over every position, padding included.  The denominator is taken
# from the label array itself rather than a hard-coded sentence count, so it
# stays correct when the test set or MAX_LEN changes.
store = tf.reduce_sum(tf.cast(tf.equal(pred, test_labels_pad), dtype=tf.int32))
print(store / test_labels_pad.size)
#x_test, y_test = next(iter(train_db))
#preds = model(input_ids = x_test, attention_mask = mask_test)
# Take the first batch of the training pipeline and predict its tags.
x_test, y_test = next(iter(train_db))
# NOTE(review): the mask is True wherever the id is >= 0; token ids are
# presumably never negative, so padding positions are attended too.
mask_test = np.int64(x_test) >= 0
mask_test = tf.convert_to_tensor(mask_test)
preds = model(input_ids = x_test, attention_mask = mask_test)
prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)
#pred = tf.reshape(pred, (1, -1))
# Number of ids > 0 in the second sample of the batch (index 1).
# NOTE(review): presumably [PAD] has id 0, making this the unpadded length.
store = tf.reduce_sum(tf.cast(x_test[1] >0 , dtype = tf.int32))
print([idx2label[char] for char in np.array(pred[1])][:int(store)])
print(tokenizer.convert_ids_to_tokens(x_test[1])[:int(store)])
# pred_labels is initialised here but not used anywhere in the visible code.
pred_labels = []
# Predict tag ids for the whole test set; ``pred`` first holds the model
# output and is then rebound to the int64 tensor of predicted tag ids.
mask_test = np.int64(test_data_bert) >= 0
mask_test = tf.convert_to_tensor(mask_test)
pred = model(input_ids = test_data_bert, attention_mask = mask_test)
prob = tf.nn.softmax(pred.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)
print(pred)
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
# Predicted tag ids as a NumPy array.  The name y_predict was used below
# without ever being assigned, which raised NameError.
y_predict = np.array(pred)
# Both scores are computed over the flattened arrays, i.e. over every
# position of every sentence.
# NOTE(review): this includes the padding positions (label id 0) -- confirm
# whether they should be excluded from the scores.
print(f1_score(test_labels_pad.reshape(-1), y_predict.reshape(-1), average='macro'))
print(f1_score(test_labels_pad.reshape(-1), y_predict.reshape(-1), average='micro'))
# Per-class precision / recall / F1 table.
temp = classification_report(test_labels_pad.reshape(-1), y_predict.reshape(-1))
print(temp)
|
|