糖逗 posted on 2021-11-14 15:11:27

Implementing BERT in Python [tensorflow 2.4.0] [named entity recognition]

This post was last edited by 糖逗 on 2021-11-14 15:18

1. Data preprocessing + model training
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"   # uncomment to force CPU-only execution
train_data_path = "E:/.../1.NLP/地址识别项目/data/train.conll" # training data
test_data_path = "E:/.../1.NLP/地址识别项目/data/dev.conll" # test data



#Build the label index for the tags (e.g. "O", "B-prov", ...)
import pandas as pd
store = pd.read_table(r"E:\...\1.NLP\地址识别项目\data\mytag.dic", header = None)
store1 = store.to_dict()[0]   # column 0 as {index: tag}
idx2label = store1
label2idx = {label: idx for idx, label in idx2label.items()}
CLASS_NUM = len(label2idx)
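
For reference, the resulting mappings look roughly like this (a sketch; the actual indices depend on the line order in mytag.dic):

# Hypothetical contents, assuming "O" is the first line of the dictionary:
# idx2label -> {0: 'O', 1: 'B-prov', 2: 'I-prov', ...}
# label2idx -> {'O': 0, 'B-prov': 1, 'I-prov': 2, ...}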


#Read the raw text and its annotations
from tensorflow.keras.preprocessing import sequence
import numpy as np
MAX_LEN = 55
# Read a corpus in CoNLL format
def read_corpus(corpus_path, label2idx):
    datas, labels = [], []
    with open(corpus_path, encoding='utf-8') as fr:
        lines = fr.readlines()
    sent_, tag_ = [], []
    for line in lines:
        if line != '\n':
            char, label = line.strip().split()
            sent_.append(char)
            tag_.append(label)
        else:
            sent_ids = sent_   # keep the raw characters; BERT tokenization happens later
            tag_ids = [label2idx[label] if label in label2idx else 0 for label in tag_]
            datas.append(sent_ids)
            labels.append(tag_ids)
            sent_, tag_ = [], []
    return datas, labels
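
read_corpus expects CoNLL-style input: one character and its tag per line, separated by whitespace, with a blank line between sentences. A minimal sketch of the assumed layout (the entity spans here are illustrative, not taken from the actual file):

浙 B-prov
江 I-prov
省 I-prov
杭 B-city
州 I-city
市 I-city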

# Load the training set
train_datas, train_labels = read_corpus(train_data_path, label2idx)
# Load the test set
test_datas, test_labels = read_corpus(test_data_path, label2idx)

# In
from transformers import BertTokenizer

# Use BERT's tokenizer to map characters to vocabulary ids.
PRETRAINED_MODEL_NAME = r"D:\bert_model\bert-base-chinese"# the Chinese model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
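
As a quick sanity check of the tokenizer (the printed id depends on the bert-base-chinese vocabulary):

# Map a single character to its vocabulary id and back again
cid = tokenizer.convert_tokens_to_ids("省")
print(cid, tokenizer.convert_ids_to_tokens(cid))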

# Pad / truncate every sentence to MAX_LEN characters
def pad(train_datas):
    for i in range(len(train_datas)):
        #train_datas[i].insert(0, "[CLS]")
        #train_datas[i].append('[SEP]')
        if len(train_datas[i]) < MAX_LEN:
            train_datas[i].extend([''] * (MAX_LEN - len(train_datas[i])))
        elif len(train_datas[i]) > MAX_LEN:
            train_datas[i] = train_datas[i][:MAX_LEN]
    return train_datas

train_datas_pad = np.array(pad(train_datas))
test_datas_pad = np.array(pad(test_datas))

# Convert every character to its BERT vocabulary id
def txt2bert(train_datas_pad):
    train_data_bert = []
    for i in range(len(train_datas_pad)):
        store = []
        for j in range(len(train_datas_pad[i])):
            store.append(tokenizer.convert_tokens_to_ids(train_datas_pad[i][j]))
        train_data_bert.append(store)
    return train_data_bert

train_data_bert = np.array(txt2bert(train_datas_pad))
test_data_bert = np.array(txt2bert(test_datas_pad))
print(train_data_bert.shape)
print(test_data_bert.shape)
print(tokenizer.convert_ids_to_tokens(train_data_bert[0]))   # decode the first sample as a sanity check
# In
# Pad / truncate every label sequence to MAX_LEN (0 is the padding label)
def pad_test(train_labels):
    for i in range(len(train_labels)):
        #train_labels[i].insert(0, 57)
        #train_labels[i].append(58)
        if len(train_labels[i]) < MAX_LEN:
            train_labels[i].extend([0] * (MAX_LEN - len(train_labels[i])))
        elif len(train_labels[i]) > MAX_LEN:
            train_labels[i] = train_labels[i][:MAX_LEN]
    return train_labels

train_labels_pad = np.array(pad_test(train_labels))
test_labels_pad = np.array(pad_test(test_labels))
print(train_labels_pad.shape)
print(test_labels_pad.shape)
print([idx2label[char] for char in train_labels_pad[0]])   # decode the first label sequence


# In
import tensorflow as tf

def preprocess(train_ids, train_labels):
    x = tf.cast(train_ids, dtype = tf.int64)
    y = tf.cast(train_labels, dtype = tf.int64)
    return x, y



train_db = tf.data.Dataset.from_tensor_slices((train_data_bert, train_labels_pad))
train_db = train_db.map(preprocess).batch(32)


test_db = tf.data.Dataset.from_tensor_slices((test_data_bert, test_labels_pad))
test_db = test_db.map(preprocess).batch(32)
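
Inspecting one batch confirms the shapes, 32 sentences of 55 token ids each with matching label sequences:

x, y = next(iter(train_db))
print(x.shape, y.shape)   # expected: (32, 55) (32, 55)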
# In
from transformers import TFBertForTokenClassification
from tensorflow import keras

model = TFBertForTokenClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels = CLASS_NUM)



# The token-classification head outputs raw logits; leave them unchanged,
# since the loss below is configured with from_logits=True.

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.summary()
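
TFBertForTokenClassification emits one score vector per token, so the logits have shape (batch, sequence length, CLASS_NUM). A minimal check on a dummy input (101 and 102 are BERT's [CLS] and [SEP] ids):

out = model(input_ids = tf.constant([[101, 102]]))
print(out.logits.shape)   # (1, 2, CLASS_NUM)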


history = model.fit(x = train_data_bert, y = train_labels_pad, validation_split = 0.1, batch_size = 32, epochs = 10).history
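
Note that the train_db/test_db pipelines built above are not used by this fit call; an equivalent run could feed them directly, a sketch:

history = model.fit(train_db, validation_data = test_db, epochs = 10).history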

# In

#print(tf.test.is_gpu_available())


2. Inspecting the model's predictions
#Take one batch from the training pipeline and decode the predictions
x_test, y_test = next(iter(train_db))
mask_test = np.int64(x_test) >= 0   # all-ones attention mask (token ids are never negative)
mask_test = tf.convert_to_tensor(mask_test)
preds = model(input_ids = x_test, attention_mask = mask_test)

prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)
store = tf.reduce_sum(tf.cast(x_test[0] > 0, dtype = tf.int32))   # token count of the first sentence
print([idx2label[char] for char in np.array(pred[0])][:int(store)])   # predicted tags, first sentence

print(tokenizer.convert_ids_to_tokens(x_test[0])[:int(store)])   # the corresponding characters


3. Evaluation metrics
# Predict over the whole test set
mask_test = np.int64(test_data_bert) >= 0
mask_test = tf.convert_to_tensor(mask_test)
preds = model(input_ids = test_data_bert, attention_mask = mask_test)

prob = tf.nn.softmax(preds.logits, axis = -1)
pred = tf.argmax(prob, axis = -1)
pred = tf.cast(pred, dtype = tf.int64)

# Accuracy over real (non-padding) label positions only
mark_labels = test_labels_pad > 0
total_num = tf.reduce_sum(tf.cast(mark_labels, dtype = tf.int32))
print(total_num)
equal_labels = tf.equal(pred, tf.cast(test_labels_pad, tf.int64))
false_labels = test_labels_pad < 0   # all-False tensor, used as the "else" branch of tf.where
true_pred = tf.reduce_sum(tf.cast(tf.where(mark_labels, equal_labels, false_labels), dtype = tf.int32))
print(true_pred)
print(true_pred/total_num)
# Accuracy over all positions, padding included (1970 test sentences x 55 tokens)
store = tf.reduce_sum(tf.cast(tf.equal(pred, tf.cast(test_labels_pad, tf.int64)), dtype=tf.int32))
print(store / (1970*55))
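
The masked accuracy above can be cross-checked in plain NumPy; this sketch should print the same ratio:

y_true = np.array(test_labels_pad)
y_hat = np.array(pred)
mask = y_true > 0
print((y_hat[mask] == y_true[mask]).mean())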
# Recompute predictions on the test set and flatten them for sklearn metrics
mask_test = np.int64(test_data_bert) >= 0
mask_test = tf.convert_to_tensor(mask_test)
preds = model(input_ids = test_data_bert, attention_mask = mask_test)
prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)
print(pred)
y_predict = np.array(pred)

from sklearn.metrics import f1_score
f1_score(test_labels_pad.reshape(-1), y_predict.reshape(-1), average='macro')

f1_score(test_labels_pad.reshape(-1), y_predict.reshape(-1), average='micro')

from sklearn.metrics import classification_report
temp = classification_report(test_labels_pad.reshape(-1), y_predict.reshape(-1))
print(temp)
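
The sklearn scores above are token-level. For NER it is usually more informative to score whole entity spans; a sketch using the seqeval package (not part of the original post, requires pip install seqeval):

from seqeval.metrics import classification_report as seq_report

# Convert id sequences back to tag strings, keeping only real (non-padding) positions
true_tags, pred_tags = [], []
for t_row, p_row in zip(test_labels_pad, y_predict):
    keep = t_row > 0
    true_tags.append([idx2label[i] for i in t_row[keep]])
    pred_tags.append([idx2label[i] for i in p_row[keep]])

print(seq_report(true_tags, pred_tags))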
                        

糖逗 posted on 2021-11-14 15:12:44

For the task background and how the data was obtained, see the previous post: https://fishc.com.cn/forum.php?mod=viewthread&tid=205523&ctid=1732