鱼C论坛

 找回密码
 立即注册
查看: 2254|回复: 1

[技术交流] python实现BERT【tensorflow2.4.0】【命名实体识别】

[复制链接]
发表于 2021-11-14 15:11:27 | 显示全部楼层 |阅读模式

马上注册,结交更多好友,享用更多功能^_^

您需要 登录 才可以下载或查看,没有账号?立即注册

x
本帖最后由 糖逗 于 2021-11-14 15:18 编辑

一、数据预处理+模型训练
import os
# Uncomment to hide all GPUs from TensorFlow:
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
train_data_path = "E:/.../1.NLP/地址识别项目/data/train.conll"  # training data
test_data_path = "E:/.../1.NLP/地址识别项目/data/dev.conll"  # test data



# Build the tag <-> index lookup tables from the tag dictionary file.
import pandas as pd
store = pd.read_table(r"E:\...\1.NLP\地址识别项目\data\mytag.dic", header=None)
# Rows 0 and 24 of the tag column are overwritten by hand.
# NOTE(review): presumably these two entries are read incorrectly from
# mytag.dic -- confirm against the file.
store.loc[0, 0] = "O"
store.loc[24, 0] = "B-prov"
store1 = store.to_dict()
# Column 0 as {row index: tag string}.
idx2label = store1[0]
# Inverse mapping {tag string: row index}.
label2idx = {tag: position for position, tag in idx2label.items()}
CLASS_NUM = len(label2idx)


# Read the text data and the annotation data
from tensorflow.keras.preprocessing import sequence
import numpy as np
# Fixed sequence length: pad() and pad_test() below pad or truncate every
# sample to this many positions.
MAX_LEN = 55
# Read a tagged corpus (one "<char> <label>" pair per line)
def read_corpus(corpus_path, label2idx):
    """Load a character-per-line tagged corpus into token and label-id lists.

    Every non-blank line must hold exactly two whitespace-separated fields,
    ``<char> <label>``. Blank or whitespace-only lines separate sentences.
    The final sentence is kept even when the file does not end with a blank
    line, and runs of blank lines do not produce empty sentences.

    Args:
        corpus_path: Path of the UTF-8 encoded corpus file.
        label2idx: Mapping from label string to integer id. Labels that are
            missing from the mapping are encoded as 0.

    Returns:
        A ``(datas, labels)`` tuple: ``datas`` is a list of sentences, each a
        list of characters; ``labels`` is the parallel list of label-id lists.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    datas, labels = [], []
    chars, tags = [], []
    with open(corpus_path, encoding='utf-8') as fr:
        for line in fr:
            fields = line.split()
            if fields:
                char, label = fields
                chars.append(char)
                tags.append(label2idx.get(label, 0))
                continue
            # Sentence separator: emit the sentence collected so far, if any.
            if chars:
                datas.append(chars)
                labels.append(tags)
                chars, tags = [], []
    # Flush the last sentence when the file has no trailing blank line.
    if chars:
        datas.append(chars)
        labels.append(tags)
    return datas, labels

# Load the training set (one token list and one label-id list per sentence)
train_datas, train_labels = read_corpus(train_data_path, label2idx)
# Load the test set (read from the dev.conll path configured above)
test_datas, test_labels = read_corpus(test_data_path, label2idx)

# In[1]
from transformers import BertTokenizer

# Use BERT's tokenizer to turn characters into vocabulary ids.
PRETRAINED_MODEL_NAME = r"D:\bert_model\bert-base-chinese"  # selects the Chinese checkpoint; also passed to the model's from_pretrained below
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

def pad(train_datas, max_len=None, pad_token='[PAD]'):
    """Pad or truncate every token list to a fixed length, in place.

    Short sentences are extended in place with ``pad_token``; long sentences
    are replaced inside ``train_datas`` by a truncated copy. The outer list
    passed in is therefore modified and is also the object returned.

    Args:
        train_datas: List of token lists (one per sentence).
        max_len: Target length. Defaults to the module-level ``MAX_LEN``.
        pad_token: Token appended to short sentences.

    Returns:
        The same ``train_datas`` list, every entry now ``max_len`` long.
    """
    if max_len is None:
        max_len = MAX_LEN
    for i, tokens in enumerate(train_datas):
        shortfall = max_len - len(tokens)
        if shortfall > 0:
            tokens.extend([pad_token] * shortfall)
        elif shortfall < 0:
            train_datas[i] = tokens[:max_len]
    return train_datas

# pad() works in place, so train_datas / test_datas themselves are padded or
# truncated here as well; the arrays are built from those modified lists.
train_datas_pad = np.array(pad(train_datas))
test_datas_pad = np.array(pad(test_datas))

def txt2bert(train_datas_pad):
    """Convert rows of tokens into rows of vocabulary ids.

    Every token is looked up on its own through the module-level
    ``tokenizer.convert_tokens_to_ids``.

    Args:
        train_datas_pad: Sequence of token rows.

    Returns:
        A list with one list of ids per input row.
    """
    return [
        [tokenizer.convert_tokens_to_ids(token) for token in row]
        for row in train_datas_pad
    ]

# Convert the padded token rows to id arrays and print their shapes.
train_data_bert = np.array(txt2bert(train_datas_pad))
test_data_bert = np.array(txt2bert(test_datas_pad))
print(train_data_bert.shape)
print(test_data_bert.shape)
# Sanity check: map the first training row of ids back to tokens.
print(tokenizer.convert_ids_to_tokens(train_data_bert[0]))
# In[1]
def pad_test(train_labels, max_len=None, pad_value=0):
    """Pad or truncate every label-id list to a fixed length, in place.

    Despite its name this is applied to both the training and the test
    labels. Short lists are extended in place with ``pad_value``; long lists
    are replaced inside ``train_labels`` by a truncated copy.

    NOTE(review): this script sets row 0 of the tag table to "O", so with
    the default ``pad_value`` padded positions look like "O" labels --
    confirm this is intended.

    Args:
        train_labels: List of label-id lists (one per sentence).
        max_len: Target length. Defaults to the module-level ``MAX_LEN``.
        pad_value: Label id appended to short lists.

    Returns:
        The same ``train_labels`` list, every entry now ``max_len`` long.
    """
    if max_len is None:
        max_len = MAX_LEN
    for i, tag_ids in enumerate(train_labels):
        shortfall = max_len - len(tag_ids)
        if shortfall > 0:
            tag_ids.extend([pad_value] * shortfall)
        elif shortfall < 0:
            train_labels[i] = tag_ids[:max_len]
    return train_labels
    
# Pad/truncate the label-id lists (in place) and stack them into arrays.
train_labels_pad = np.array(pad_test(train_labels))
test_labels_pad = np.array(pad_test(test_labels)) 
print(train_labels_pad.shape)
print(test_labels_pad.shape)
# Sanity check: decode the first training row of label ids back to tag strings.
print([idx2label[char] for char in train_labels_pad[0]])


# In[3]
import tensorflow as tf

def preprocess(train_ids, train_labels):
    """Cast a (token ids, label ids) pair to int64 tensors.

    Used as the ``map`` function of the tf.data pipelines below.

    Returns:
        ``(x, y)`` -- both inputs cast to ``tf.int64``.
    """
    return (
        tf.cast(train_ids, dtype=tf.int64),
        tf.cast(train_labels, dtype=tf.int64),
    )



# Batched (ids, labels) pipelines, 32 samples per batch.
# Note: model.fit below is fed the NumPy arrays directly; train_db is only
# used later to pull a batch for the sample previews.
train_db = tf.data.Dataset.from_tensor_slices((train_data_bert, train_labels_pad))
train_db = train_db.map(preprocess).batch(32)


# NOTE(review): test_db is not referenced anywhere else in the visible script.
test_db = tf.data.Dataset.from_tensor_slices((test_data_bert, test_labels_pad))
test_db = test_db.map(preprocess).batch(32)
# In[3]
from transformers import TFBertForTokenClassification
from tensorflow import keras

# Pretrained encoder with a token-classification head, one output class per tag.
model = TFBertForTokenClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels = CLASS_NUM)

# The classifier head is deliberately left WITHOUT a softmax activation:
# the loss below is built with from_logits=True, and the evaluation code
# applies tf.nn.softmax to preds.logits itself. Forcing a softmax onto the
# last layer as well would normalise the scores twice.

# NOTE(review): 1e-6 is far below the 2e-5..5e-5 range commonly used for
# BERT fine-tuning -- confirm this is intentional.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.summary()


# Train on the padded id/label arrays; 10% of the samples are held out for
# validation. Only the per-epoch history dict is kept.
history = model.fit(x = train_data_bert, y = train_labels_pad, validation_split=0.1,batch_size=32, epochs=10).history

# In[1]

#print(tf.test.is_gpu_available())

#x_test, y_test =  next(iter(train_db))
#preds = model(input_ids = x_test, attention_mask = mask_test)

# Preview: run the model on the first batch of train_db and print the
# predicted tags and the tokens of the batch's second sample.
x_test, y_test = next(iter(train_db))
# `>= 0` is presumably true for every token id, so this mask marks all
# positions, padding included, as attended.
mask_test = tf.convert_to_tensor(np.int64(x_test) >= 0)
preds = model(input_ids=x_test, attention_mask=mask_test)

prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.cast(tf.argmax(prob, axis=-1), dtype=tf.int64)
# Count of positions in sample 1 whose token id is > 0 (presumably the
# non-[PAD] tokens); used to cut both printouts.
store = tf.reduce_sum(tf.cast(x_test[1] > 0, dtype=tf.int32))
print([idx2label[tag_id] for tag_id in np.array(pred[1])][:int(store)])

print(tokenizer.convert_ids_to_tokens(x_test[1])[:int(store)])


二、模型效果查看
#x_test, y_test =  next(iter(train_db))
#preds = model(input_ids = x_test, attention_mask = mask_test)

# Inspect model output: predict the first batch of train_db and print the
# predicted tags next to the tokens of the batch's second sample.
x_test, y_test = next(iter(train_db))
# `>= 0` is presumably true for every token id, so every position
# (padding included) is attended.
mask_test = tf.convert_to_tensor(np.int64(x_test) >= 0)
preds = model(input_ids=x_test, attention_mask=mask_test)

prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.cast(tf.argmax(prob, axis=-1), dtype=tf.int64)
# Number of positions in sample 1 with token id > 0; limits the printouts.
store = tf.reduce_sum(tf.cast(x_test[1] > 0, dtype=tf.int32))
print([idx2label[tag_id] for tag_id in np.array(pred[1])][:int(store)])

print(tokenizer.convert_ids_to_tokens(x_test[1])[:int(store)])


三、模型结果评价指标
# Predict the whole test set in a single forward pass.
# `>= 0` is presumably true for every token id, so all positions are attended.
mask_test = np.int64(test_data_bert) >= 0
mask_test = tf.convert_to_tensor(mask_test)
preds = model(input_ids = test_data_bert, attention_mask = mask_test)

prob = tf.nn.softmax(preds.logits, axis = -1)
pred = tf.argmax(prob, axis = -1)
pred = tf.cast(pred, dtype = tf.int64)



# Accuracy restricted to positions whose gold label id is > 0. Label id 0 is
# also what pad_test() pads with, so padded positions are excluded here.
mark_labels = test_labels_pad > 0
total_num = tf.reduce_sum(tf.cast(mark_labels, dtype = tf.int32))
print(total_num)
equal_labels = tf.equal(pred, test_labels_pad)
# All-False array: where the mask is off, the position never counts as a hit.
false_labels = test_labels_pad < 0
true_pred = tf.reduce_sum(tf.cast(tf.where(mark_labels, equal_labels, false_labels), dtype = tf.int32))
print(true_pred)
print(true_pred/total_num)
# Accuracy over every position, padding included. The denominator is taken
# from the label array itself instead of a hard-coded sample count and length.
store = tf.reduce_sum(tf.cast(tf.equal(pred, test_labels_pad),  dtype=tf.int32))
print(store / test_labels_pad.size)
#x_test, y_test =  next(iter(train_db))
#preds = model(input_ids = x_test, attention_mask = mask_test)

# Sample preview: predict the first batch of train_db, then print the
# predicted tags and the tokens of the batch's second sample.
x_test, y_test = next(iter(train_db))
# `>= 0` is presumably true for every token id, so the mask covers all
# positions, padding included.
mask_test = tf.convert_to_tensor(np.int64(x_test) >= 0)
preds = model(input_ids=x_test, attention_mask=mask_test)

prob = tf.nn.softmax(preds.logits, axis=-1)
pred = tf.cast(tf.argmax(prob, axis=-1), dtype=tf.int64)
# Positions in sample 1 with token id > 0; used as the printout length.
store = tf.reduce_sum(tf.cast(x_test[1] > 0, dtype=tf.int32))
print([idx2label[tag_id] for tag_id in np.array(pred[1])][:int(store)])

print(tokenizer.convert_ids_to_tokens(x_test[1])[:int(store)])
pred_labels = []
# Predict the whole test set again and reduce the logits to label ids.
mask_test = np.int64(test_data_bert) >= 0
mask_test = tf.convert_to_tensor(mask_test)
pred = model(input_ids = test_data_bert, attention_mask = mask_test)
prob = tf.nn.softmax(pred.logits, axis=-1)
pred = tf.argmax(prob, axis=-1)
pred = tf.cast(pred, dtype=tf.int64)
print(pred)

from sklearn.metrics import classification_report, f1_score

# Flatten gold and predicted ids to 1-D, one label per position.
# NOTE(review): padded positions (label id 0) are included in these scores.
y_true = test_labels_pad.reshape(-1)
y_predict = np.array(pred).reshape(-1)

# Print the scores explicitly: as bare expressions they are discarded when
# this runs as a script.
print(f1_score(y_true, y_predict, average='macro'))
print(f1_score(y_true, y_predict, average='micro'))

temp = classification_report(y_true, y_predict)
print(temp)
                         

本帖被以下淘专辑推荐:

想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

 楼主| 发表于 2021-11-14 15:12:44 | 显示全部楼层
任务背景+数据获取看上一篇:https://fishc.com.cn/forum.php?m ... 05523&ctid=1732
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复 支持 反对

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2024-11-25 18:07

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表