Could anyone help me figure out what is causing the error below, and how to fix it?
Code:
import os
import re
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
"""
Set hyperparameters
"""
max_len = 256
vocab_size = 10000
embedding_size = 128
conv_filters = 128
conv_kernel_size = 5
lstm_units = 128
dropout_rate = 0.2
num_epochs = 5
batch_size = 16
learning_rate = 1e-2
sampling_strategy = {1:1000, 2:1000}
attention_num = 50
n_estimators = 2
"""
Load data
"""
import pandas as pd
import numpy as np
file_path = 'C:/Users/admin/Desktop/文学评论/文学评论数据/csv版/'
def Read_csv(file_name, num):
    data = pd.read_csv(file_path + file_name, encoding_errors="ignore", dtype='str')
    reviews = data["内容"]
    data['labels'] = num
    labels = data['labels']
    return reviews, labels
reviews_1, labels_1 = Read_csv('1分.csv', 1) # 26
reviews_2, labels_2 = Read_csv('2分.csv', 2) # 35
reviews_3, labels_3 = Read_csv('3分.csv', 3) # 388
reviews_4, labels_4 = Read_csv('4分.csv', 4) # 1655
reviews_5, labels_5 = Read_csv('5分.csv', 5) # 4432
"""
Word segmentation
"""
import jieba
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Stopword list
def get_stopword_list(file):
    with open(file, 'r', encoding='utf-8') as f:
        stopword_list = [word.strip() for word in f.readlines()]
    return stopword_list
stopword_list = get_stopword_list('stopwords_hit.txt')
# Punctuation list (the inner apostrophe and backslash must be escaped)
import string
punctuation = set(string.punctuation)
ch_punctuation = set('"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。')
punctuation = punctuation | ch_punctuation
def remove_non_utf8_chars(text):
    # Encode the string to bytes, dropping characters that cannot be encoded
    encoded_text = text.encode('utf-8', errors='ignore')
    # Decode the bytes back into a UTF-8 string
    decoded_text = encoded_text.decode('utf-8')
    return decoded_text
def solve_negative_words(sentence):  # Merge negation/degree words into the word that follows them
    text = []
    flag = 0
    negative_words = ['不','没','好','很','最','太','才','还','都','上','去']
    for pos in range(len(sentence)):
        word = sentence[pos]
        if word in negative_words and pos <= len(sentence)-2 and sentence[pos+1] not in punctuation:
            flag = 1
            continue
        if flag == 1 and word not in punctuation:
            word = sentence[pos-1] + word
            flag = 0
        text.append(word)
    return text
data_x = []
def get_data_cut(reviews):
    reviews = [(ele if isinstance(ele, str) else str(ele)) for ele in reviews]
    reviews = [remove_non_utf8_chars(sent) for sent in reviews]
    reviews = [re.sub(r'\d+', "", sent) for sent in reviews]  # strip digits
    reviews = [re.sub(r'[a-z]+', "", sent, flags=re.I) for sent in reviews]  # strip English words
    data_cut = [jieba.lcut(sent) for sent in reviews]  # segment each review with jieba
    stop_usual = ['\n', ' ']
    for words in data_cut:
        temp = []
        words = solve_negative_words(words)
        for pos in range(len(words)):
            word = words[pos]
            if word not in stopword_list and word not in punctuation and word not in stop_usual:
                temp.append(word)
        data_x.append(temp)
    return 0
for i in range(1,6):
    get_data_cut(eval('reviews_' + str(i)))
"""
Process the data
"""
tokenizer = Tokenizer(num_words=vocab_size)
print(data_x[:5])
tokenizer.fit_on_texts(data_x)  # build the vocabulary
data_ids = tokenizer.texts_to_sequences(data_x)  # convert each text into a sequence of word indices
data_padded = pad_sequences(data_ids, maxlen=max_len, padding="post", truncating="post")
data_x = data_padded
labels = []
for i in range(1,6):
    labels.extend(eval('labels_' + str(i)))
data_y = np.array(labels)
print(data_x[:5])
print(data_y[:5])
"""
Split the dataset
"""
from sklearn.model_selection import (StratifiedShuffleSplit)
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_indices, test_indices = next(splitter.split(data_x, data_y))
x_train, y_train = data_x[train_indices], data_y[train_indices]
x_test, y_test = data_x[test_indices], data_y[test_indices]  # 1308
# Split the training set further into training and validation sets
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_indices, valid_indices = next(splitter.split(x_train, y_train))
x_valid, y_valid = x_train[valid_indices], y_train[valid_indices]  # 1046
x_train, y_train = x_train[train_indices], y_train[train_indices]  # 4182
word_index = tokenizer.word_index  # vocabulary {word: index}
num_index = {value: key for key, value in word_index.items()}  # vocabulary {index: word}
print(len(x_train),len(x_valid),len(x_test))
# Oversample the training set with SMOTE
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
smote = BorderlineSMOTE(sampling_strategy=sampling_strategy, kind='borderline-1',random_state=42)
x_sampling, y_sampling = smote.fit_resample(x_train, y_train)
x_train, y_train = x_sampling, y_sampling
# Per-label sample counts in the train/valid/test sets
# train_i = np.sum(y_train == i)  # 17 22 248 1059 2836
# valid_i = np.sum(y_valid == i)  # 4 6 62 265 709
# test_i = np.sum(y_test == i)    # 5 7 78 331 887
# Convert the train/valid/test labels into binary targets
# Ratings 4-5 are the positive class for the "pos" task, ratings 1-2 for the "neg" task
y_pos = np.array([1 if label >= 4 else 0 for label in data_y])
y_neg = np.array([1 if label <= 2 else 0 for label in data_y])
y_train_pos = np.array([1 if label >= 4 else 0 for label in y_train])
y_valid_pos = np.array([1 if label >= 4 else 0 for label in y_valid])
y_test_pos = np.array([1 if label >= 4 else 0 for label in y_test])
y_train_neg = np.array([1 if label <= 2 else 0 for label in y_train])
y_valid_neg = np.array([1 if label <= 2 else 0 for label in y_valid])
y_test_neg = np.array([1 if label <= 2 else 0 for label in y_test])
# Positive/negative sample counts for the pos task in the train/valid/test sets
train_pos = np.sum(y_train_pos == 1) # 3895
train_neg = np.sum(y_train_pos == 0) # 287
valid_pos = np.sum(y_valid_pos == 1) # 974
valid_neg = np.sum(y_valid_pos == 0) # 72
test_pos = np.sum(y_test_pos == 1) # 1218
test_neg = np.sum(y_test_pos == 0) # 90
# Positive/negative sample counts for the neg task in the train/valid/test sets
train_pos = np.sum(y_train_neg == 1) # 39
train_neg = np.sum(y_train_neg == 0) # 4143
valid_pos = np.sum(y_valid_neg == 1) # 10
valid_neg = np.sum(y_valid_neg == 0) # 1036
test_pos = np.sum(y_test_neg == 1) # 12
test_neg = np.sum(y_test_neg == 0) # 1296
""""
模型训练准备工作
"""
from collections import defaultdict
from tensorflow.keras.layers import (Layer, Input, Embedding, Conv1D, MaxPooling1D,
LSTM, Dense, Dropout, Attention, Bidirectional,
GlobalAveragePooling1D, Concatenate, Dot, Softmax,BatchNormalization,concatenate)
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.utils import Sequence
class CustomDataGenerator(Sequence):
    def __init__(self, X_train, y_train, batch_size, class_weights=None):
        self.X_train = X_train
        self.y_train = y_train
        self.batch_size = batch_size
        self.class_weights = class_weights
        self.num_classes = len(np.unique(y_train))
        self.indices_per_class = [np.where(y_train == i)[0] for i in range(self.num_classes)]
    def __len__(self):
        return len(self.X_train) // self.batch_size
    def __getitem__(self, index):
        batch_indices = self.sample_batch_indices()
        batch_X = self.X_train[batch_indices]
        batch_y = self.y_train[batch_indices]
        # print('Batch class labels:', batch_y)
        return batch_X, batch_y
    def sample_batch_indices(self):
        batch_indices = []
        if self.class_weights is None:
            # No weights: cycle through the classes so each batch is balanced
            for i in range(self.batch_size):
                class_index = i % self.num_classes
                indices = self.indices_per_class[class_index]
                batch_indices.append(np.random.choice(indices))
        else:
            # With weights: draw each example's class according to class_weights
            for i in range(self.batch_size):
                class_index = np.random.choice(self.num_classes, p=self.class_weights)
                indices = self.indices_per_class[class_index]
                batch_indices.append(np.random.choice(indices))
        return batch_indices
def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall
def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))
class CustomMetricsCallback(Callback):
    def __init__(self, display_step):
        super().__init__()
        self.display_step = display_step
    def on_train_begin(self, logs=None):
        self.step = 0
        self.custom_history = defaultdict(list)
    def on_batch_end(self, batch, logs=None):
        self.step += 1
        if self.step % self.display_step == 0:
            metrics_log = ''
            for key, value in logs.items():
                self.custom_history[key].append((self.step, value))
                metrics_log += f' - {key}: {value:.4f}'
            print(f'Step: {self.step} {metrics_log}')
# Build the model
def create_model():
    inputs = Input(shape=(max_len,))
    embed = Embedding(vocab_size, embedding_size, input_length=max_len)(inputs)  # (None, max_len, embedding_size)
    conv = Conv1D(conv_filters, conv_kernel_size, activation='relu', padding='same')(embed)  # (None, max_len, conv_filters)
    pool = MaxPooling1D(pool_size=1)(conv)  # (None, max_len, conv_filters)
    lstm = Bidirectional(LSTM(lstm_units, return_sequences=True))(pool)  # (None, max_len, 2*lstm_units)
    # pos head
    attn_pos = Attention(name='attn_pos')([lstm, lstm])  # self-attention: query = value = lstm
    attn_pos = GlobalAveragePooling1D(name='GAP_pos')(attn_pos)
    dropout_pos = Dropout(dropout_rate, name='drop_pos')(attn_pos)
    dense_pos = Dense(128, activation='relu', name='dense_pos')(dropout_pos)  # shape: (None, 128)
    outputs_pos = Dense(1, activation='sigmoid', name='output_pos')(dense_pos)  # shape: (None, 1)
    # neg head
    attn_neg = Attention(name='attn_neg')([lstm, lstm])
    attn_neg = GlobalAveragePooling1D(name='GAP_neg')(attn_neg)
    dropout_neg = Dropout(dropout_rate, name='drop_neg')(attn_neg)
    dense_neg = Dense(128, activation='relu', name='dense_neg')(dropout_neg)  # shape: (None, 128)
    outputs_neg = Dense(1, activation='sigmoid', name='output_neg')(dense_neg)  # shape: (None, 1)
    model = Model(inputs=inputs, outputs=[outputs_pos, outputs_neg])
    config = tf.compat.v1.ConfigProto(device_count={'GPU': 1})
    sess = tf.compat.v1.Session(config=config)
    K.set_session(sess)
    opt = Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])  # precision, recall, f1_score
    return model
from keras.wrapper.scikit_learn import KerasClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.multioutput import MultiOutputClassifier
from keras.models import load_model
import os
attn_model_pos = []
attn_model_neg = []
model_pos = []
model_neg = []
real_model_pos = []
real_model_neg = []
def train():
    # Run the model
    keras_classifier = KerasClassifier(build_fn=create_model, epochs=num_epochs, batch_size=batch_size)
    eec = MultiOutputClassifier(EasyEnsembleClassifier(n_estimators=n_estimators, estimator=keras_classifier, random_state=42))
    y_train_multioutput = np.c_[y_train_pos, y_train_neg]
    history = eec.fit(x_train, y_train_multioutput)
    # Access each base classifier's attention layer and its classification layer
    estimators = eec.estimators_
    for idx, estimator in enumerate(estimators):
        print(f"Base Classifier {idx+1}: {estimator}")
        estimator = estimator.estimators_
        print(estimator)
        for id, pipline in enumerate(estimator):
            real_classifier = pipline.named_steps['classifier']
            real_model = real_classifier.model
            if idx == 0:
                real_model_pos.append(real_model)
                attn_pos = Model(inputs=real_model.input, outputs=real_model.get_layer('drop_pos').output)
                classify_pos = Model(inputs=real_model.input, outputs=real_model.get_layer('output_pos').output)
                attn_model_pos.append(attn_pos)
                model_pos.append(classify_pos)
            else:
                real_model_neg.append(real_model)
                attn_neg = Model(inputs=real_model.input, outputs=real_model.get_layer('drop_neg').output)
                classify_neg = Model(inputs=real_model.input, outputs=real_model.get_layer('output_neg').output)
                attn_model_neg.append(attn_neg)
                model_neg.append(classify_neg)
    # Save the trained models
    for i in range(len(real_model_pos)):
        model = real_model_pos[i]
        dir = f'model/model_pos_{i}.h5'
        model.save(dir)
        model = real_model_neg[i]
        dir = f'model/model_neg_{i}.h5'
        model.save(dir)
def load():
    # Directory holding the saved models
    folder_path = './model/'
    # List every file in the folder
    file_names = os.listdir(folder_path)
    for i in range(int(len(file_names)/2)):
        real_model = load_model(f'./model/model_pos_{i}.h5')
        attn_pos = Model(inputs=real_model.input, outputs=real_model.get_layer('drop_pos').output)
        classify_pos = Model(inputs=real_model.input, outputs=real_model.get_layer('output_pos').output)
        attn_model_pos.append(attn_pos)
        model_pos.append(classify_pos)
        real_model = load_model(f'./model/model_neg_{i}.h5')
        attn_neg = Model(inputs=real_model.input, outputs=real_model.get_layer('drop_neg').output)
        classify_neg = Model(inputs=real_model.input, outputs=real_model.get_layer('output_neg').output)
        attn_model_neg.append(attn_neg)
        model_neg.append(classify_neg)
def get_largest_indices(lst, n):  # indices of the n largest values
    indices = sorted(range(len(lst)), key=lambda i: lst[i], reverse=True)
    return indices[:n]
word_index = tokenizer.word_index  # vocabulary {word: index}
num_index = {value: key for key, value in word_index.items()}  # vocabulary {index: word}
# Predict with every base classifier's classification head
def get_classify_result(ax, tend):
    classify_res_lst = []
    if tend == 'positive':
        for classify_model in model_pos:
            result = classify_model.predict(ax)
            classify_res_lst.append(result)
    else:
        for classify_model in model_neg:
            result = classify_model.predict(ax)
            classify_res_lst.append(result)
    return classify_res_lst
def get_all_attention_words_dict(ax, ay, num, tend, y_pred):  # Collect the words attention focuses on, returned as dicts
    attn_res = []
    if tend == 'positive':
        for attn_model in attn_model_pos:
            attn_result = attn_model.predict(ax)
            attn_res.append(attn_result)
    else:
        for attn_model in attn_model_neg:
            attn_result = attn_model.predict(ax)
            attn_res.append(attn_result)
    attn_res_mean = sum(attn_res)/len(attn_res)
    attn_neg = {}
    attn_pos = {}
    for attn in range(len(attn_res_mean)):
        label = ay[attn]
        pred_label = y_pred[attn]
        pos = get_largest_indices(attn_res_mean[attn], num)
        padded_sentence = [num_index[i] for i in ax[attn] if i != 0]
        sentence = [padded_sentence[i] for i in pos if i < len(padded_sentence)]
        if pred_label != label:
            continue
        if label == 1:
            for i in sentence:
                if attn_pos.get(i) == None:
                    attn_pos[i] = 1
                else:
                    attn_pos[i] += 1
        else:
            for i in sentence:
                if attn_neg.get(i) == None:
                    attn_neg[i] = 1
                else:
                    attn_neg[i] += 1
    return attn_pos, attn_neg
# Return words with high attention scores and high frequency
def get_max_attn_word(adict):
    res = []
    for key, value in adict.items():
        res.append((key, value))
    res = sorted(res, key=lambda review: review[1], reverse=True)
    return res
from sklearn.metrics import classification_report
from collections import Counter
import sklearn.metrics as metrics
def get_res(x, y, tend):
    result = get_classify_result(x, tend)
    result = sum(result)/len(result)
    y_pred = (result >= 0.5).astype(int)
    attn_pos, attn_neg = get_all_attention_words_dict(x, y, attention_num, tend, y_pred)
    print(Counter(y))
    print(metrics.confusion_matrix(y, y_pred))
    res = get_max_attn_word(attn_pos)
    res_2 = get_max_attn_word(attn_neg)
    return res, res_2
# Decide whether to train or to load saved models
load()
res_pos, _ = get_res(x_test, y_test_pos, 'positive')
res_neg, _ = get_res(x_test, y_test_neg, 'negative')
res_pos, res_2 = get_res(data_x, y_pos, 'positive')
res_neg, res_3 = get_res(data_x, y_neg, 'negative')
with open('attention_word.txt', 'w', encoding='utf-8') as f:
    f.write('Words attended to for low ratings: (ratings 1-2 as the positive class)')
    f.write('\n')
    for idx, attention in enumerate(res_neg):
        f.write(attention[0] + ',')
    f.write('\n')
    f.write('\n')
    f.write('Words attended to for high ratings here: (ratings 1-2 as the positive class)')
    f.write('\n')
    for idx, attention in enumerate(res_3):
        if idx <= int(len(res_neg)):
            f.write(attention[0] + ',')
    f.write('\n')
    f.write('\n')
    f.write('Words attended to for high ratings: (ratings 4-5 as the positive class)')
    f.write('\n')
    for idx, attention in enumerate(res_pos):
        if idx <= int(len(res_neg)):
            f.write(attention[0] + ',')
    f.write('\n')
    f.write('\n')
    f.write('Words attended to for low ratings here: (ratings 4-5 as the positive class)')
    f.write('\n')
    for idx, attention in enumerate(res_2):
        if idx <= int(len(res_neg)):
            f.write(attention[0] + ',')
Error:
Traceback (most recent call last):
  File "C:\Users\admin\Desktop\run.py", line 302, in <module>
    from keras.wrapper.scikit_learn import KerasClassifier
ModuleNotFoundError: No module named 'keras.wrapper'
From the traceback this is an import problem: the module path is misspelled. The submodule is keras.wrappers.scikit_learn ('wrappers', plural), not keras.wrapper.scikit_learn. Also note that this wrapper has been removed from recent Keras/TensorFlow releases, so if correcting the name still raises an ImportError, you will need the SciKeras replacement package.
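A minimal sketch of the corrected import, assuming you have either an older Keras/TensorFlow that still ships the scikit-learn wrapper, or SciKeras (pip install scikeras) as the modern replacement:

# Try the legacy wrapper locations first, then fall back to SciKeras
try:
    # Standalone Keras (note 'wrappers', plural)
    from keras.wrappers.scikit_learn import KerasClassifier
except ImportError:
    try:
        # Keras bundled with TensorFlow, before the wrapper was removed
        from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
    except ImportError:
        # Modern replacement package
        from scikeras.wrappers import KerasClassifier

If you end up on SciKeras, its KerasClassifier takes model=create_model rather than build_fn=create_model, so the KerasClassifier(...) call in train() would need that small change as well.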