Could the experts here please take a look at what is causing this error, and how to fix it?

Posted on 2023-9-25 20:38:05

Code:
import os
import re
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

"""
Set hyperparameters
"""
max_len = 256
vocab_size = 10000
embedding_size = 128
conv_filters = 128
conv_kernel_size = 5
lstm_units = 128
dropout_rate = 0.2

num_epochs = 5
batch_size = 16
learning_rate = 1e-2

sampling_strategy = {1:1000, 2:1000}
attention_num = 50
n_estimators = 2

"""
Load data
"""
import pandas as pd
import numpy as np

file_path = 'C:/Users/admin/Desktop/文学评论/文学评论数据/csv版/'

def Read_csv(file_name, num):
    data = pd.read_csv(file_path + file_name, encoding_errors="ignore", dtype='str')
    reviews = data["内容"]
    data['labels'] = num
    labels = data['labels']
    return reviews, labels

reviews_1, labels_1 = Read_csv('1分.csv', 1) # 26
reviews_2, labels_2 = Read_csv('2分.csv', 2) # 35
reviews_3, labels_3 = Read_csv('3分.csv', 3) # 388
reviews_4, labels_4 = Read_csv('4分.csv', 4) # 1655
reviews_5, labels_5 = Read_csv('5分.csv', 5) # 4432

"""
Word segmentation
"""
import jieba
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Stop words
def get_stopword_list(file):
    with open(file, 'r', encoding='utf-8') as f:
        stopword_list = [word.strip('\n') for word in f.readlines()]
    return stopword_list
stopword_list = get_stopword_list('stopwords_hit.txt')

# Punctuation list
import string
punctuation = set(string.punctuation)
ch_punctuation = set('"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。')
punctuation = punctuation | ch_punctuation

def remove_non_utf8_chars(text):
    # Encode the string to bytes, ignoring characters that cannot be encoded
    encoded_text = text.encode('utf-8', errors='ignore')
    # Decode the bytes back into a UTF-8 string
    decoded_text = encoded_text.decode('utf-8')
    return decoded_text

def solve_negative_words(sentence): # Handle negation words, etc.
    text = []
    flag = 0
    negative_words = ['不','没','好','很','最','太','才','还','都','上','去']#,'好','很','最','太','才','还','都','上','去'
    for pos in range(len(sentence)):
        word = sentence[pos]
        if word in negative_words and pos <= len(sentence)-2 and sentence[pos+1] not in punctuation:
            flag = 1
            continue
        if flag == 1 and word not in punctuation:
            word = sentence[pos-1] + word
            flag = 0
        text.append(word)
    return text  

data_x = []
def get_data_cut(reviews):
    reviews = [(ele if isinstance(ele, str) else str(ele)) for ele in reviews]
    reviews = [remove_non_utf8_chars(sent) for sent in reviews]
    reviews = [re.sub(r'\d',"",sent) for sent in reviews] # Remove digits from the text
    reviews = [re.sub(r'[a-z]',"",sent,flags=re.I) for sent in reviews] # Remove English letters (case-insensitive)
    data_cut = [jieba.lcut(sent) for sent in reviews]
    stop_usual = ['\n',' ']
    for words in data_cut:
        temp = []
        words = solve_negative_words(words)
        for pos in range(len(words)):
            word = words[pos]         
            if word not in stopword_list and word not in punctuation and word not in stop_usual:
                temp.append(word)
        data_x.append(temp)
    return 0

for i in range(1,6):
    get_data_cut(eval('reviews_' + str(i)))

"""
Process the data
"""
tokenizer = Tokenizer(num_words=vocab_size)
print(data_x[:5])
tokenizer.fit_on_texts(data_x) # Build the vocabulary
data_ids = tokenizer.texts_to_sequences(data_x) # Convert the texts to integer sequences using the vocabulary
data_padded = pad_sequences(data_ids, maxlen=max_len, padding="post", truncating="post")
data_x = data_padded

labels = []
for i in range(1,6):
    labels.extend(eval('labels_' + str(i)))
data_y = np.array(labels)

print(data_x[:5])
print(data_y[:5])
"""
Split the dataset
"""
from sklearn.model_selection import (StratifiedShuffleSplit)

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_indices, test_indices = next(splitter.split(data_x, data_y))
x_train, y_train = data_x[train_indices], data_y[train_indices]
x_test, y_test = data_x[test_indices], data_y[test_indices] # 1308

# Split the training set further into training and validation sets
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_indices, valid_indices = next(splitter.split(x_train, y_train))
x_valid, y_valid = x_train[valid_indices], y_train[valid_indices] # 1046
x_train, y_train = x_train[train_indices], y_train[train_indices] # 4182

word_index = tokenizer.word_index # vocabulary {word: index}
num_index = {value: key for key, value in word_index.items()} # reverse vocabulary {index: word}
print(len(x_train),len(x_valid),len(x_test))

# Oversample the training set with SMOTE
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
smote = BorderlineSMOTE(sampling_strategy=sampling_strategy, kind='borderline-1',random_state=42)
x_sampling, y_sampling = smote.fit_resample(x_train, y_train)
x_train, y_train = x_sampling, y_sampling

# Count the samples of each label in the training, validation and test sets
# train_i = np.sum(y_train == i)  # 17 22 248 1059 2836
# valid_i = np.sum(y_valid == i)  # 4  6  62  265  709
# test_i = np.sum(y_test == i)    # 5  7  78  331  887

# Convert the training, validation and test labels to binary
y_pos = np.array([0 if i<=3 else 1 for i in data_y])
y_neg = np.array([0 if i>=3 else 1 for i in data_y])

y_train_pos = np.array([0 if i<=3 else 1 for i in y_train])
y_valid_pos =np.array([0 if i<=3 else 1 for i in y_valid])
y_test_pos = np.array([0 if i<=3 else 1 for i in y_test])

y_train_neg = np.array([0 if i>=3 else 1 for i in y_train])
y_valid_neg = np.array([0 if i>=3 else 1 for i in y_valid])
y_test_neg = np.array([0 if i>=3 else 1 for i in y_test])

# Count positive and negative samples in the pos training, validation and test sets
train_pos = np.sum(y_train_pos == 1) # 3895
train_neg = np.sum(y_train_pos == 0) # 287
valid_pos = np.sum(y_valid_pos == 1) # 974
valid_neg = np.sum(y_valid_pos == 0) # 72
test_pos = np.sum(y_test_pos == 1) # 1218
test_neg = np.sum(y_test_pos == 0) # 90

# Count positive and negative samples in the neg training, validation and test sets
train_pos = np.sum(y_train_neg == 1) # 39
train_neg = np.sum(y_train_neg == 0) # 4143
valid_pos = np.sum(y_valid_neg == 1) # 10
valid_neg = np.sum(y_valid_neg == 0) # 1036
test_pos = np.sum(y_test_neg == 1) # 12
test_neg = np.sum(y_test_neg == 0) # 1296

""""
模型训练准备工作
"""
from collections import defaultdict
from tensorflow.keras.layers import (Layer, Input, Embedding, Conv1D, MaxPooling1D,
                                     LSTM, Dense, Dropout, Attention, Bidirectional,
                                     GlobalAveragePooling1D, Concatenate, Dot, Softmax,BatchNormalization,concatenate)
from tensorflow.keras.models import Model

import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.utils import Sequence

class CustomDataGenerator(Sequence):
    def __init__(self, X_train, y_train, batch_size, class_weights=None):
        self.X_train = X_train
        self.y_train = y_train
        self.batch_size = batch_size
        self.class_weights = class_weights

        self.num_classes = len(np.unique(y_train))
        self.indices_per_class = [np.where(y_train == i)[0] for i in range(self.num_classes)]

    def __len__(self):
        return len(self.X_train) // self.batch_size

    def __getitem__(self, index):
        batch_indices = self.sample_batch_indices()
        batch_X = self.X_train[batch_indices]
        batch_y = self.y_train[batch_indices]
        # print('Batch class labels:', batch_y)

        return batch_X, batch_y

    def sample_batch_indices(self):
        batch_indices = []

        if self.class_weights is None:
            for i in range(self.batch_size):
                class_index = i % self.num_classes
                indices = self.indices_per_class[class_index]
                batch_indices.append(np.random.choice(indices))
        else:
            for i in range(self.batch_size):
                class_index = np.random.choice(self.num_classes, p=self.class_weights)
                indices = self.indices_per_class[class_index]
                batch_indices.append(np.random.choice(indices))

        return batch_indices

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

class CustomMetricsCallback(Callback):
    def __init__(self, display_step):
        super().__init__()
        self.display_step = display_step

    def on_train_begin(self, logs=None):
        self.step = 0
        self.custom_history = defaultdict(list)

    def on_batch_end(self, batch, logs=None):
        self.step += 1
        if self.step % self.display_step == 0:
            metrics_log = ''
            for key, value in logs.items():
                self.custom_history[key].append((self.step, value))
                metrics_log += f' - {key}: {value:.4f}'
            print(f'Step: {self.step} {metrics_log}')

# Build the model
def create_model():
    inputs = Input(shape=(max_len,))
    embed = Embedding(vocab_size, embedding_size, input_length=max_len)(inputs)# (None, max_len, embedding_size)
    conv = Conv1D(conv_filters, conv_kernel_size, activation='relu', padding='same')(embed) # (None, max_len, conv_filters)
    pool = MaxPooling1D(pool_size=1)(conv)# (None, max_len, conv_filters); pool_size=1 keeps the sequence length
    lstm = Bidirectional(LSTM(lstm_units, return_sequences=True))(pool)# (None, max_len, 2*lstm_units)
    # pos (positive) classification branch
    attn_pos = Attention(name='attn_pos')([lstm, lstm])
    attn_pos = GlobalAveragePooling1D(name='GAP_pos')(attn_pos)
    dropout_pos = Dropout(dropout_rate, name='drop_pos')(attn_pos)
    dense_pos = Dense(128, activation='relu', name='dense_pos')(dropout_pos)# shape: (None, 128)
    outputs_pos = Dense(1, activation='sigmoid', name='output_pos')(dense_pos)# shape: (None, 1)
    # neg (negative) classification branch
    attn_neg = Attention(name='attn_neg')([lstm, lstm])
    attn_neg = GlobalAveragePooling1D(name='GAP_neg')(attn_neg)
    dropout_neg = Dropout(dropout_rate, name='drop_neg')(attn_neg)
    dense_neg = Dense(128, activation='relu', name='dense_neg')(dropout_neg)# shape: (None, 128)
    outputs_neg = Dense(1, activation='sigmoid', name='output_neg')(dense_neg)# shape: (None, 1)

    model = Model(inputs=inputs, outputs=[outputs_pos,outputs_neg])

    config = tf.compat.v1.ConfigProto(device_count={'GPU': 1})
    sess = tf.compat.v1.Session(config=config)
    K.set_session(sess)

    opt = Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy']) # precision, recall, f1_score
    return model

from keras.wrapper.scikit_learn import KerasClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.multioutput import MultiOutputClassifier
from keras.models import load_model
import os
attn_model_pos = []
attn_model_neg = []
model_pos = []
model_neg = []
real_model_pos = []
real_model_neg = []

def train():
    # Run the model
    keras_classifier = KerasClassifier(build_fn=create_model, epochs=num_epochs, batch_size=batch_size)
    eec = MultiOutputClassifier(EasyEnsembleClassifier(n_estimators=n_estimators, estimator=keras_classifier, random_state=42))
    y_train_multioutput=np.c_[y_train_pos,y_train_neg]
    history = eec.fit(x_train, y_train_multioutput)
    # Access each base classifier's attention layer and its corresponding classification layer
    estimators = eec.estimators_
    for idx, estimator in enumerate(estimators):
        print(f"Base Classifier {idx+1}: {estimator}")
        estimator = estimator.estimators_
        print(estimator)
        for id, pipline in enumerate(estimator):
            real_classifier = pipline.named_steps['classifier']
            real_model = real_classifier.model
            if idx == 0:
                real_model_pos.append(real_model)
                attn_pos = Model(inputs=real_model.input, outputs=real_model.get_layer('drop_pos').output)
                classify_pos = Model(inputs=real_model.input, outputs=real_model.get_layer('output_pos').output)
                attn_model_pos.append(attn_pos)
                model_pos.append(classify_pos)
            else:
                real_model_neg.append(real_model)
                attn_neg = Model(inputs=real_model.input, outputs=real_model.get_layer('drop_neg').output)
                classify_neg = Model(inputs=real_model.input, outputs=real_model.get_layer('output_neg').output)
                attn_model_neg.append(attn_neg)
                model_neg.append(classify_neg)
    # Save the trained models
    for i in range(len(real_model_pos)):
        model = real_model_pos[i]
        dir = f'model/model_pos_{i}.h5'
        model.save(dir)

        model = real_model_neg[i]
        dir = f'model/model_neg_{i}.h5'
        model.save(dir)

def load():
    # Specify the folder path
    folder_path = './model/'
    # Get a list of all file and folder names in the directory
    file_names = os.listdir(folder_path)
    for i in range(int(len(file_names)/2)):
        real_model = load_model(f'./model/model_pos_{i}.h5')
        attn_pos = Model(inputs=real_model.input, outputs=real_model.get_layer('drop_pos').output)
        classify_pos = Model(inputs=real_model.input, outputs=real_model.get_layer('output_pos').output)
        attn_model_pos.append(attn_pos)
        model_pos.append(classify_pos)

        real_model = load_model(f'./model/model_neg_{i}.h5')
        attn_neg = Model(inputs=real_model.input, outputs=real_model.get_layer('drop_neg').output)
        classify_neg = Model(inputs=real_model.input, outputs=real_model.get_layer('output_neg').output)
        attn_model_neg.append(attn_neg)
        model_neg.append(classify_neg)

def get_largest_indices(lst, n): # Get the indices of the largest values
    indices = sorted(range(len(lst)), key=lambda i: lst[i], reverse=True)
    return indices[:n]

word_index = tokenizer.word_index # vocabulary {word: index}
num_index = {value: key for key, value in word_index.items()} # reverse vocabulary {index: word}

# Use the attention layers of all base classifiers for prediction
def get_classify_result(ax, tend):
    classify_res_lst= []
    if tend == 'positive':
        for classify_model in model_pos:
            result = classify_model.predict(ax)
            classify_res_lst.append(result)
    else:
        for classify_model in model_neg:
            result = classify_model.predict(ax)
            classify_res_lst.append(result)
    return classify_res_lst

def get_all_attention_words_dict(ax, ay, num, tend, y_pred): # Get the words attention focuses on, returned as dictionaries
    attn_res = []
    if tend == 'positive':
        for attn_model in attn_model_pos:
            attn_result = attn_model.predict(ax)
            attn_res.append(attn_result)
    else:
        for attn_model in attn_model_neg:
            attn_result = attn_model.predict(ax)
            attn_res.append(attn_result)
    attn_res_mean = sum(attn_res)/len(attn_res)
    attn_neg = {}
    attn_pos = {}
    for attn in range(len(attn_res_mean)):
        label = ay[attn]
        pred_label = y_pred[attn]
        pos = get_largest_indices(attn_res_mean[attn],num)
        padded_sentence = [num_index[i] for i in ax[attn] if i != 0]
        sentence = [padded_sentence[i] for i in pos if i<len(padded_sentence)]
        if pred_label != label:
            continue
        if label == 1:
            for i in sentence:
                if attn_pos.get(i) == None:
                    attn_pos[i] = 1
                else:
                    attn_pos[i] += 1
        else:
            for i in sentence:
                if attn_neg.get(i) == None:
                    attn_neg[i] = 1
                else:
                    attn_neg[i] += 1
    return attn_pos, attn_neg

# Return words with high attention scores and high frequency
def get_max_attn_word(adict):
    res = []
    for key,value in adict.items():
        res.append((key,value))
    res = sorted(res, key= lambda review: review[1], reverse=True)
    return res

from sklearn.metrics import classification_report
from collections import Counter
import sklearn.metrics as metrics
def get_res(x, y, tend):
    result = get_classify_result(x, tend)
    result = sum(result)/len(result)
    y_pred = (result >= 0.5).astype(int)
    attn_pos, attn_neg = get_all_attention_words_dict(x, y, attention_num, tend, y_pred)
    print(Counter(y))
    print(metrics.confusion_matrix(y, y_pred))
    res = get_max_attn_word(attn_pos)
    res_2 = get_max_attn_word(attn_neg)
    return res, res_2

# Decide whether to train or load the model
load()
res_pos, _ = get_res(x_test, y_test_pos, 'positive')
res_neg, _ = get_res(x_test, y_test_neg, 'negative')   

res_pos, res_2 = get_res(data_x, y_pos, 'positive')
res_neg, res_3 = get_res(data_x, y_neg, 'negative')

with open('attention_word.txt','w') as f:
    f.write('低评分关注的词语:(12正类)')
    f.write('\n')
    for idx, attention in enumerate(res_neg):
        f.write(attention[0]+',')
    f.write('\n')
    f.write('\n')

    f.write('此时高评分关注的词语:(12正类)')
    f.write('\n')
    for idx, attention in enumerate(res_3):
        if idx <= int(len(res_neg)):
            f.write(attention[0]+',')
    f.write('\n')
    f.write('\n')

    f.write('高评分关注的词语:(45正类)')
    f.write('\n')
    for idx, attention in enumerate(res_pos):
        if idx <= int(len(res_neg)):
            f.write(attention[0]+',')
    f.write('\n')
    f.write('\n')

    f.write('此时低评分关注的词语:(45正类)')
    f.write('\n')
    for idx, attention in enumerate(res_2):
        if idx <= int(len(res_neg)):
            f.write(attention[0]+',')
Error:
Traceback (most recent call last):
  File "C:\Users\admin\Desktop\run.py", line 302, in <module>
    from keras.wrapper.scikit_learn import KerasClassifier
ModuleNotFoundError: No module named 'keras.wrapper'


Posted on 2023-9-29 11:13:49
From the traceback it looks like a module problem. Did you mistype the module name, or is the import path wrong?
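
A minimal sketch of the corrected import, assuming a TensorFlow-bundled Keras that still ships the legacy scikit-learn wrapper (roughly TF 2.11 and earlier); the module is named wrappers with an "s", not wrapper:

# Corrected import (sketch; assumes the legacy wrapper still exists in your TF/Keras version)
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# On an older standalone Keras install the equivalent would be:
# from keras.wrappers.scikit_learn import KerasClassifier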

Posted on 2023-9-29 11:14:42
Or maybe that module simply isn't installed at all.
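
If the wrapper is genuinely missing (recent TensorFlow/Keras releases dropped keras.wrappers entirely), the separate scikeras package is the usual replacement. A sketch under that assumption, noting that scikeras names the build argument model rather than build_fn:

# Assumption: scikeras has been installed, e.g. via `pip install scikeras`
from scikeras.wrappers import KerasClassifier

# scikeras takes `model=` where the old wrapper took `build_fn=`
keras_classifier = KerasClassifier(model=create_model, epochs=num_epochs, batch_size=batch_size)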