4.4 5 投票数

Glove 提供了预训练的数据，矩阵包含 40 万个词向量，每个维数为 50。

Glove 也提供了大量的训练样本和训练程序，网上也能找到大量的训练代码，只需要下载训练数据，然后运行官方提供的脚本，就可以完成训练。不过这一过程将耗费非常长的时间。

wordsList.npy：

wordVectors.npy:

import numpy as np

# Load the pre-trained GloVe vocabulary (400k words per the article) and the
# matching word-vector matrix, then look a single word up as a sanity check.
wordsList = np.load('wordsList.npy')  # BUG FIX: the array was never loaded before use
wordsList = wordsList.tolist()
# The .npy file stores entries as bytes; decode so they compare equal to str.
wordsList = [word.decode('UTF-8') for word in wordsList]
wordVectors = np.load('wordVectors.npy')

print(wordsList.index("hello"))
print(wordVectors[wordsList.index("hello")])

import numpy as np

# Map a short sentence to a fixed-length vector of word ids, then look up
# each word's embedding vector.
wordsList = np.load('wordsList.npy')  # BUG FIX: re-load; calling .tolist() on the already-decoded list would raise
wordsList = wordsList.tolist()
wordsList = [word.decode('UTF-8') for word in wordsList]

maxLength = 10       # every sentence is padded/truncated to 10 ids
numDimensions = 300  # NOTE(review): the GloVe file described above is 50-d — confirm this value
s = "my name is hello world"
si = np.zeros(maxLength, dtype='int32')
s = s.strip().split()
for i in range(len(s)):
    si[i] = wordsList.index(s[i])
print(si.shape)
print(si)
for i in range(len(s)):  # BUG FIX: was range(len(s[i])) — iterated over one word's characters
    print(wordVectors[si[i]])

with open("train.txt", "r", encoding='utf-8') as f:
while True:
if not line:
break
txt = line.split("\t")[2]
counter = len(txt.split())
numWords.append(counter)
import matplotlib.pyplot as plt
plt.hist(numWords, 50)
plt.xlabel('Sequence Length')
plt.ylabel('Frequency')
plt.axis([0, 40, 0, 1000])
plt.show()

maxSeqLength = 30  # fixed length of every id row

# Convert one example sentence into a fixed-length vector of word indices.
record = "One of my best 8th graders Kory was excited after his touchdown"
tmp = np.zeros(maxSeqLength, dtype='int32')
indexCounter = 0
line = record
# cleanSentences is defined elsewhere in the article — lower-cases and
# strips special characters (presumably; confirm against the final code).
cleanedLine = cleanSentences(line)
split = cleanedLine.split()
for word in split:
    try:
        tmp[indexCounter] = wordsList.index(word)
    except ValueError:
        # UNKNOWM: id of the unknown-word vector; presumably defined
        # earlier in the full article — TODO confirm its value.
        tmp[indexCounter] = UNKNOWM  # Vector for unknown words
    indexCounter = indexCounter + 1
    if indexCounter >= maxSeqLength:  # BUG FIX: guard against sentences longer than maxSeqLength
        break
print(tmp)

# Fill the ids matrix row by row for every positive-labelled sentence.
# sentence_index and ids are assumed to be initialised before this loop.
for positive_sentence in positive:
    print("Processing new positive record", sentence_index)
    index_counter = 0  # position of the current word within this sentence
    cleaned_sentence = cleanSentences(positive_sentence)
    split_sentence = cleaned_sentence.split()
    for word in split_sentence:
        try:
            ids[sentence_index][index_counter] = wordsList.index(word)
        except ValueError:
            # UNKNOWM: unknown-word id — presumably defined earlier; TODO confirm.
            ids[sentence_index][index_counter] = UNKNOWM
        index_counter = index_counter + 1
        if index_counter >= maxSeqLength:  # truncate overlong sentences
            break
    sentence_index = sentence_index + 1

positive 列表中保存了 train.txt 中所有标签为 positive 的句子；这里对该列表做一次遍历，依次处理其中的每一个句子。

index_counter记录的是当前处理到的是这个句子的第几个单词。

def get_sentence_ids(sentence, ids, sentence_index):
    """Fill row *sentence_index* of *ids* with the word indices of *sentence*.

    Words missing from the module-level wordsList get the UNKNOWM id.
    At most maxSeqLength words are written; the rest is truncated.
    Mutates *ids* in place and returns None.
    """
    index_counter = 0
    # NOTE(review): this version calls cleanSentences while the final code
    # below defines clean_sentences — confirm which helper is meant.
    cleaned_sentence = cleanSentences(sentence)
    split_sentence = cleaned_sentence.split()
    for word in split_sentence:
        try:
            ids[sentence_index][index_counter] = wordsList.index(word)
        except ValueError:
            # UNKNOWM: unknown-word id — presumably defined earlier; TODO confirm.
            ids[sentence_index][index_counter] = UNKNOWM
        index_counter = index_counter + 1
        if index_counter >= maxSeqLength:
            break

def get_ids():
    """Build the (numFiles, maxSeqLength) int32 matrix of word ids for all
    sentences: positive rows first, then negative, then neutral.

    Relies on module-level numFiles, positive, negative, neutral,
    maxSeqLength and get_sentence_ids().
    """
    ids = np.zeros((numFiles, maxSeqLength), dtype='int32')
    sentence_index = 0
    for sentence in positive:
        print("Processing new positive record", sentence_index)
        get_sentence_ids(sentence, ids, sentence_index)
        sentence_index = sentence_index + 1

    for sentence in negative:
        get_sentence_ids(sentence, ids, sentence_index)
        sentence_index = sentence_index + 1

    for sentence in neutral:
        get_sentence_ids(sentence, ids, sentence_index)
        sentence_index = sentence_index + 1

    return ids  # BUG FIX: the matrix was built but never returned or saved

# Read train.txt once, splitting records into three lists by label and
# collecting per-record word counts.
numWords = []
positive = []
negative = []
neutral = []
with open("train.txt", "r", encoding='utf-8') as f:
    while True:
        line = f.readline()  # BUG FIX: the loop never read a line
        if not line:
            break
        # Records are tab-separated: id \t label \t text
        label = line.split("\t")[1]
        text = line.split("\t")[2]
        if label == "positive":
            positive.append(text)
        elif label == "negative":
            negative.append(text)
        else:
            neutral.append(text)
        counter = len(text.split())
        numWords.append(counter)

# Pattern of every character that is NOT a letter, digit or space;
# used by clean_sentences() below.
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")


def read_file(input_file):
    """Read the tab-separated training file and split records by label.

    Each line has the form ``id \\t label \\t text``.  Returns three lists
    (positive, negative, neutral) of the raw text fields (newline kept).
    """
    # BUG FIX: the scraped version contained stray np.save(output_file, ids)
    # lines here that referenced undefined names (duplicated from get_ids);
    # saving belongs in get_ids, not in the reader.
    positive = []
    negative = []
    neutral = []
    with open(input_file, "r", encoding='utf-8') as f:
        while True:
            line = f.readline()  # BUG FIX: the loop never read a line
            if not line:
                break
            label = line.split("\t")[1]
            text = line.split("\t")[2]
            if label == "positive":
                positive.append(text)
            elif label == "negative":
                negative.append(text)
            else:
                neutral.append(text)
    return positive, negative, neutral

def clean_sentences(string, _pattern=re.compile(r"[^A-Za-z0-9 ]+")):
    """Lower-case *string*, turn HTML line breaks into spaces and drop
    every character that is not a letter, digit or space.

    The compiled pattern is bound once as a default argument, so the
    function no longer depends on the module-level strip_special_chars
    constant (same pattern, same behaviour).
    """
    # BUG FIX (idiom): the original called .lower() twice; once is enough.
    lowered = string.lower().replace("<br />", " ")
    return _pattern.sub("", lowered)

def get_sentence_ids(sentence, ids, sentence_index):
    """Fill row *sentence_index* of *ids* with the word indices of *sentence*.

    Uses clean_sentences() for normalisation; words missing from the
    module-level wordsList get the UNKNOWM id.  At most maxSeqLength
    words are written.  Mutates *ids* in place and returns None.
    """
    index_counter = 0
    cleaned_sentence = clean_sentences(sentence)
    split_sentence = cleaned_sentence.split()
    for word in split_sentence:
        try:
            ids[sentence_index][index_counter] = wordsList.index(word)
        except ValueError:
            # UNKNOWM: id of the unknown-word vector — presumably defined
            # elsewhere in the full article; TODO confirm its value.
            ids[sentence_index][index_counter] = UNKNOWM
        index_counter = index_counter + 1
        if index_counter >= maxSeqLength:  # truncate overlong sentences
            break

def get_ids(input_file, output_file):
    """Convert every record of *input_file* into a row of word ids and save
    the resulting (num_files, maxSeqLength) int32 matrix to *output_file*.

    Rows are ordered positive, then negative, then neutral.
    """
    # BUG FIX: input_file was never used and positive/negative/neutral
    # were undefined inside the function — read them here.
    positive, negative, neutral = read_file(input_file)
    num_files = len(positive) + len(negative) + len(neutral)
    ids = np.zeros((num_files, maxSeqLength), dtype='int32')
    sentence_index = 0
    for sentence in positive:
        print("Processing new positive record", sentence_index)
        get_sentence_ids(sentence, ids, sentence_index)
        sentence_index = sentence_index + 1

    for sentence in negative:
        print("Processing new negative record", sentence_index)
        get_sentence_ids(sentence, ids, sentence_index)
        sentence_index = sentence_index + 1

    for sentence in neutral:
        print("Processing new neutral record", sentence_index)
        get_sentence_ids(sentence, ids, sentence_index)
        sentence_index = sentence_index + 1

    np.save(output_file, ids)
    print("Save successfully.")

# Matrix of word-id sequences produced by get_ids(); one row per record.
test_ids = np.load('my_idsMatrix.npy')

# Hyperparameters for the LSTM classifier.
# NOTE(review): the batch functions below reference batch_size and
# max_sequence_length (snake_case) while these constants are camelCase —
# confirm which spelling is canonical in the runnable version.
batchSize = 24        # records per training batch
lstmUnits = 64        # hidden units in the LSTM cell
numClasses = 3        # positive / negative / neutral
iterations = 100000   # total training steps

def get_train_batch():
    """Return one random training batch: (word-id rows, one-hot labels).

    Picks rows round-robin across the three classes so every batch is
    roughly class-balanced.  Relies on module-level batch_size,
    max_sequence_length and test_ids.
    """
    accurate_label = []
    array = np.zeros([batch_size, max_sequence_length])
    for index in range(batch_size):
        # Hard-coded per-class row ranges in the ids matrix: presumably
        # rows 1-3646 positive, 3647-5107 negative, 5108-9683 neutral —
        # TODO confirm against the dataset that was converted.
        if index % 3 == 0:
            num = randint(1, 3646)
            accurate_label.append([1, 0, 0])
        elif index % 3 == 1:
            num = randint(3647, 5107)
            accurate_label.append([0, 1, 0])
        else:
            num = randint(5108, 9683)
            accurate_label.append([0, 0, 1])
        # test_ids[num-1:num] is a (1, max_sequence_length) slice;
        # assignment broadcasts it into row *index*.
        array[index] = test_ids[num - 1:num]
    return array, accurate_label

def get_validate_batch():
    """Return one random validation batch: (word-id rows, one-hot labels).

    Unlike get_train_batch, rows are drawn uniformly from the whole
    validation range and the label is inferred from the row number.
    Relies on module-level batch_size, max_sequence_length and test_ids.
    """
    accurate_label = []
    array = np.zeros([batch_size, max_sequence_length])
    for index in range(batch_size):
        # Presumed validation layout: rows 1-579 positive, 580-921
        # negative, 922-1654 neutral — TODO confirm against the dataset.
        num = randint(1, 1654)
        if num <= 579:
            accurate_label.append([1, 0, 0])
        elif num <= 921:
            accurate_label.append([0, 1, 0])
        else:
            accurate_label.append([0, 0, 1])
        # (1, max_sequence_length) slice broadcast into row *index*.
        array[index] = test_ids[num - 1:num]
    return array, accurate_label

# Build the TensorFlow 1.x computation graph for the LSTM classifier.
tf.reset_default_graph()

# One batch of one-hot labels and word-id sequences.
labels = tf.placeholder(tf.float32, [batch_size, numClasses])
input_data = tf.placeholder(tf.int32, [batch_size, max_sequence_length])

# Embedded input tensor.  NOTE(review): numDimensions was set to 300
# earlier while the article's GloVe vectors are 50-d — confirm.
data = tf.Variable(tf.zeros([batch_size, max_sequence_length, numDimensions]), dtype=tf.float32)

# Single LSTM layer with dropout on the outputs.
lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstm_cell = tf.contrib.rnn.DropoutWrapper(cell=lstm_cell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstm_cell, data, dtype=tf.float32)

# Classify from the LSTM output at the last time step.
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))  # BUG FIX: missing closing parenthesis
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

correctPred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))

accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))

optimizer = tf.train.AdamOptimizer().minimize(loss)

# TensorBoard summaries, written to a timestamped log directory.
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
# NOTE(review): sess is only created further below (tf.InteractiveSession());
# in runnable code this line must come after the session exists.
writer = tf.summary.FileWriter(logdir, sess.graph)

在命令行中运行以下命令即可启动 TensorBoard 查看训练曲线：tensorboard --logdir=tensorboard

# Train the network, periodically logging summaries and checkpointing.
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
    # Next batch of reviews.
    nextBatch, nextBatchLabels = get_train_batch()
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})

    # Write summary to TensorBoard every 50 steps.
    if i % 50 == 0:
        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        writer.add_summary(summary, i)  # BUG FIX: summary was computed but never written

    # Save the network every 1,000 training iterations.
    if i % 1000 == 0 and i != 0:
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
writer.close()

# Restore the latest checkpoint and measure accuracy on random
# validation batches.
sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('models'))

iterations = 100
total = 0
for i in range(iterations):
    print("i=", i, " ", end='')
    nextBatch, nextBatchLabels = get_validate_batch()
    # BUG FIX: the accuracy op was run twice per batch; run once and reuse.
    batch_accuracy = sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels}) * 100
    print("Accuracy for this batch:", batch_accuracy)
    total += batch_accuracy

print("Average Accuracy:", total / iterations)

• read_file就只需要依次读入，然后保存，不需要按标签分 3 个列表保存
• 后面处理ids的时候也只需要全部遍历计算，不需要分不同的标签
• 生成 batch 的时候只需要完全随机采样，不需要按标签轮流取样
• 最后得到结果，只需要找出最大值所在的下标，不需要与标签做比对计算准确率

4.4 5 投票数

101条留言

（可选）如果您也有个人网站，不妨分享一下

101 评论

2019年10月19日 13:49

import numpy as np
wordsList = wordsList.tolist()
wordsList = [word.decode(‘UTF-8’) for word in wordsList]

win10环境，请问是环境不对吗？有什么解决办法吗？

2019年10月19日 14:24

@凝神长老谢谢，我再试一下。我只是个新手，希望借这个项目实践一下。。

http://www.woria.xyz

2019年10月19日 14:28

@凝神长老您新上传的文件可以正常使用了，谢谢。

32332

2023年8月19日 23:02

@eweq一般来说，这些文件可能是特定项目或数据集的一部分。你可以尝试在相应的项目或数据集的官方网站或资源库中查找这些文件。通常，这些文件会在项目的文档或下载页面中提供。

Seraph

2019年5月14日 11:03

2024年3月26日 20:21

wordsList = wordsList.tolist()
wordsList = [word.decode(‘UTF-8’) for word in wordsList]

2019年5月2日 02:04

2019年5月3日 12:28

@凝神长老懂了，谢谢！

66101351

2024年3月20日 09:20

impot numpy as np
wordsList = wordsList.tolist()
wordsList = [word.decode(‘UTF-8’) for word in wordsList]

189****0161
2023年11月23日 20:15

zab

taoyuan
2023年8月15日 21:05

2023年8月19日 23:00

@taoyuan一般来说，这些文件可能是特定项目或数据集的一部分。你可以尝试在相应的项目或数据集的官方网站或资源库中查找这些文件。通常，这些文件会在项目的文档或下载页面中提供。

taoyuan
2023年8月15日 16:24

taoyuan
2023年8月15日 15:41

taoyuan
2023年8月15日 15:36

taoyuan
2023年8月15日 15:08

taoyuan
2023年8月15日 15:03

freemaple456
2023年5月16日 10:14

Eschatology

2022年11月28日 11:40

test

1 5 6 7