Here is a simple code example implementing Word2Vec (the CBOW variant) with TensorFlow:
import numpy as np
# The example uses the TensorFlow 1.x graph API; on TensorFlow 2.x it runs via the compat module
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
# Define a toy corpus
corpus = ["I like to play football",
          "I love pizza",
          "I enjoy coding"]
# Build the vocabulary
words = [sentence.split() for sentence in corpus]
flatten_words = [word for sublist in words for word in sublist]
vocabulary = sorted(set(flatten_words))  # sorted so the word-to-index mapping is deterministic
vocab_size = len(vocabulary)
# Build the word2idx and idx2word lookup tables
word2idx = {w: i for i, w in enumerate(vocabulary)}
idx2word = {i: w for i, w in enumerate(vocabulary)}
# Build training samples (input: context words, output: center word), window size 1
data = []
for sentence in words:
    idxs = [word2idx[word] for word in sentence]
    for center_word_pos in range(len(idxs)):
        context_word_idxs = [idxs[i] for i in range(center_word_pos - 1, center_word_pos + 2)
                             if 0 <= i < len(idxs) and i != center_word_pos]
        # Keep only full windows so every sample has exactly two context words,
        # matching the fixed placeholder shape [None, 2] below
        if len(context_word_idxs) == 2:
            data.append((context_word_idxs, idxs[center_word_pos]))
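# For example, with window size 1 the sentence "I love pizza" contributes one
# full-window sample: ([word2idx["I"], word2idx["pizza"]], word2idx["love"]).
# The edge words "I" and "pizza" are skipped because each has only one neighbor.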
# Model hyperparameters
embedding_dim = 128
learning_rate = 0.1
# Define placeholders and variables
context_inputs = tf.placeholder(tf.int32, shape=[None, 2])
target_input = tf.placeholder(tf.int32, shape=[None])
embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_dim], -1.0, 1.0))
# Average the embeddings of the context words (the CBOW input)
context_embeds = tf.reduce_mean(tf.nn.embedding_lookup(embeddings, context_inputs), axis=1)
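# Shapes: embedding_lookup returns [batch, 2, embedding_dim]; averaging over axis 1
# collapses the two context vectors into a single [batch, embedding_dim] input.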
# Define the loss (a full softmax over the vocabulary) and the optimizer
softmax_weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_dim], stddev=0.1))
softmax_biases = tf.Variable(tf.zeros([vocab_size]))
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=tf.matmul(context_embeds, tf.transpose(softmax_weights)) + softmax_biases,
    labels=target_input))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
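# Note: a full softmax is fine for this toy vocabulary, but real Word2Vec training
# replaces it with a sampled loss such as NCE / negative sampling. A hedged sketch
# (num_sampled is an illustrative hyperparameter; nce_loss expects int64 labels of
# shape [batch, 1]):
# loss = tf.reduce_mean(tf.nn.nce_loss(
#     weights=softmax_weights, biases=softmax_biases,
#     labels=tf.expand_dims(tf.cast(target_input, tf.int64), 1),
#     inputs=context_embeds, num_sampled=4, num_classes=vocab_size))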
# Train the model
num_epochs = 1000
batch_size = 16
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(num_epochs):
        np.random.shuffle(data)
        total_loss = 0
        for start in range(0, len(data), batch_size):
            end = start + batch_size
            batch_data = data[start:end]
            contexts_batch, targets_batch = zip(*batch_data)
            _, cur_loss = sess.run([optimizer, loss],
                                   feed_dict={context_inputs: np.array(contexts_batch),
                                              target_input: np.array(targets_batch)})
            # cur_loss is a per-batch mean; weight it by the batch size so the
            # epoch average below is a true per-sample average
            total_loss += cur_loss * len(batch_data)
        if (epoch + 1) % 100 == 0:
            print("Epoch:", '%04d' % (epoch + 1), "Avg. Loss=", "{:.4f}".format(total_loss / len(data)))
    # Fetch the trained word vectors while the session is still open
    trained_embeddings = sess.run(embeddings)
# Print each word and its vector
for i in range(vocab_size):
    word = idx2word[i]
    embedding_vector = trained_embeddings[i]
    print(word, embedding_vector)
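With the vectors in hand, a quick sanity check is to compare words by cosine similarity. A minimal sketch using only NumPy and the trained_embeddings array from above (on such a tiny corpus the similarities will not be very meaningful):
def cosine_sim(w1, w2):
    # Cosine similarity between the trained vectors of two words
    v1 = trained_embeddings[word2idx[w1]]
    v2 = trained_embeddings[word2idx[w2]]
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

print("sim(like, enjoy) =", cosine_sim("like", "enjoy"))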
This is only a minimal Word2Vec implementation, for reference. In real applications you will typically need a more scalable model (for example negative sampling instead of the full softmax) and a much larger corpus to obtain useful vectors.
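For practical work it is often easier to use an existing library. A minimal sketch with gensim (this assumes gensim 4.x, where the embedding size parameter is named vector_size; all hyperparameters here are illustrative):
from gensim.models import Word2Vec

# Train a skip-gram model (sg=1) on the tokenized sentences
sentences = [s.split() for s in corpus]
model = Word2Vec(sentences, vector_size=100, window=2, min_count=1, sg=1, epochs=50)
print(model.wv["pizza"])              # the learned vector for "pizza"
print(model.wv.most_similar("like"))  # nearest neighbors by cosine similarity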