
How can I implement Word2Vec in TensorFlow? Please provide detailed code.

Here is a simple code example that implements Word2Vec (the CBOW variant) with TensorFlow:

import numpy as np
# The code below uses the TensorFlow 1.x graph/session API; the compat import
# keeps it runnable on TensorFlow 2.x installations as well.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Define a toy corpus
corpus = ["I like to play football",
          "I love pizza",
          "I enjoy coding"]

# Build the vocabulary
words = [sentence.split() for sentence in corpus]
flatten_words = [word for sublist in words for word in sublist]
vocabulary = list(set(flatten_words))
vocab_size = len(vocabulary)

# Build the word2idx and idx2word lookup tables
word2idx = {w: i for i, w in enumerate(vocabulary)}
idx2word = {i: w for i, w in enumerate(vocabulary)}
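
# Note: the index assigned to each word depends on the iteration order of the set
# above, so word2idx may differ between runs (e.g. {"I": 0, "like": 1, ...}).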

# Build training samples (CBOW: the inputs are the context words, the output is the center word)
data = []
for sentence in words:
    idxs = [word2idx[word] for word in sentence]
    for center_word_pos in range(len(idxs)):
        context_word_idxs = [idxs[i] for i in range(center_word_pos - 1, center_word_pos + 2)
                             if 0 <= i < len(idxs) and i != center_word_pos]
        # Skip center words at the sentence boundaries that lack a full 2-word context,
        # so every sample matches the fixed [None, 2] placeholder shape defined below
        if len(context_word_idxs) == 2:
            data.append((context_word_idxs, idxs[center_word_pos]))
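
# For example, in "I like to play football" the center word "like" has the context
# ["I", "to"], giving the sample ([word2idx["I"], word2idx["to"]], word2idx["like"]);
# the boundary words "I" and "football" are skipped because they lack a full window.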

# Model hyperparameters
embedding_dim = 128
learning_rate = 0.1

# Placeholders and trainable variables
context_inputs = tf.placeholder(tf.int32, shape=[None, 2])
target_input = tf.placeholder(tf.int32, shape=[None])
embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_dim], -1.0, 1.0))

# Look up the context-word embeddings and average them (CBOW-style)
context_embeds = tf.reduce_mean(tf.nn.embedding_lookup(embeddings, context_inputs), axis=1)
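# context_embeds now has shape [batch_size, embedding_dim]: one averaged context vector per sample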

# Loss (full softmax over the vocabulary) and optimizer
softmax_weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_dim], stddev=0.1))
softmax_biases = tf.Variable(tf.zeros([vocab_size]))
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=tf.matmul(context_embeds, tf.transpose(softmax_weights)) + softmax_biases,
    labels=target_input))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

# Train the model
num_epochs = 1000
batch_size = 16

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(num_epochs):
        np.random.shuffle(data)
        total_loss = 0
        
        for start in range(0, len(data), batch_size):
            end = start + batch_size
            batch_data = data[start:end]
            
            contexts_batch, targets_batch = zip(*batch_data)
            
            _, cur_loss = sess.run([optimizer, loss],
                                   feed_dict={context_inputs: np.array(contexts_batch),
                                              target_input: np.array(targets_batch)})
            total_loss += cur_loss
            
        if (epoch+1) % 100 == 0:
            num_batches = int(np.ceil(len(data) / batch_size))
            print("Epoch:", '%04d' % (epoch+1), "Avg. Loss=", "{:.4f}".format(total_loss / num_batches))
    
    # Fetch the trained word embeddings
    trained_embeddings = sess.run(embeddings)

# Print each word and its embedding vector
for i in range(vocab_size):
    word = idx2word[i]
    embedding_vector = trained_embeddings[i]
    print(word, embedding_vector)
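
One quick way to inspect the result is to compare two words by cosine similarity. The helper below is only an illustrative sketch; with a three-sentence toy corpus the numbers will not be meaningful, but the same code applies to embeddings trained on real data.

def cosine_similarity(vec_a, vec_b):
    # Cosine similarity between two word vectors (the small epsilon avoids division by zero)
    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-8)

print("similarity(football, pizza) =",
      cosine_similarity(trained_embeddings[word2idx["football"]],
                        trained_embeddings[word2idx["pizza"]]))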

This is only a simple Word2Vec implementation, provided for reference. In real applications, a more complex model and a much larger corpus are usually needed to obtain good results.
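
With a realistic vocabulary, the full softmax above becomes expensive because every update touches all vocab_size output rows, so practical Word2Vec implementations usually train with a sampled loss such as noise-contrastive estimation (NCE) or negative sampling. As a rough sketch (reusing the variables defined above; num_sampled is a hyperparameter and must not exceed the vocabulary size), the full-softmax loss could be swapped for tf.nn.nce_loss like this:

# Sketch: replace the full-softmax loss with noise-contrastive estimation (NCE)
num_sampled = 5  # negative samples drawn per training example (<= vocab_size)
nce_loss = tf.reduce_mean(tf.nn.nce_loss(
    weights=softmax_weights,   # [vocab_size, embedding_dim]
    biases=softmax_biases,     # [vocab_size]
    labels=tf.cast(tf.reshape(target_input, [-1, 1]), tf.int64),  # nce_loss expects shape [batch, 1]
    inputs=context_embeds,     # [batch, embedding_dim]
    num_sampled=num_sampled,
    num_classes=vocab_size))
nce_optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(nce_loss)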


Content provided by the 0voice teaching AI assistant; the question comes from a student.
