以下是一个使用TensorFlow 2.0实现Word2Vec的简单示例代码:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
# Prepare the toy corpus: each sentence is a list of tokens.
sentences = [["I", "love", "tensorflow"],
             ["I", "love", "deep", "learning"],
             ["I", "enjoy", "coding"]]

# Build the vocabulary. Sorting makes the word->index mapping
# deterministic across runs: plain set iteration order depends on the
# per-process string hash seed, so the original mapping (and therefore
# every trained embedding index) was irreproducible.
vocab = set()
for sentence in sentences:
    vocab.update(sentence)
vocab = sorted(vocab)
vocab_size = len(vocab)

# Word <-> index lookup tables.
word2idx = {w: idx for idx, w in enumerate(vocab)}
idx2word = {idx: w for idx, w in enumerate(vocab)}

# Encode each sentence as a list of vocabulary indices.
data = [[word2idx[word] for word in sentence] for sentence in sentences]
# Model: an embedding lookup, mean-pooled over the input positions,
# followed by a small dense head that scores every vocabulary word
# (raw logits; the loss applies softmax via from_logits=True).
embedding_dim = 128

model = tf.keras.Sequential()
model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(units=embedding_dim, activation='relu'))
model.add(layers.Dense(units=vocab_size))
# Build (context, target) training pairs. The original code called
# model.fit(data, epochs=...) with a ragged nested list and NO labels,
# which raises at runtime: SparseCategoricalCrossentropy needs integer
# targets. Here each word is predicted from one neighbour within a
# +/-window skip-gram window, giving fixed-shape inputs of length 1.
window = 1
contexts, targets = [], []
for seq in data:
    for i, target in enumerate(seq):
        for j in range(max(0, i - window), min(len(seq), i + window + 1)):
            if j != i:
                contexts.append([seq[j]])
                targets.append(target)

x = np.array(contexts)  # shape (num_pairs, 1) -> Embedding input
y = np.array(targets)   # shape (num_pairs,)   -> sparse class labels

# Compile: Adam + sparse CE over the raw logits of the final Dense layer.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model.
epochs = 100
history = model.fit(x, y, epochs=epochs)
# Extract the learned word vectors: the Embedding layer's weight
# matrix, shape (vocab_size, embedding_dim); row i is word idx2word[i].
embedding_matrix = model.layers[0].get_weights()[0]

# Print every word's embedding. The original loop started at index 1
# and silently skipped the word mapped to index 0.
for i in range(vocab_size):
    print("词:", idx2word[i], "\t嵌入向量:", embedding_matrix[i])
这是一个简单的Word2Vec实现示例,包含了模型的构建、编译和训练过程。在训练完成后,可以通过获取嵌入层的权重矩阵来得到每个单词的词向量表示。
内容由零声教学AI助手提供,问题来源于学员提问