1.单词级的one-hot编码(numpy编写)

# Word-level one-hot encoding (plain NumPy).
import numpy as np

samples = ['this cat sat on the mat', "this dog ate my homework"]

# Map every unique word to a unique integer index. Indices start at 1,
# so index 0 is never assigned to any word.
token_index = {}
for sample in samples:
    for word in sample.split():  # split the sentence into individual words
        if word not in token_index:
            token_index[word] = len(token_index) + 1

# Only the first max_length words of each sample are encoded.
# NOTE: with max_length = 1 just the first word of every sample is kept;
# raise it (e.g. to 10) to encode whole sentences.
max_length = 1

# results[i, j, k] == 1 means that word j of sample i has index k.
# The last axis has max(index) + 1 entries because index 0 is unused.
results = np.zeros(
    shape=(len(samples), max_length, max(token_index.values()) + 1)
)
for i, sample in enumerate(samples):
    # Slice before enumerating so no throwaway list of pairs is built.
    for j, word in enumerate(sample.split()[:max_length]):
        # Direct lookup on purpose: a dict.get() miss would return None,
        # which NumPy indexing treats as a new axis and would silently set
        # the entire row to 1 instead of failing.
        index = token_index[word]
        results[i, j, index] = 1

print(results)

结果：

从图中可以看出，one-hot 矩阵的形状为（15, 11）：共 11 列，对应索引 0～10，其中索引 0 未分配给任何单词；索引 1 起依次对应的单词为（this, cat, sat, on, the, mat, dog, ate, …）。

各列的列名即 token_index 中为每个单词分配的整数索引。

2.利用Keras实现one-hot编码（keras内置函数）

from keras_preprocessing.text import Tokenizer

samples = ['this cat sat on the mat', "this dog ate my homework"]

# Create a tokenizer configured to consider only the 1000 most common words.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)  # build the word index

# Turn each string into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# The one-hot binary representation can also be obtained directly.
# The tokenizer supports vectorization modes other than one-hot as well.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

word_index = tokenizer.word_index  # recover the word index that was computed

print("sequences\n", sequences)
print("word_index\n", word_index)
print("one_hot_results[0]\n", one_hot_results[0])
print(one_hot_results)