word2vec 不适合处理这类短文本(这里指的是汽车用户情感分析这样的短文本)。
释怀的情绪ヽ 2018/10/17 21:18:37
想问个问题/小纠结:用 gensim word2vec 获得词向量后,怎么作为 lgb 的训练数据输入呀
林有夕 2018/10/17 21:23:03
```python
class cate_with_embedding(BaseEstimator, TransformerMixin):
    def __init__(self, min_df=0.002, max_features=10000):
        self.min_df = min_df
        self.max_features = max_features

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        print(X.shape, '标签编码,把类别根据分布数。低于一定数字的就hash,embedding')

        def base_word2vec(x, model, size):
            vec = np.zeros(size)
            x = [item for item in x if model.wv.__contains__(item)]
            for item in x:
                vec += model.wv[item]
            if len(x) == 0:
                return vec
            else:
                return vec / len(x)

        for i in ['kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdAction', 'appIdInstall']:
            X[i].fillna('nan', inplace=True)
            X[i] = X[i].apply(lambda x: str(x).split(','))
            model = Word2Vec(X[i], size=32, workers=30, min_count=100)
            data_vec = []
            for row in X[i]:
                data_vec.append(base_word2vec(row, model, size=32))
            column_names = []
            for j in range(32):
                column_names.append(str(i) + '_' + str(j))
            data_vec = pd.DataFrame(data_vec, columns=column_names)
            X = pd.concat([X, data_vec], axis=1)
            del X[i]
        return X
```

仅供参考一下

释怀的情绪ヽ 2018/10/17 21:25:33

好滴谢谢大佬

#### 垃圾邮件分类里的使用

https://github.com/wandouqiang/RubbishMessage/blob/master/src/IG.py

#### 达观杯代码里的使用

[训练代码](https://github.com/MLjian/TextClassificationImplement/blob/master/dl/pad/word2vec/train_word2vec.py)

调用代码如下:
```python
#=======================================================================================================================
# 2 定义模型/loss/优化器
#=======================================================================================================================
f_vectors = open(opt.word_vector_path, 'rb')
emb_vectors = torch.Tensor(pickle.load(f_vectors))
model = LSTMsum(emb_weights=emb_vectors, emb_freeze=opt.emb_freeze, input_size=opt.input_size, hidden_size=opt.hidden_size, num_layers=opt.num_layers, l1_size=opt.l1_size, l2_size=opt.l2_size, num_classes=opt.num_classes, bidir=opt.bidir, lstm_dropout=opt.lstm_dropout)
if opt.use_gpu:
    model.cuda()
criterion = nn.CrossEntropyLoss()
```