deepsequence: the most complete deep learning sequence labeling tool

Introduction

deepsequence is a sequence labeling tool I wrote in the course of my work and study. It is built on Keras and supports the classic sequence labeling tasks such as word segmentation and NER (named entity recognition). Why do I call it the most complete?

  • Supports the two strongest sequence labeling architectures: BERT+CRF and BILSTM+CRF
  • Supports character-level features via a char CNN or a char LSTM
  • Supports POS-tag features as additional input
  • Supports domain knowledge as input, e.g. a company dictionary for NER
  • Supports the BIO, BIOLU, and BIOES tagging schemes (a small conversion sketch follows this list)
  • Supports quick customization of model details and hyperparameters through a configuration file
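
To make the tagging-scheme support concrete, here is a minimal sketch (not part of the deepsequence API; the function name is just for illustration) that converts a BIO sequence to BIOES; BIOLU works the same way with U/L in place of S/E:

def bio_to_bioes(tags):
    """Illustration only: map a BIO tag sequence to BIOES."""
    bioes = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            bioes.append(tag)
            continue
        prefix, label = tag.split('-', 1)
        next_tag = tags[i + 1] if i + 1 < len(tags) else 'O'
        continued = next_tag == 'I-' + label
        if prefix == 'B':
            bioes.append(('B-' if continued else 'S-') + label)
        else:  # prefix == 'I'
            bioes.append(('I-' if continued else 'E-') + label)
    return bioes

print(bio_to_bioes(['B-ORG', 'I-ORG', 'O', 'B-PER']))
# ['B-ORG', 'E-ORG', 'O', 'S-PER']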

Architecture walkthrough

BILSTM + CRF

def build_bilstm(self, verbose=True):
    """
    build model architecture from parameters
    """
    word_ids = Input(batch_shape=(None, None), dtype='int32', name='word_input')
    inputs = [word_ids]

    if self._params.use_pretrain_embedding:
        if verbose: logging.info("initializing word embedding with pretrained embeddings")
        if self._params.word_embedding_dim == 100:
            glove_file = self._params.data_dir + '/glove.6B.100d.txt'
        elif self._params.word_embedding_dim == 300:
            glove_file = self._params.data_dir + '/glove.42B.300d.txt'
        else:
            logging.error("we only support glove embeddings with dimension 100 or 300")
            raise ValueError("unmatched word dimension, we only support glove embeddings with dimension 100 or 300")
        glove_embedding_index = load_glove(glove_file, self._params.word_embedding_dim)
        word_vocab = self.input_processor.word_vocab.vocab
        glove_embeddings_matrix = np.zeros([len(word_vocab), self._params.word_embedding_dim])
        for word, i in word_vocab.items():
            vector = glove_embedding_index.get(word)
            if vector is not None:
                glove_embeddings_matrix[i] = vector

        word_embeddings = Embedding(input_dim=glove_embeddings_matrix.shape[0],
                                    output_dim=glove_embeddings_matrix.shape[1],
                                    trainable=False,
                                    mask_zero=True,
                                    weights=[glove_embeddings_matrix],
                                    name='word_embedding')(word_ids)
    else:
        word_embeddings = Embedding(input_dim=self._params.word_vocab_size,
                                    output_dim=self._params.word_embedding_dim,
                                    mask_zero=True,
                                    name='word_embedding')(word_ids)

    input_embeddings = [word_embeddings]
    if self._params.use_char:
        char_ids = Input(batch_shape=(None, None, None), dtype='int32', name='char_input')
        inputs.append(char_ids)
        if self._params.char_feature == "lstm":
            char_embeddings = Embedding(input_dim=self._params.char_vocab_size,
                                        output_dim=self._params.char_embedding_dim,
                                        mask_zero=True,
                                        name='char_embedding')(char_ids)
            if verbose: logging.info("using character level lstm features")
            char_feas = TimeDistributed(Bidirectional(LSTM(self._params.char_lstm_size)), name="char_lstm")(char_embeddings)
        elif self._params.char_feature == "cnn":
            # CNN layers do not support masking
            char_embeddings = Embedding(input_dim=self._params.char_vocab_size,
                                        output_dim=self._params.char_embedding_dim,
                                        name='char_embedding')(char_ids)
            if verbose: logging.info("using character level cnn features")
            char_feas = char_cnn_encode(char_embeddings, self._params.n_gram_filter_sizes, self._params.n_gram_filter_nums)
        else:
            raise ValueError('char feature must be lstm or cnn')

        input_embeddings.append(char_feas)

    if self._params.use_pos:
        if verbose: logging.info("use pos tag features")
        pos_ids = Input(batch_shape=(None, None), dtype='int32', name='pos_input')
        inputs.append(pos_ids)

        pos_embeddings = Embedding(input_dim=self._params.pos_vocab_size,
                                   output_dim=self._params.pos_embedding_dim,
                                   mask_zero=True,
                                   name='pos_embedding')(pos_ids)
        input_embeddings.append(pos_embeddings)

    if self._params.use_dict:
        if verbose: logging.info("use user dict features")
        dict_ids = Input(batch_shape=(None, None), dtype='int32', name='dict_input')
        inputs.append(dict_ids)

        dict_embeddings = Embedding(input_dim=self._params.dict_vocab_size,
                                    output_dim=self._params.dict_embedding_dim,
                                    mask_zero=True,
                                    name='dict_embedding')(dict_ids)
        input_embeddings.append(dict_embeddings)

    input_embedding = Concatenate(name="input_embedding")(input_embeddings) if len(input_embeddings) > 1 else input_embeddings[0]
    input_embedding_ln = LayerNormalization(name='input_layer_normalization')(input_embedding)
    # input_embedding_bn = BatchNormalization()(input_embedding_ln)
    input_embedding_drop = Dropout(self._params.dropout, name="input_embedding_dropout")(input_embedding_ln)

    z = Bidirectional(LSTM(units=self._params.main_lstm_size, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
                      name="main_bilstm")(input_embedding_drop)
    z = Dense(self._params.fc_dim, activation='tanh', name="fc_dense")(z)

    if self._params.use_crf:
        if verbose: logging.info('use crf decode layer')
        crf = CRF(self._params.num_labels, sparse_target=False,
                  learn_mode='marginal', test_mode='marginal', name='crf_out')
        loss = crf.loss_function
        pred = crf(z)
    else:
        loss = 'categorical_crossentropy'
        pred = Dense(self._params.num_labels, activation='softmax', name='softmax_out')(z)

    model = Model(inputs=inputs, outputs=pred)
    model.summary(print_fn=lambda x: logging.info(x + '\n'))
    model.compile(loss=loss, optimizer=self._params.optimizer)

    self.model = model
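
The helper char_cnn_encode is referenced above but not shown. A minimal sketch of what such a helper could look like, assuming it runs one Conv1D per n-gram width over each word's characters and max-pools the result (the actual implementation in the repository may differ):

from keras.layers import Conv1D, GlobalMaxPooling1D, Concatenate, TimeDistributed

def char_cnn_encode(char_embeddings, filter_sizes, filter_nums):
    # Sketch under assumptions: char_embeddings has shape
    # (batch, sent_len, word_len, char_dim); apply a per-word 1D convolution
    # for each configured width, max-pool over characters, then concatenate.
    pooled = []
    for size, num in zip(filter_sizes, filter_nums):
        conv = TimeDistributed(Conv1D(filters=num, kernel_size=size,
                                      padding='same', activation='relu'))(char_embeddings)
        pooled.append(TimeDistributed(GlobalMaxPooling1D())(conv))
    return Concatenate()(pooled) if len(pooled) > 1 else pooled[0]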

BERT + CRF

def build_bert(self, verbose=True):

    bert_word_ids = Input(batch_shape=(None, None), dtype="int32", name="bert_word_input")
    bert_mask_ids = Input(batch_shape=(None, None), dtype="int32", name='bert_mask_input')
    bert_segment_ids = Input(batch_shape=(None, None), dtype="int32", name="bert_segment_input")

    inputs = [bert_word_ids, bert_mask_ids, bert_segment_ids]

    bert_out = BertLayer(n_fine_tune_layers=self._params.n_fine_tune_layers,
                         bert_path=self._params.bert_path,
                         name="bert_layer")([bert_word_ids, bert_mask_ids, bert_segment_ids])

    features = bert_out

    if self._params.use_dict:
        if verbose: logging.info("use user dict features")
        dict_ids = Input(batch_shape=(None, None), dtype='int32', name='dict_input')
        inputs.append(dict_ids)

        dict_embeddings = Embedding(input_dim=self._params.dict_vocab_size,
                                    output_dim=self._params.dict_embedding_dim,
                                    mask_zero=True,
                                    name='dict_embedding')(dict_ids)

        features = Concatenate(name="bert_and_dict_features")([features, dict_embeddings])

    z = Dense(self._params.fc_dim, activation='relu', name="fc_dense")(features)

    if self._params.use_crf:
        if verbose: logging.info('use crf decode layer')
        crf = CRF(self._params.num_labels, sparse_target=False,
                  learn_mode='marginal', test_mode='marginal', name='crf_out')
        loss = crf.loss_function
        pred = crf(z)
    else:
        loss = 'categorical_crossentropy'
        pred = Dense(self._params.num_labels, activation='softmax', name='softmax_out')(z)

    model = Model(inputs=inputs, outputs=pred)
    model.summary(print_fn=lambda x: logging.info(x + '\n'))

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=1e-5,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    model.compile(loss=loss, optimizer=optimizer)

    self.model = model
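
The three inputs follow the standard BERT convention: WordPiece token ids, an attention mask that is 1 for real tokens and 0 for padding, and segment ids (all zeros for single-sentence tagging). A hypothetical sketch of how they could be built for one already-tokenized sentence, assuming a vocab lookup dictionary token_to_id and a fixed max_len (the library's own input processor handles this in practice):

def build_bert_inputs(tokens, token_to_id, max_len):
    # Hypothetical helper: wrap the tokens in [CLS] ... [SEP], map them to ids,
    # build the attention mask and segment ids, then pad everything to max_len.
    pieces = ['[CLS]'] + tokens[:max_len - 2] + ['[SEP]']
    word_ids = [token_to_id.get(t, token_to_id['[UNK]']) for t in pieces]
    mask_ids = [1] * len(word_ids)
    segment_ids = [0] * len(word_ids)
    pad = max_len - len(word_ids)
    return (word_ids + [0] * pad,
            mask_ids + [0] * pad,
            segment_ids + [0] * pad)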

Example

Training an NER model:

import argparse
import logging

from deepsequence.dataset import load_conll
from deepsequence.model import SequenceModel
from deepsequence.config import Params
from deepsequence.utils import set_logger


def main(params):

    train_file = params.data_dir + '/train.txt'
    valid_file = params.data_dir + '/valid.txt'
    train_data = load_conll(train_file, params)
    valid_data = load_conll(valid_file, params)

    model = SequenceModel(params)

    model_file = params.data_dir + '/model/model.h5'
    if params.continue_previous_training:
        logging.info("restore model from local")
        model.restore(model_file)
    else:
        logging.info("model initializing...")
        model.build(params)

    model.fit(train_data, valid_data, verbose=True)
    model.evaluate(valid_data)
    logging.info("model saved to {}".format(model_file))
    model.save(model_file)

    tf_saved_model_dir = params.data_dir + '/model/tf_saved_model'
    model.export_sm(tf_saved_model_dir)


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='''train deep sequence model''')
    parser.add_argument('--config', required=True)

    set_logger('train.log')

    args = parser.parse_args()

    logging.info("parsed config file path: {}".format(args.config))

    params = Params(args.config)
    logging.info("parameters: {}".format(params.dict))

    try:
        main(params)
    except Exception as e:
        logging.error('run failed', exc_info=True)
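
Assuming the script above is saved as train.py and the configuration below as config.json (both file names are just for illustration), training is started with:

python train.py --config config.json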

Configuration file:

{
    "model": "bilstm",

    "vocab_pad": "PAD",
    "vocab_unk": "UNK",
    "max_sent_len": 1000,
    "max_word_len": 24,

    "word_embedding_dim": 100,
    "use_pretrain_embedding": true,

    "use_char": true,
    "char_embedding_dim": 30,
    "char_feature": "cnn",
    "n_gram_filter_sizes": [1, 2, 3, 4, 5],
    "n_gram_filter_nums": [5, 10, 20, 30, 35],
    "char_lstm_size": 50,

    "use_pos": true,
    "conll_pos_index": 1,
    "pos_embedding_dim": 30,

    "use_dict": true,
    "dict_embedding_dim": 10,

    "dropout": 0.5,

    "main_lstm_size": 100,
    "fc_dim": 100,
    "use_crf": true,
    "optimizer": "adam",

    "batch_size": 50,
    "max_train_epoch": 80,

    "early_stop": 80,

    "continue_previous_training": false,
    "data_dir": "/xxxx/deepsequence/examples/data"
}
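
load_conll reads train.txt and valid.txt in CoNLL column format. A hypothetical snippet, assuming the token in the first column, the POS tag in column 1 (matching conll_pos_index above), the label in the last column, and a blank line between sentences (the exact column layout depends on your data):

EU NNP B-ORG
rejects VBZ O
German JJ B-MISC
call NN O
. . O

Peter NNP B-PER
Blackburn NNP I-PER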

More details are available on the project's GitHub page:

https://github.com/yangdc1992/deepsequence