@@ -93,6 +93,7 @@ def build_vocab(nested_list):
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*nested_list))
+    logging.getLogger().info("build_vocab: word_counts=%d" % (len(word_counts)))

    # Mapping from index to label
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
@@ -114,6 +115,7 @@ def build_iters(data_dir, max_records, train_fraction, batch_size, buckets=None):
    :param buckets: size of each bucket in the iterators
    :return: train_iter, val_iter, word_to_index, char_to_index, entity_to_index
    """
+
    # Read in data as numpy array
    df = pd.read_pickle(os.path.join(data_dir, "ner_data.pkl"))[:max_records]

@@ -135,12 +137,14 @@ def build_iters(data_dir, max_records, train_fraction, batch_size, buckets=None):

    # Split into training and testing data
    idx = int(len(indexed_tokens)*train_fraction)
+    logging.info("Preparing train/test datasets splitting at idx %d on total %d sentences using a batch size of %d", idx, len(indexed_tokens), batch_size)
    X_token_train, X_char_train, Y_train = indexed_tokens[:idx], indexed_chars[:idx], indexed_entities[:idx]
    X_token_test, X_char_test, Y_test = indexed_tokens[idx:], indexed_chars[idx:], indexed_entities[idx:]

    # build iterators to feed batches to network
    train_iter = iterators.BucketNerIter(sentences=X_token_train, characters=X_char_train, label=Y_train,
                                         max_token_chars=5, batch_size=batch_size, buckets=buckets)
+    logging.info("Creating the val_iter using %d sentences", len(X_token_test))
    val_iter = iterators.BucketNerIter(sentences=X_token_test, characters=X_char_test, label=Y_test,
                                       max_token_chars=train_iter.max_token_chars, batch_size=batch_size, buckets=train_iter.buckets)
    return train_iter, val_iter, word_to_index, char_to_index, entity_to_index
@@ -205,6 +209,8 @@ def sym_gen(seq_len):
def train(train_iter, val_iter):
    import metrics
    devs = mx.cpu() if args.gpus is None or args.gpus == '' else [mx.gpu(int(i)) for i in args.gpus.split(',')]
+    logging.info("train on device %s using optimizer %s at learning rate %f for %d epochs using %d records: lstm_state_size=%d ...",
+                 devs, args.optimizer, args.lr, args.num_epochs, args.max_records, args.lstm_state_size)
    module = mx.mod.BucketingModule(sym_gen, train_iter.default_bucket_key, context=devs)
    module.fit(train_data=train_iter,
               eval_data=val_iter,
@@ -225,6 +231,8 @@ def train(train_iter, val_iter):
    train_iter, val_iter, word_to_index, char_to_index, entity_to_index = build_iters(args.data_dir, args.max_records,
                                                                                      args.train_fraction, args.batch_size, args.buckets)

+    logging.info("validation iterator: %s", val_iter)
+
    # Define the recurrent layer
    bi_cell = mx.rnn.SequentialRNNCell()
    for layer_num in range(args.lstm_layers):
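
Note on the logging calls added in this commit: logging.info(...) records are suppressed under the root logger's default WARNING threshold, so they only appear if the script configures logging at startup. A minimal sketch of such a setup, assuming the script has no other logging configuration (the format string here is illustrative, not taken from this diff):

import logging

# Route INFO-level records (like the calls added above) to the console.
# Without a configuration like this, the root logger's default WARNING
# level silently drops logging.info(...) output.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(levelname)s %(message)s")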