Commit b87761f

Add pretrain processing into dygraph bert model, test=release/1.8 (#4718)
1 parent e4ad047 commit b87761f

File tree

4 files changed: +424, -8 lines changed


dygraph/bert/model/bert.py

Lines changed: 8 additions & 8 deletions
@@ -230,37 +230,37 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask,
 
         enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
                                                      sentence_ids, input_mask)
+
         reshaped_emb_out = fluid.layers.reshape(
             x=enc_output, shape=[-1, self._emb_size])
 
         mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
-
         mask_trans_feat = self.pooled_fc(mask_feat)
-        mask_trans_feat = self.pre_process_layer(None, mask_trans_feat, "n",
-                                                 self._prepostprocess_dropout)
+        mask_trans_feat = self.pre_process_layer(mask_trans_feat)
 
         if self._weight_sharing:
             fc_out = fluid.layers.matmul(
                 x=mask_trans_feat,
-                y=self.bert_layer._src_emb._w,
+                y=self.bert_layer._src_emb.weight,
                 transpose_y=True)
             fc_out += self.fc_create_params
         else:
             fc_out = self.out_fc(mask_trans_feat)
 
-        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
-            logits=fc_out, label=mask_label)
+        mask_lm_loss, mask_lm_softmax = fluid.layers.softmax_with_cross_entropy(
+            logits=fc_out, label=mask_label, return_softmax=True)
         mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
 
         next_sent_fc_out = self.next_sent_fc(next_sent_feat)
 
         next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
             logits=next_sent_fc_out, label=labels, return_softmax=True)
 
+        lm_acc = fluid.layers.accuracy(input=mask_lm_softmax, label=mask_label)
+
         next_sent_acc = fluid.layers.accuracy(
             input=next_sent_softmax, label=labels)
-
         mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
 
         loss = mean_next_sent_loss + mean_mask_lm_loss
-        return next_sent_acc, mean_mask_lm_loss, loss
+        return lm_acc, next_sent_acc, mean_mask_lm_loss, loss
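
The substance of the bert.py change: the masked-LM head now requests the softmax output from softmax_with_cross_entropy (return_softmax=True) and feeds it to fluid.layers.accuracy, so pretraining reports masked-LM accuracy (lm_acc) alongside next-sentence accuracy, and forward() returns four values instead of three. Below is a minimal, self-contained sketch of just that metric tail in the Paddle 1.8 fluid dygraph API, with toy shapes standing in for the real [num_masked_tokens, vocab_size] tensors:

import numpy as np
import paddle.fluid as fluid

# Toy stand-ins for the masked-LM head: 6 masked positions, a 10-word vocab.
with fluid.dygraph.guard():
    fc_out = fluid.dygraph.to_variable(
        np.random.rand(6, 10).astype("float32"))           # logits over vocab
    mask_label = fluid.dygraph.to_variable(
        np.random.randint(0, 10, (6, 1)).astype("int64"))  # gold token ids

    # return_softmax=True yields the probabilities alongside the loss...
    mask_lm_loss, mask_lm_softmax = fluid.layers.softmax_with_cross_entropy(
        logits=fc_out, label=mask_label, return_softmax=True)
    # ...which is exactly what fluid.layers.accuracy needs for top-1 accuracy.
    lm_acc = fluid.layers.accuracy(input=mask_lm_softmax, label=mask_label)
    print(float(fluid.layers.mean(mask_lm_loss).numpy()), float(lm_acc.numpy()))

The other two edits are dygraph API cleanups: the shared embedding table is read through the public .weight attribute instead of the private _w, and pre_process_layer is now called with just the input tensor.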

dygraph/bert/run_train_multi_gpu.sh

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# pretrain config
+SAVE_STEPS=10000
+BATCH_SIZE=4096
+LR_RATE=1e-4
+WEIGHT_DECAY=0.01
+MAX_LEN=512
+TRAIN_DATA_DIR=data/train
+VALIDATION_DATA_DIR=data/validation
+CONFIG_PATH=data/demo_config/bert_config.json
+VOCAB_PATH=data/demo_config/vocab.txt
+# Change your train arguments:
+GPU_TO_USE=0,1
+# start pretrain
+python -m paddle.distributed.launch --selected_gpus=$GPU_TO_USE --log_dir ./pretrain_log ./train.py ${is_distributed}\
+       --use_cuda true\
+       --use_data_parallel true\
+       --weight_sharing true\
+       --batch_size ${BATCH_SIZE} \
+       --data_dir ${TRAIN_DATA_DIR} \
+       --validation_set_dir ${VALIDATION_DATA_DIR} \
+       --bert_config_path ${CONFIG_PATH} \
+       --vocab_path ${VOCAB_PATH} \
+       --generate_neg_sample true\
+       --checkpoints ./output \
+       --save_steps ${SAVE_STEPS} \
+       --learning_rate ${LR_RATE} \
+       --weight_decay ${WEIGHT_DECAY:-0} \
+       --max_seq_len ${MAX_LEN} \
+       --skip_steps 20 \
+       --validation_steps 1000 \
+       --num_iteration_per_drop_scope 10 \
+       --use_fp16 false \
+       --verbose true
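
This script launches one trainer process per GPU in GPU_TO_USE via paddle.distributed.launch and passes --use_data_parallel true to train.py. train.py itself is not shown in this commit view; the sketch below is only the standard Paddle 1.x dygraph data-parallel pattern that flag conventionally maps to (prepare_context, DataParallel, scale_loss, apply_collective_grads), with a trivial Linear layer standing in for the BERT model:

import numpy as np
import paddle.fluid as fluid

# Run under the launcher, e.g.:
#   python -m paddle.distributed.launch --selected_gpus=0,1 sketch.py
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
with fluid.dygraph.guard(place):
    strategy = fluid.dygraph.parallel.prepare_context()  # reads launcher env vars
    model = fluid.dygraph.Linear(16, 4)                  # stand-in for the BERT model
    model = fluid.dygraph.parallel.DataParallel(model, strategy)
    optimizer = fluid.optimizer.Adam(
        learning_rate=1e-4, parameter_list=model.parameters())

    x = fluid.dygraph.to_variable(np.random.rand(8, 16).astype("float32"))
    loss = fluid.layers.reduce_mean(model(x))
    loss = model.scale_loss(loss)    # rescale so per-trainer losses average out
    loss.backward()
    model.apply_collective_grads()   # all-reduce gradients across trainers
    optimizer.minimize(loss)
    model.clear_gradients()

Note that ${is_distributed} is interpolated into the command line but never defined in the script, so it expands to nothing unless exported by the caller.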

dygraph/bert/run_train_single_gpu.sh

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# pretrain config
+SAVE_STEPS=100
+BATCH_SIZE=4096
+LR_RATE=1e-4
+WEIGHT_DECAY=0.01
+MAX_LEN=512
+TRAIN_DATA_DIR=data/train
+VALIDATION_DATA_DIR=data/validation
+CONFIG_PATH=data/demo_config/bert_config.json
+VOCAB_PATH=data/demo_config/vocab.txt
+# Change your train arguments:
+# start pretrain
+python -u ./train.py --use_cuda true\
+       --use_data_parallel false\
+       --weight_sharing true\
+       --batch_size ${BATCH_SIZE} \
+       --data_dir ${TRAIN_DATA_DIR} \
+       --validation_set_dir ${VALIDATION_DATA_DIR} \
+       --bert_config_path ${CONFIG_PATH} \
+       --vocab_path ${VOCAB_PATH} \
+       --generate_neg_sample true\
+       --checkpoints ./output \
+       --save_steps ${SAVE_STEPS} \
+       --learning_rate ${LR_RATE} \
+       --weight_decay ${WEIGHT_DECAY:-0} \
+       --max_seq_len ${MAX_LEN} \
+       --skip_steps 20 \
+       --validation_steps 100 \
+       --num_iteration_per_drop_scope 10 \
+       --use_fp16 false \
+       --verbose true
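
Apart from dropping the launcher, this script differs from the multi-GPU one only in --use_data_parallel false and a more frequent checkpoint/validation cadence (SAVE_STEPS=100, --validation_steps 100). Both scripts drive train.py, which this page does not show; as a rough, hypothetical sketch, a dygraph pretrain step consuming the four values now returned by forward() might look like the following (the function, batch-field names, and argument order after input_mask are illustrative assumptions, not the repo's code):

def pretrain_step(model, optimizer, batch):
    # `model` is assumed to be the pretraining layer patched above; `batch`
    # an already-converted dict of dygraph variables from the data reader.
    lm_acc, next_sent_acc, mean_mask_lm_loss, loss = model(
        batch["src_ids"], batch["position_ids"], batch["sentence_ids"],
        batch["input_mask"], batch["mask_label"], batch["mask_pos"],
        batch["labels"])
    loss.backward()
    optimizer.minimize(loss)
    model.clear_gradients()
    return (float(loss.numpy()), float(mean_mask_lm_loss.numpy()),
            float(lm_acc.numpy()), float(next_sent_acc.numpy()))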
