From 86d989ed0862c7700d8fd25aa8d2e954aa9618c9 Mon Sep 17 00:00:00 2001 From: nipeng Date: Wed, 16 Oct 2019 09:34:32 +0800 Subject: [PATCH 01/23] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E6=96=87=E4=BB=B6=EF=BC=8C=E8=B7=91=E8=87=AA=E6=9C=89=E6=95=B0?= =?UTF-8?q?=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- egs/aishell/asr1/conf/fbank.conf | 2 +- egs/aishell/asr1/conf/pitch.conf | 2 +- egs/aishell/asr1/infer.sh | 255 ++++++++++++++++++ egs/aishell/asr1/local/aishell_data_prep.sh | 2 +- .../asr1/local/aishell_data_prep_infer.sh | 66 +++++ egs/aishell/asr1/run.sh | 25 +- .../tts1/conf/tuning/train_fastspeech.v1.yaml | 2 +- .../tuning/train_pytorch_transformer.v1.yaml | 2 +- egs/ljspeech/tts1/local/text/cleaners.py | 4 +- egs/ljspeech/tts1/run.sh | 24 +- tools/Makefile | 10 +- 11 files changed, 361 insertions(+), 33 deletions(-) create mode 100755 egs/aishell/asr1/infer.sh create mode 100755 egs/aishell/asr1/local/aishell_data_prep_infer.sh diff --git a/egs/aishell/asr1/conf/fbank.conf b/egs/aishell/asr1/conf/fbank.conf index 82ac7bd0dbc..1ad20614eef 100644 --- a/egs/aishell/asr1/conf/fbank.conf +++ b/egs/aishell/asr1/conf/fbank.conf @@ -1,2 +1,2 @@ ---sample-frequency=16000 +--sample-frequency=8000 --num-mel-bins=80 diff --git a/egs/aishell/asr1/conf/pitch.conf b/egs/aishell/asr1/conf/pitch.conf index e959a19d5b8..926bcfca92a 100644 --- a/egs/aishell/asr1/conf/pitch.conf +++ b/egs/aishell/asr1/conf/pitch.conf @@ -1 +1 @@ ---sample-frequency=16000 +--sample-frequency=8000 diff --git a/egs/aishell/asr1/infer.sh b/egs/aishell/asr1/infer.sh new file mode 100755 index 00000000000..b9cbb023099 --- /dev/null +++ b/egs/aishell/asr1/infer.sh @@ -0,0 +1,255 @@ +#!/bin/bash + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. ./path.sh || exit 1; +. ./cmd.sh || exit 1; + +# general configuration +backend=pytorch +stage=0 # start from 0 if you need to start from data preparation +stop_stage=100 +ngpu=0 # number of gpus ("0" uses cpu, otherwise use gpu) +debugmode=1 +dumpdir=dump # directory to dump full features +N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches. +verbose=0 # verbose option +resume= # Resume the training from snapshot + +# feature configuration +do_delta=false + +train_config=conf/train.yaml +lm_config=conf/lm.yaml +decode_config=conf/decode.yaml + +# rnnlm related +lm_resume= # specify a snapshot file to resume LM training +lmtag= # tag for managing LMs + +# decoding parameter +recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best' +n_average=10 + +# data +#data=/export/a05/xna/data +#data_url=www.openslr.org/resources/33 + +data=/data/nipeng/2019-10-07-aishell + +# exp tag +tag="" # tag for managing experiments. + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +train_set=train_sp +train_dev=dev +#recog_set="dev test" +recog_set="infer" + +# if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then +# echo "stage -1: Data Download" +# local/download_and_untar.sh ${data} ${data_url} data_aishell +# local/download_and_untar.sh ${data} ${data_url} resource_aishell +# fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + ### Task dependent. 
You have to make data the following preparation part by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 0: Data preparation" + local/aishell_data_prep_infer.sh ${data}/data_aishell/wav ${data}/data_aishell/transcript + # remove space in text + for x in infer; do + cp data/${x}/text data/${x}/text.org + paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \ + > data/${x}/text + rm data/${x}/text.org + done +fi + +feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir} +feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir} +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + ### Task dependent. You have to design training and dev sets by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 1: Feature Generation" + fbankdir=fbank + # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame + # steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + # data/train exp/make_fbank/train ${fbankdir} + # utils/fix_data_dir.sh data/train + # steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 10 --write_utt2num_frames true \ + # data/dev exp/make_fbank/dev ${fbankdir} + # utils/fix_data_dir.sh data/dev + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 10 --write_utt2num_frames true \ + data/infer exp/make_fbank/infer ${fbankdir} + utils/fix_data_dir.sh data/infer + + # # speed-perturbed + # utils/perturb_data_dir_speed.sh 0.9 data/train data/temp1 + # utils/perturb_data_dir_speed.sh 1.0 data/train data/temp2 + # utils/perturb_data_dir_speed.sh 1.1 data/train data/temp3 + # utils/combine_data.sh --extra-files utt2uniq data/${train_set} data/temp1 data/temp2 data/temp3 + # rm -r data/temp1 data/temp2 data/temp3 + # steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + # data/${train_set} exp/make_fbank/${train_set} ${fbankdir} + # utils/fix_data_dir.sh data/${train_set} + + # # compute global CMVN + # compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark + + # # dump features for training + # split_dir=$(echo $PWD | awk -F "/" '{print $NF "/" $(NF-1)}') + # if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then + # utils/create_split_dir.pl \ + # /export/a{11,12,13,14}/${USER}/espnet-data/egs/${split_dir}/dump/${train_set}/delta${do_delta}/storage \ + # ${feat_tr_dir}/storage + # fi + # if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then + # utils/create_split_dir.pl \ + # /export/a{11,12,13,14}/${USER}/espnet-data/egs/${split_dir}/dump/${train_dev}/delta${do_delta}/storage \ + # ${feat_dt_dir}/storage + # fi + # dump.sh --cmd "$train_cmd" --nj 32 --do_delta ${do_delta} \ + # data/${train_set}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/train ${feat_tr_dir} + for rtask in ${recog_set}; do + feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}; mkdir -p ${feat_recog_dir} + dump.sh --cmd "$train_cmd" --nj 10 --do_delta ${do_delta} \ + data/${rtask}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/recog/${rtask} \ + ${feat_recog_dir} + done +fi + +dict=data/lang_1char/${train_set}_units.txt +echo "dictionary: ${dict}" +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + ### Task dependent. You have to check non-linguistic symbols used in the corpus. 
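+    # Inference reuses the dictionary written by the training recipe; only the
+    # recognition JSON is regenerated here. A minimal guard, assuming the
+    # training run has already produced ${dict} (message text is illustrative):
+    if [ ! -f ${dict} ]; then
+        echo "Error: ${dict} not found. Run the training recipe (run.sh) up to stage 2 first."
+        exit 1
+    fi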
+ echo "stage 2: Dictionary and Json Data Preparation" + mkdir -p data/lang_1char/ + + # echo "make a dictionary" + # echo " 1" > ${dict} # must be 1, 0 will be used for "blank" in CTC + # text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \ + # | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict} + # wc -l ${dict} + + # echo "make json files" + # data2json.sh --feat ${feat_tr_dir}/feats.scp \ + # data/${train_set} ${dict} > ${feat_tr_dir}/data.json + for rtask in ${recog_set}; do + feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta} + data2json.sh --feat ${feat_recog_dir}/feats.scp \ + data/${rtask} ${dict} > ${feat_recog_dir}/data.json + done +fi + +# you can skip this and remove --rnnlm option in the recognition (stage 5) +if [ -z ${lmtag} ]; then + lmtag=$(basename ${lm_config%.*}) +fi +lmexpname=train_rnnlm_${backend}_${lmtag} +lmexpdir=exp/${lmexpname} +# mkdir -p ${lmexpdir} + +# if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then +# echo "stage 3: LM Preparation" +# lmdatadir=data/local/lm_train +# mkdir -p ${lmdatadir} +# text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " \ +# > ${lmdatadir}/train.txt +# text2token.py -s 1 -n 1 data/${train_dev}/text | cut -f 2- -d" " \ +# > ${lmdatadir}/valid.txt + +# ${cuda_cmd} --gpu ${ngpu} ${lmexpdir}/train.log \ +# lm_train.py \ +# --config ${lm_config} \ +# --ngpu ${ngpu} \ +# --backend ${backend} \ +# --verbose 1 \ +# --outdir ${lmexpdir} \ +# --tensorboard-dir tensorboard/${lmexpname} \ +# --train-label ${lmdatadir}/train.txt \ +# --valid-label ${lmdatadir}/valid.txt \ +# --resume ${lm_resume} \ +# --dict ${dict} +# fi + +if [ -z ${tag} ]; then + expname=${train_set}_${backend}_$(basename ${train_config%.*}) + if ${do_delta}; then + expname=${expname}_delta + fi +else + expname=${train_set}_${backend}_${tag} +fi +expdir=exp/${expname} +# mkdir -p ${expdir} + +# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then +# echo "stage 4: Network Training" +# ${cuda_cmd} --gpu ${ngpu} ${expdir}/train.log \ +# asr_train.py \ +# --config ${train_config} \ +# --ngpu ${ngpu} \ +# --backend ${backend} \ +# --outdir ${expdir}/results \ +# --tensorboard-dir tensorboard/${expname} \ +# --debugmode ${debugmode} \ +# --dict ${dict} \ +# --debugdir ${expdir} \ +# --minibatches ${N} \ +# --verbose ${verbose} \ +# --resume ${resume} \ +# --train-json ${feat_tr_dir}/data.json \ +# --valid-json ${feat_dt_dir}/data.json +# fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + echo "stage 5: Decoding" + nj=40 + if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]]; then + recog_model=model.last${n_average}.avg.best + # average_checkpoints.py --backend ${backend} \ + # --snapshots ${expdir}/results/snapshot.ep.* \ + # --out ${expdir}/results/${recog_model} \ + # --num ${n_average} + fi + pids=() # initialize pids + for rtask in ${recog_set}; do + ( + decode_dir=decode_${rtask}_$(basename ${decode_config%.*})_${lmtag} + feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta} + + # split data + splitjson.py --parts ${nj} ${feat_recog_dir}/data.json + + #### use CPU for decoding + ngpu=0 + + ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \ + asr_recog.py \ + --config ${decode_config} \ + --ngpu ${ngpu} \ + --backend ${backend} \ + --batchsize 0 \ + --recog-json ${feat_recog_dir}/split${nj}utt/data.JOB.json \ + --result-label ${expdir}/${decode_dir}/data.JOB.json \ + --model ${expdir}/results/${recog_model} \ + --rnnlm ${lmexpdir}/rnnlm.model.best + + score_sclite.sh 
${expdir}/${decode_dir} ${dict} + + ) & + pids+=($!) # store background pids + done + i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done + [ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false + echo "Finished" +fi diff --git a/egs/aishell/asr1/local/aishell_data_prep.sh b/egs/aishell/asr1/local/aishell_data_prep.sh index 4747e4f4d82..700afa3dee9 100755 --- a/egs/aishell/asr1/local/aishell_data_prep.sh +++ b/egs/aishell/asr1/local/aishell_data_prep.sh @@ -12,7 +12,7 @@ if [ $# != 2 ]; then fi aishell_audio_dir=$1 -aishell_text=$2/aishell_transcript_v0.8.txt +aishell_text=$2/aishell_transcript_v0.8.txt.0604.relabeled.0923.v2 train_dir=data/local/train dev_dir=data/local/dev diff --git a/egs/aishell/asr1/local/aishell_data_prep_infer.sh b/egs/aishell/asr1/local/aishell_data_prep_infer.sh new file mode 100755 index 00000000000..f4b75d102a7 --- /dev/null +++ b/egs/aishell/asr1/local/aishell_data_prep_infer.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Copyright 2017 Xingyu Na +# Apache 2.0 + +. ./path.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript" + exit 1; +fi + +aishell_audio_dir=$1 +aishell_text=$2/aishell_transcript_v0.8.txt.0604.relabeled.0923.v2 + +#train_dir=data/local/train +#dev_dir=data/local/dev +#test_dir=data/local/test +infer_dir=data/local/infer +tmp_dir=data/local/tmp + +#mkdir -p $train_dir +#mkdir -p $dev_dir +#mkdir -p $test_dir +mkdir -p $infer_dir +mkdir -p $tmp_dir + +# data directory check +if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then + echo "Error: $0 requires two directory arguments" + exit 1; +fi + +# find wav audio file for train, dev and test resp. +find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist +n=`cat $tmp_dir/wav.flist | wc -l` +# [ $n -ne 141925 ] && \ +# echo Warning: expected 141925 data data files, found $n + +grep -i "wav/infer" $tmp_dir/wav.flist > $infer_dir/wav.flist || exit 1; + +rm -r $tmp_dir + +# Transcriptions preparation +for dir in $infer_dir; do + echo Preparing $dir transcriptions + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all + paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all + utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt + awk '{print $1}' $dir/transcripts.txt > $dir/utt.list + utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk + utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp + sort -u $dir/transcripts.txt > $dir/text + utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +done + +mkdir -p data/infer + +for f in spk2utt utt2spk wav.scp text; do + cp $infer_dir/$f data/infer/$f || exit 1; +done + +echo "$0: AISHELL data preparation succeeded" +exit 0; diff --git a/egs/aishell/asr1/run.sh b/egs/aishell/asr1/run.sh index d406715d5d2..45d750b3907 100755 --- a/egs/aishell/asr1/run.sh +++ b/egs/aishell/asr1/run.sh @@ -8,9 +8,10 @@ # general configuration backend=pytorch -stage=0 # start from 0 if you need to start from data preparation -stop_stage=100 -ngpu=1 # number of gpus ("0" uses cpu, otherwise use gpu) +stage=5 # start from 0 if you need to start from data preparation +stop_stage=5 +export CUDA_VISIBLE_DEVICES=0,2,3 +ngpu=3 # number of gpus ("0" uses cpu, otherwise use gpu) debugmode=1 dumpdir=dump # directory to dump full features N=0 # number of 
minibatches to be used (mainly for debugging). "0" uses all minibatches. @@ -33,8 +34,10 @@ recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.bes n_average=10 # data -data=/export/a05/xna/data -data_url=www.openslr.org/resources/33 +#data=/export/a05/xna/data +#data_url=www.openslr.org/resources/33 + +data=/data/chenghengzhe/kf/dataset/fakes/fake_merged_v2 # exp tag tag="" # tag for managing experiments. @@ -51,11 +54,11 @@ train_set=train_sp train_dev=dev recog_set="dev test" -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "stage -1: Data Download" - local/download_and_untar.sh ${data} ${data_url} data_aishell - local/download_and_untar.sh ${data} ${data_url} resource_aishell -fi +# if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then +# echo "stage -1: Data Download" +# local/download_and_untar.sh ${data} ${data_url} data_aishell +# local/download_and_untar.sh ${data} ${data_url} resource_aishell +# fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ### Task dependent. You have to make data the following preparation part by yourself. @@ -210,7 +213,7 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then echo "stage 5: Decoding" - nj=32 + nj=40 if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]]; then recog_model=model.last${n_average}.avg.best average_checkpoints.py --backend ${backend} \ diff --git a/egs/ljspeech/tts1/conf/tuning/train_fastspeech.v1.yaml b/egs/ljspeech/tts1/conf/tuning/train_fastspeech.v1.yaml index f3be22236ab..196449301f6 100644 --- a/egs/ljspeech/tts1/conf/tuning/train_fastspeech.v1.yaml +++ b/egs/ljspeech/tts1/conf/tuning/train_fastspeech.v1.yaml @@ -52,7 +52,7 @@ grad-clip: 1.0 weight-decay: 0.0 patience: 0 epochs: 1000 # 1,000 epochs * 330 batches / 2 accum-grad = 165,000 iters -teacher-model: exp/train_no_dev_pytorch_train_transformer.v1/results/model.last1.avg.best +teacher-model: exp/train_no_dev_pytorch_train_pytorch_transformer.v1/results/model.last1.avg.best # you can download pretrained teacher model from google drive # https://drive.google.com/open?id=1arZAxZOLep-1W5ByQMD1lCX2tEASnw7p diff --git a/egs/ljspeech/tts1/conf/tuning/train_pytorch_transformer.v1.yaml b/egs/ljspeech/tts1/conf/tuning/train_pytorch_transformer.v1.yaml index 7c5512ad253..c7be653b50f 100644 --- a/egs/ljspeech/tts1/conf/tuning/train_pytorch_transformer.v1.yaml +++ b/egs/ljspeech/tts1/conf/tuning/train_pytorch_transformer.v1.yaml @@ -62,7 +62,7 @@ accum-grad: 2 grad-clip: 1.0 weight-decay: 0.0 patience: 0 -epochs: 1000 # 1,000 epochs * 330 batches / 2 accum-grad = 165,000 iters +epochs: 500 # 1,000 epochs * 330 batches / 2 accum-grad = 165,000 iters # other save-interval-epoch: 10 diff --git a/egs/ljspeech/tts1/local/text/cleaners.py b/egs/ljspeech/tts1/local/text/cleaners.py index b99f6eefde1..7fb49245b5f 100644 --- a/egs/ljspeech/tts1/local/text/cleaners.py +++ b/egs/ljspeech/tts1/local/text/cleaners.py @@ -106,8 +106,8 @@ def english_cleaners(text): '''Pipeline for English text, including number and abbreviation expansion.''' text = convert_to_ascii(text) text = lowercase(text) - text = expand_numbers(text) - text = expand_abbreviations(text) + #text = expand_numbers(text) + #text = expand_abbreviations(text) text = expand_symbols(text) text = remove_unnecessary_symbols(text) text = uppercase(text) diff --git a/egs/ljspeech/tts1/run.sh b/egs/ljspeech/tts1/run.sh index eaa2084fe57..5dfd68d6e85 100755 --- a/egs/ljspeech/tts1/run.sh +++ b/egs/ljspeech/tts1/run.sh @@ -8,9 +8,9 @@ # general configuration 
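
The hunk below hard-codes a mid-pipeline resume rather than starting from data
download. These recipes parse --stage and --stop_stage through
utils/parse_options.sh, so the same effect is available per run without editing
the script -- an illustrative invocation, assuming the earlier stages' outputs
already exist under data/ and dump/:

    ./run.sh --stage 4 --ngpu 4
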
backend=pytorch -stage=-1 +stage=4 stop_stage=100 -ngpu=1 # number of gpus ("0" uses cpu, otherwise use gpu) +ngpu=4 # number of gpus ("0" uses cpu, otherwise use gpu) nj=32 # numebr of parallel jobs dumpdir=dump # directory to dump full features verbose=0 # verbose option (if set > 0, get more log) @@ -28,9 +28,12 @@ n_shift=256 # number of shift points win_length="" # window length # config files -train_config=conf/train_pytorch_tacotron2.yaml # you can select from conf or conf/tuning. - # now we support tacotron2, transformer, and fastspeech - # see more info in the header of each config. +# you can select from conf or conf/tuning. +# now we support tacotron2, transformer, and fastspeech +# see more info in the header of each config. +train_config=conf/tuning/train_pytorch_transformer.v1.yaml +# train_config=conf/tuning/train_fastspeech.v1.yaml + decode_config=conf/decode.yaml # decoding related @@ -52,20 +55,21 @@ set -e set -u set -o pipefail +corpus_path="/data/nipeng/TTS/data/fake_lj_nosil" train_set="train_no_dev" dev_set="dev" eval_set="eval" -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "stage -1: Data Download" - local/download.sh ${db_root} -fi +# if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then +# echo "stage -1: Data Download" +# local/download.sh ${db_root} +# fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ### Task dependent. You have to make data the following preparation part by yourself. ### But you can utilize Kaldi recipes in most cases echo "stage 0: Data preparation" - local/data_prep.sh ${db_root}/LJSpeech-1.1 data/train + local/data_prep.sh ${corpus_path} data/train utils/validate_data_dir.sh --no-feats data/train fi diff --git a/tools/Makefile b/tools/Makefile index 8c88f218678..78129894166 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,15 +1,15 @@ # If a Python interpreter is specified, then creates a virtualenv from it # PYTHON := /usr/bin/python3.7 -PYTHON := +PYTHON := /root/anaconda3/envs/np-py36-tts/bin/python3 # The python version installed in the conda setup # NOTE(kan-bayashi): Use 3.7.3 to avoid sentencepiece installation error -PYTHON_VERSION := 3.7.3 +PYTHON_VERSION := 3.6.8 CUPY_VERSION := 6.0.0 -CUDA_VERSION := 10.0 +CUDA_VERSION := 9.0 # PyTorch version: 0.4.1 or 1.0.0 or 1.0.1 -TH_VERSION := 1.0.1 +TH_VERSION := 1.0.0 # Use a prebuild Kaldi to omit the installation -KALDI := +KALDI := /data/nipeng/kaldi WGET := wget --tries=3 # Both Miniconda2/3 can install any Python versions From 0d784e384aa06b12e41180e8191493dffdbe4169 Mon Sep 17 00:00:00 2001 From: nipeng Date: Thu, 24 Oct 2019 12:33:36 +0800 Subject: [PATCH 02/23] update --- egs/aishell/asr1/infer.sh | 3 ++- egs/aishell/asr1/local/aishell_data_prep.sh | 2 +- egs/aishell/asr1/run.sh | 8 ++++---- egs/aishell/asr1/test.py | 22 +++++++++++++++++++++ 4 files changed, 29 insertions(+), 6 deletions(-) create mode 100644 egs/aishell/asr1/test.py diff --git a/egs/aishell/asr1/infer.sh b/egs/aishell/asr1/infer.sh index b9cbb023099..731d21832d0 100755 --- a/egs/aishell/asr1/infer.sh +++ b/egs/aishell/asr1/infer.sh @@ -36,7 +36,8 @@ n_average=10 #data=/export/a05/xna/data #data_url=www.openslr.org/resources/33 -data=/data/nipeng/2019-10-07-aishell +# data=/data/nipeng/2019-10-07-aishell +data=/data/nipeng/2019-1001-1020-aishell # exp tag tag="" # tag for managing experiments. 
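
The hunk above swaps one hard-coded data root for another. Because infer.sh
assigns data= before sourcing utils/parse_options.sh, the root can also be
overridden per invocation instead of being edited in place -- a sketch, assuming
the in-script default is left as committed:

    ./infer.sh --data /data/nipeng/2019-1001-1020-aishell
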
diff --git a/egs/aishell/asr1/local/aishell_data_prep.sh b/egs/aishell/asr1/local/aishell_data_prep.sh index 700afa3dee9..a4e411f175b 100755 --- a/egs/aishell/asr1/local/aishell_data_prep.sh +++ b/egs/aishell/asr1/local/aishell_data_prep.sh @@ -12,7 +12,7 @@ if [ $# != 2 ]; then fi aishell_audio_dir=$1 -aishell_text=$2/aishell_transcript_v0.8.txt.0604.relabeled.0923.v2 +aishell_text=$2/aishell_transcript_v0.8.txt.add10w.20190918.tune_high_medium_lv.v2 train_dir=data/local/train dev_dir=data/local/dev diff --git a/egs/aishell/asr1/run.sh b/egs/aishell/asr1/run.sh index 45d750b3907..77119cf5694 100755 --- a/egs/aishell/asr1/run.sh +++ b/egs/aishell/asr1/run.sh @@ -8,10 +8,10 @@ # general configuration backend=pytorch -stage=5 # start from 0 if you need to start from data preparation -stop_stage=5 -export CUDA_VISIBLE_DEVICES=0,2,3 -ngpu=3 # number of gpus ("0" uses cpu, otherwise use gpu) +stage=0 # start from 0 if you need to start from data preparation +stop_stage=100 +# export CUDA_VISIBLE_DEVICES=0,2,3 +ngpu=4 # number of gpus ("0" uses cpu, otherwise use gpu) debugmode=1 dumpdir=dump # directory to dump full features N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches. diff --git a/egs/aishell/asr1/test.py b/egs/aishell/asr1/test.py new file mode 100644 index 00000000000..bfb1cc33981 --- /dev/null +++ b/egs/aishell/asr1/test.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- + + +import codecs + + + +def get_infer_result(): + path = "/data/nipeng/TTS/espnet/egs/aishell/asr1/exp/train_sp_pytorch_train/decode_infer_decode_lm/hyp.trn" + fout = codecs.open("infer.txt", "w") + with codecs.open(path) as f: + for line in f: + t, a = line.split("(") + text = t.replace(" ", "") + aid = a.split("-")[0] + aname = aid + ".wav" + fout.write("%s\t%s\n" %(aname, text)) + + +if __name__ == "__main__": + print("ok") + get_infer_result() From 878eaef9934bcf6e12e29c3132542a6e4a681503 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 24 Oct 2019 16:30:44 +0800 Subject: [PATCH 03/23] =?UTF-8?q?=E4=BF=AE=E6=94=B9python,kaldi=E9=BB=98?= =?UTF-8?q?=E8=AE=A4=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index 78129894166..11173755732 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,6 +1,7 @@ # If a Python interpreter is specified, then creates a virtualenv from it # PYTHON := /usr/bin/python3.7 -PYTHON := /root/anaconda3/envs/np-py36-tts/bin/python3 +# PYTHON := /root/anaconda3/envs/np-py36-tts/bin/python3 +PYTHON := # The python version installed in the conda setup # NOTE(kan-bayashi): Use 3.7.3 to avoid sentencepiece installation error PYTHON_VERSION := 3.6.8 @@ -9,7 +10,8 @@ CUDA_VERSION := 9.0 # PyTorch version: 0.4.1 or 1.0.0 or 1.0.1 TH_VERSION := 1.0.0 # Use a prebuild Kaldi to omit the installation -KALDI := /data/nipeng/kaldi +# KALDI := /data/nipeng/kaldi +KALDI := WGET := wget --tries=3 # Both Miniconda2/3 can install any Python versions From f44d6fdc2590803ab5bef15cc95f9703d07fd28c Mon Sep 17 00:00:00 2001 From: nipeng Date: Thu, 24 Oct 2019 22:25:43 +0800 Subject: [PATCH 04/23] add: conda config channels --- tools/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/Makefile b/tools/Makefile index 11173755732..22951a259c2 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -63,6 +63,9 @@ miniconda.sh: test -f miniconda.sh || $(WGET) $(CONDA_URL) -O miniconda.sh venv: 
miniconda.sh test -d $(PWD)/venv || bash miniconda.sh -b -p $(PWD)/venv + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/msys2/ + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/ + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ . venv/bin/activate && conda update -y conda . venv/bin/activate && conda install -y python=$(PYTHON_VERSION) . venv/bin/activate && conda info -a From 964803fc52636922e8c1be4dd145a9f44cec13f4 Mon Sep 17 00:00:00 2001 From: nipeng Date: Fri, 25 Oct 2019 09:16:09 +0800 Subject: [PATCH 05/23] add: makefile --- tools/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index 22951a259c2..44f5552cf3c 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -63,10 +63,11 @@ miniconda.sh: test -f miniconda.sh || $(WGET) $(CONDA_URL) -O miniconda.sh venv: miniconda.sh test -d $(PWD)/venv || bash miniconda.sh -b -p $(PWD)/venv + . venv/bin/activate && conda update -y conda . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/msys2/ . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/ . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ - . venv/bin/activate && conda update -y conda + . venv/bin/activate && conda config --set show_channel_urls yes . venv/bin/activate && conda install -y python=$(PYTHON_VERSION) . venv/bin/activate && conda info -a espnet.done: venv From d9262d8d109516f1947ec32bf0bdf8a11dfba066 Mon Sep 17 00:00:00 2001 From: nipeng Date: Fri, 25 Oct 2019 09:38:55 +0800 Subject: [PATCH 06/23] add: pip index --- tools/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/Makefile b/tools/Makefile index 44f5552cf3c..61c4b850a63 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -56,6 +56,7 @@ venv: espnet.done: venv . venv/bin/activate; pip install pip --upgrade . venv/bin/activate; pip install -e .. + . venv/bin/activate; pip config set global.index-url https://pypi.mirrors.ustc.edu.cn/simple/ . venv/bin/activate; pip install torch==$(TH_VERSION) touch espnet.done else From 9634530d2e853e775e045add82d28029fdf23fa2 Mon Sep 17 00:00:00 2001 From: nipeng Date: Fri, 25 Oct 2019 11:23:22 +0800 Subject: [PATCH 07/23] add: pip --- tools/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/Makefile b/tools/Makefile index 61c4b850a63..d75297666bd 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -54,6 +54,7 @@ ifneq ($(strip $(PYTHON)),) venv: test -d venv || virtualenv -p $(PYTHON) venv espnet.done: venv + . venv/bin/activate; pip install -U pip . venv/bin/activate; pip install pip --upgrade . venv/bin/activate; pip install -e .. . 
venv/bin/activate; pip config set global.index-url https://pypi.mirrors.ustc.edu.cn/simple/ From b25af285cf702dfe2b2661a127dc470f65ee4f8b Mon Sep 17 00:00:00 2001 From: nipeng Date: Fri, 25 Oct 2019 14:52:27 +0800 Subject: [PATCH 08/23] =?UTF-8?q?=E4=BF=AE=E6=94=B9python=E7=89=88?= =?UTF-8?q?=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index d75297666bd..6810c9a2642 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -4,7 +4,7 @@ PYTHON := # The python version installed in the conda setup # NOTE(kan-bayashi): Use 3.7.3 to avoid sentencepiece installation error -PYTHON_VERSION := 3.6.8 +PYTHON_VERSION := 3.7.3 CUPY_VERSION := 6.0.0 CUDA_VERSION := 9.0 # PyTorch version: 0.4.1 or 1.0.0 or 1.0.1 From e669ea4015b1f57d104b2e3736314b093b48a07b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E8=92=99?= Date: Fri, 25 Oct 2019 16:59:59 +0800 Subject: [PATCH 09/23] pip --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index 6810c9a2642..c3250242d57 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -58,7 +58,7 @@ espnet.done: venv . venv/bin/activate; pip install pip --upgrade . venv/bin/activate; pip install -e .. . venv/bin/activate; pip config set global.index-url https://pypi.mirrors.ustc.edu.cn/simple/ - . venv/bin/activate; pip install torch==$(TH_VERSION) + . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ torch==$(TH_VERSION) touch espnet.done else miniconda.sh: From f1c55f12c074426d330757b4f16bbf0fbd3745b6 Mon Sep 17 00:00:00 2001 From: nipeng Date: Fri, 25 Oct 2019 17:27:14 +0800 Subject: [PATCH 10/23] add: conda channels --- tools/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/Makefile b/tools/Makefile index c3250242d57..d3c6a38ede3 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -73,6 +73,10 @@ venv: miniconda.sh . venv/bin/activate && conda install -y python=$(PYTHON_VERSION) . venv/bin/activate && conda info -a espnet.done: venv + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/msys2/ + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/ + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + . venv/bin/activate && conda config --set show_channel_urls yes . venv/bin/activate && conda install -y $(CONDA_PYTORCH) -c pytorch . venv/bin/activate && pip install -e .. 
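
Each "conda config --add channels" line above prepends an entry to the channels
list in $HOME/.condarc, so the mirror added last is consulted first. The same
setup can be expressed declaratively -- a sketch of the resulting .condarc,
assuming no channels were configured beforehand:

    channels:
      - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
      - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
      - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/msys2/
      - defaults
    show_channel_urls: true
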
touch espnet.done From 19d3e85d00e54cb5e7df6a9a1bdc0942a719161a Mon Sep 17 00:00:00 2001 From: nipeng Date: Sat, 26 Oct 2019 09:29:29 +0800 Subject: [PATCH 11/23] =?UTF-8?q?=E5=A2=9E=E5=8A=A0infer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker/infer.sh | 163 ++++++++++++++++++ .../asr1/local/aishell_data_prep_infer.sh | 1 + 2 files changed, 164 insertions(+) create mode 100755 docker/infer.sh diff --git a/docker/infer.sh b/docker/infer.sh new file mode 100755 index 00000000000..e7b6ec914a1 --- /dev/null +++ b/docker/infer.sh @@ -0,0 +1,163 @@ +#!/bin/bash + +docker_gpu=0 +docker_egs= +docker_folders= +docker_cuda=10.0 +docker_user=true +docker_env= +docker_cmd= +docker_os=u18 + +while test $# -gt 0 +do + case "$1" in + -h) echo "Usage: `basename $0` [-h] docker_gpu docker_egs docker_folders options" + exit 0;; + --help) echo "Usage: `basename $0` [-h] ] docker_gpu docker_egs docker_folders options" + exit 0;; + --docker*) ext=${1#--} + frombreak=true + for i in _ {a..z} {A..Z}; do + for var in `eval echo "\\${!${i}@}"`; do + if [ "$var" == "$ext" ]; then + eval ${ext}=$2 + frombreak=false + break 2 + fi + done + done + if ${frombreak} ; then + echo "bad option $1" + exit 1 + fi + ;; + --*) break + ;; + esac + shift + shift +done + +if [ -z "${docker_egs}" ]; then + echo "Select an example to work with from the egs folder." + exit 1 +fi + +from_tag="cpu" +if [ ! "${docker_gpu}" == "-1" ]; then + if [ -z "${docker_cuda}" ]; then + # If the docker_cuda is not set, the program will automatically + # search the installed version with default configurations (apt) + docker_cuda=$( nvcc -V | grep release ) + docker_cuda=${docker_cuda#*"release "} + docker_cuda=${docker_cuda%,*} + fi + # After search for your cuda version, if the variable docker_cuda is empty the program will raise an error + if [ -z "${docker_cuda}" ]; then + echo "CUDA was not found in your system. Use CPU image or install NVIDIA-DOCKER, CUDA and NVCC for GPU image." + exit 1 + else + from_tag="gpu-cuda${docker_cuda}-cudnn7" + fi +fi + +if [ ! -z "${docker_os}" ]; then + from_tag="${from_tag}-${docker_os}" +fi + +# Check if image exists in the system and download if required +docker_image=$( docker images -q espnet/espnet:${from_tag} ) +if ! [[ -n ${docker_image} ]]; then + docker pull espnet/espnet:${from_tag} +fi + +if [ ${docker_user} = true ]; then + # Build a container with the user account + container_tag="${from_tag}-user-${HOME##*/}" + docker_image=$( docker images -q espnet/espnet:${container_tag} ) + if ! [[ -n ${docker_image} ]]; then + echo "Building docker image..." + build_args="--build-arg FROM_TAG=${from_tag}" + build_args="${build_args} --build-arg THIS_USER=${HOME##*/}" + build_args="${build_args} --build-arg THIS_UID=${UID}" + + echo "Now running docker build ${build_args} -f prebuilt/Dockerfile -t espnet/espnet:${container_tag} ." + (docker build ${build_args} -f prebuilt/Dockerfile -t espnet/espnet:${container_tag} .) || exit 1 + fi +else + container_tag=${from_tag} +fi + +echo "Using image espnet/espnet:${container_tag}." + +this_time="$(date '+%Y%m%dT%H%M')" +if [ "${docker_gpu}" == "-1" ]; then + cmd0="docker" + container_name="espnet_cpu_${this_time}" +else + # --rm erase the container when the training is finished. + cmd0="NV_GPU='${docker_gpu}' nvidia-docker" + container_name="espnet_gpu${docker_gpu//,/_}_${this_time}" +fi + +cd .. 
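+# The bind-mount list below is assembled as "-v host_path:container_path" pairs:
+# the MFS share, the repo's egs/espnet/test/utils trees, plus any extra folders
+# passed via --docker-folders. A sketch of the flags this yields (the /my/extra
+# entry stands in for a hypothetical --docker-folders /my/extra argument):
+#   -v /data/mfs:/data -v ${PWD}/egs:/espnet/egs -v /my/extra:/my/extra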
+ +vols="-v /data/mfs:/data -v ${PWD}/egs:/espnet/egs -v ${PWD}/espnet:/espnet/espnet -v ${PWD}/test:/espnet/test -v ${PWD}/utils:/espnet/utils" +if [ ! -z "${docker_folders}" ]; then + docker_folders=$(echo ${docker_folders} | tr "," "\n") + for i in ${docker_folders[@]} + do + vols=${vols}" -v $i:$i"; + done +fi + +cmd1="cd /espnet/egs/${docker_egs}" +if [ ! -z "${docker_cmd}" ]; then + cmd2="./${docker_cmd} $@" +else + cmd2="./run.sh $@" +fi + +if [ ${docker_user} = false ]; then + # Required to access to the folder once the training if finished in root access + cmd2="${cmd2}; chmod -R 777 /espnet/egs/${docker_egs}" +fi + +cmd="${cmd1}; ${cmd2}" +this_env="" +if [ ! -z "${docker_env}" ]; then + docker_env=$(echo ${docker_env} | tr "," "\n") + for i in ${docker_env[@]} + do + this_env="-e $i ${this_env}" + done +fi + +if [ ! -z "${HTTP_PROXY}" ]; then + this_env="${this_env} -e 'HTTP_PROXY=${HTTP_PROXY}'" +fi + +if [ ! -z "${http_proxy}" ]; then + this_env="${this_env} -e 'http_proxy=${http_proxy}'" +fi + +cmd="${cmd0} run -i -v /data/mfs:/data/mfs --rm ${this_env} --name ${container_name} ${vols} espnet/espnet:${container_tag} /bin/bash -c '${cmd}'" + + +trap ctrl_c INT + +function ctrl_c() { + echo "** Kill docker container ${container_name}" + docker rm -f ${container_name} +} + +echo "Executing application in Docker" +echo ${cmd} +eval ${cmd} & +PROC_ID=$! + +while kill -0 "$PROC_ID" 2> /dev/null; do + sleep 1 +done +echo "`basename $0` done." diff --git a/egs/aishell/asr1/local/aishell_data_prep_infer.sh b/egs/aishell/asr1/local/aishell_data_prep_infer.sh index f4b75d102a7..c598ed5fc13 100755 --- a/egs/aishell/asr1/local/aishell_data_prep_infer.sh +++ b/egs/aishell/asr1/local/aishell_data_prep_infer.sh @@ -29,6 +29,7 @@ mkdir -p $tmp_dir # data directory check if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then echo "Error: $0 requires two directory arguments" + echo ${aishell_audio_dir}, ${aishell_text} exit 1; fi From 0c3940894cc049119b3362135db7234d7a3c9fd0 Mon Sep 17 00:00:00 2001 From: nipeng Date: Mon, 28 Oct 2019 18:26:36 +0800 Subject: [PATCH 12/23] change trans file name --- egs/aishell/asr1/asr_pipline.py | 453 ++++++++++++++++++ egs/aishell/asr1/asr_pipline.sh | 34 ++ .../asr1/local/aishell_data_prep_infer.sh | 2 +- 3 files changed, 488 insertions(+), 1 deletion(-) create mode 100644 egs/aishell/asr1/asr_pipline.py create mode 100644 egs/aishell/asr1/asr_pipline.sh diff --git a/egs/aishell/asr1/asr_pipline.py b/egs/aishell/asr1/asr_pipline.py new file mode 100644 index 00000000000..50bb04b88e4 --- /dev/null +++ b/egs/aishell/asr1/asr_pipline.py @@ -0,0 +1,453 @@ +# -*- coding: utf-8 -*- + +import sys + +if sys.version[0] == '2': + reload(sys) + sys.setdefaultencoding("utf-8") + +import os +import time +import json +import redis +import jieba +import codecs +import shutil +import hashlib +import tempfile +import argparse +import subprocess +from collections import defaultdict +from functools import partial +from concurrent.futures import ProcessPoolExecutor +from multiprocessing import cpu_count +from pydub import AudioSegment +from kafka import KafkaConsumer, KafkaProducer +from kafka.errors import CommitFailedError +from elasticsearch import Elasticsearch, helpers + + +def norm_aishell_data(indir): + # /tmp/asr_9hmlx3zl + outdir = indir + "_norm" + for path in os.listdir(indir): + fname = path.split(".")[0] + os.system("mkdir -p %s/data_aishell/wav/infer/%s" %(outdir, fname)) + cmd = "cp %s/%s %s/data_aishell/wav/infer/%s/." 
%(indir, path, outdir, fname) + os.system("cp %s/%s %s/data_aishell/wav/infer/%s/." %(indir, path, outdir, fname)) + + os.system("mkdir -p %s/data_aishell/transcript" %outdir) + outfile = "%s/data_aishell/transcript/aishell_transcript.txt" %outdir + fout = codecs.open(outfile, "w") + for path in os.listdir(indir): + fname = path.split(".")[0] + fout.write("%s %s\n" %(fname, "哈哈")) + return outdir + + +def gen_sp_wav_and_get_path_one(wav_temppath, audio_id, sound, item, k): + cut_wav_name = "%s_%s_%s_%s.wav" % (audio_id, item['start'], item['end'], k) + save_cut_path = os.path.join(wav_temppath, cut_wav_name) + sp_wav = sound[int(item['start']):int(item['end'])] + if sp_wav.frame_rate != 8000: + sp_wav = sp_wav.set_frame_rate(8000) + sp_wav.export(save_cut_path, format="wav") + print("==3", save_cut_path) + return save_cut_path + + +def get_parser(): + parser = argparse.ArgumentParser(description='语音识别主函数参数') + parser.add_argument("-kh", "--kafka-host", + default="192.168.40.22:9090,192.168.40.19:9090,192.168.40.59:9090,192.168.40.60:9090,192.168.40.61:9090", + required=True, + help="kafka host:port 集群") + + parser.add_argument("-sct", "--seg-consumer-topics", + default="sp_vad_topic1", + help="输入kafka的topic") + + parser.add_argument("-ikg", "--seg-consumer-groupid", + default="asr_group1", + help="消费kafka的groupid") + + parser.add_argument("-stm", "--session-timeout-ms", + default=60000, + type=int, + help="消费kafka的心跳超时时间") + + parser.add_argument("-saor", "--seg-auto-offset-reset", + default="largest", + help="重置偏移量,earliest移到最早的可用消息,latest最新的消息, 默认为largest,即是latest.消费者消费类型参数:{'smallest': 'earliest', 'largest': 'latest'}") + + parser.add_argument("-apt", "--asr-producer-topics", + default="asr_topic1", + help="输入kafka的topic") + + parser.add_argument("-fp", "--father-path", + default="/data/mfs/k8s/speech_pipeline/sad", + help="切分来源音频存放父目录") + + parser.add_argument("-wp", "--hkust-path", + default="/home/app/asr_pipline/kaldi/egs/hkust/s5_iqianjin", + help="hkust的绝对目录,即kaldi的hkust目录") + + parser.add_argument("-nj", "--num-job", + type=int, default=10, + help="hkust num job default 10") + + parser.add_argument("-nw", "--num-workers", + type=int, default=18, + help="multiprocess number of workers") + + parser.add_argument("-cg", "--consumer-gap", + type=int, default=2, + help="kafka consumer msg num") + + parser.add_argument("-ptm", "--poll-timeout-ms", + type=int, default=60000, + help="") + + parser.add_argument('--is-comp', help='1为补数,0为不进入补数筛选逻辑', default=0, type=int) + args = parser.parse_args() + return args + + +class ASR(object): + def __init__(self, kafka_servers, seg_consumer_topics, seg_consumer_groupid, + session_timeout_ms=60000, seg_auto_offset_reset="largest", + asr_producer_topics="asr_topic1", num_job=10, is_comp=0, + poll_timeout_ms=60000, consumer_gap=None, num_workers=cpu_count()): + """ + :param kafka_servers: kafka host:port + :param seg_consumer_topics: 切分的消费者topic + :param seg_consumer_groupid: 消费者group id + :param session_timeout_ms: 心跳超时时间 + :param seg_auto_offset_reset: 重置偏移量, earliest移到最早的可用消息, latest最新的消息, 默认为largest,即是latest. 
+ 源码定义: {'smallest': 'earliest', 'largest': 'latest'} + :param asr_producer_topics: 语音是被的生产者topic,默认值:asr_topic1 + :param num_job: 语音识别的线程数 + :param is_comp: 是否为补数逻辑,是的话会进入时间判断和es数据判断 + """ + self.kafka_servers = kafka_servers + self.seg_consumer_groupid = seg_consumer_groupid + self.session_timeout_ms = session_timeout_ms + self.seg_auto_offset_reset = seg_auto_offset_reset + self.seg_consumer_topics = seg_consumer_topics + self.num_job = num_job + self.poll_timeout_ms = poll_timeout_ms + self.consumer_gap = consumer_gap + self.num_workers = num_workers + + self._get_from_client() + + # 语音识别结果的kafka生产者 + self.to_client = KafkaProducer(bootstrap_servers=kafka_servers, # kafka host:port + compression_type="gzip", + max_request_size=1024 * 1024 * 20) + self.asr_producer_topics = asr_producer_topics # ASR生产者topic + + self.is_comp = is_comp + if is_comp: + self.comp_num = 0 + self.prop = {"HOST1": "192.168.40.37", + "HOST2": "192.168.40.38", + "HOST3": "192.168.40.39", + "PORT": "9200", + "DOC_TYPE": "kf_infobird_call"} + + self.es = Elasticsearch(hosts=[{'host': self.prop['HOST1'], 'port': self.prop['PORT']}, + {'host': self.prop['HOST2'], 'port': self.prop['PORT']}, + {'host': self.prop['HOST3'], 'port': self.prop['PORT']}]) + + def _get_from_client(self): + # 消费者切分好的音频kafka消费者 + self.from_client = KafkaConsumer(bootstrap_servers=self.kafka_servers, # kafka host:port + group_id=self.seg_consumer_groupid, # 消费者group id + session_timeout_ms=self.session_timeout_ms, # 设置心跳超时时间 + enable_auto_commit=False, # 是否自动提交 + auto_offset_reset=self.seg_auto_offset_reset) # 消费重置偏移量 + self.from_client.subscribe(self.seg_consumer_topics) # 切分的消费者topic + + def asr_pipline_from_kafka(self, father_path): + """ + 获取kafka的数据流,并进行识别,合并,标点,存入es + :param father_path: 切分来源音频存放父目录 + :return: + """ + # redis_client = redis.Redis(host='192.168.192.202', port=40029, db=0, password="Q8TYmIwQSHNFbLJ2") + while True: + if not self.from_client: + self._get_from_client() + + tp_msgs = self.from_client.poll(timeout_ms=self.poll_timeout_ms, + max_records=self.consumer_gap) + msgs = [] + for tp, _msgs in tp_msgs.items(): + msgs.extend(_msgs) + print(len(msgs)) + self.batch_asr_pipline(father_path, msgs) + break + + def batch_asr_pipline(self, father_path, msgs): + """ + 单个kafka消息消费: + 1)从kafka消息中提取value信息; + 2)根据msg.value信息从切分好的数据源复制到目标位置; + 3)进行语音识别,放进merge_dict里; + 4)写入kafka新的topic; + 7)删除临时音频的文件夹和语音识别结果的临时文件. 
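+        (English gloss of the steps above: 1) extract the value payload from
+        the Kafka message; 2) copy/cut the pre-segmented source audio to the
+        target location according to msg.value; 3) run ASR and collect the text
+        into merge_dict; 4) publish to the new Kafka topic; 7) remove the
+        temporary audio dir and the temporary recognition outputs.)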
+ :param father_path: 切分来源音频存放父目录; + :param msg: 从kafka获取的完整消息; + :return: + """ + flag = False # flag 是否语音识别成功并存入kafka + wav_temppath = tempfile.mkdtemp(prefix="asr_") + batch_wav_lst = [] + batch_voice_data = {} + batch_merge_dict = None + +# try: + redis_client = redis.Redis(host='192.168.192.202', port=40029, db=0, password="Q8TYmIwQSHNFbLJ2") + for msg in msgs: + if msg is not None: + audio_id = json.loads(msg.value).get('audio_id', '') + print("==2", audio_id) + if not redis_client.get('asr_espnet_' + str(audio_id)): + # step1: 从kafka消息中提取value信息 + voice_data = json.loads(msg.value) + voice_data['start_asr'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + batch_voice_data[audio_id] = voice_data # 后面继续往 voice_data 追加数据 + # step2:根据msg.value信息获取音频,并按提前提供的音频片段开始结束时间,生成切分后的wav + batch_wav_lst = self.gen_sp_wav_and_get_path_mp(father_path, wav_temppath, batch_voice_data) + # step3: 语音识别, 并获取merge的text + # batch results + print("==4", wav_temppath) + wav_normpath = norm_aishell_data(wav_temppath) + merge_dict = self._asr_cmd(wav_normpath) if batch_wav_lst else {} + #batch_merge_dict = self.split_merge_dict(merge_dict) +# except Exception as e: +# print("bebefore commit to kafka error log: %s, msg: %s" % (e, "")) +# finally: +# # step7: 删除临时音频的文件夹和语音识别结果的临时文件 +# #shutil.rmtree(wav_temppath) +# pass + return + + for audio_id, voice_data in batch_voice_data.items(): + # step4: 写入kafka新的topic + try: + voice_data["merge_dict"] = batch_merge_dict[audio_id] + voice_data['end_asr'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + flag = self._kafka_producers(voice_data) if voice_data.get("merge_dict", {}) else False + voice_data['step4_end_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + print("asr_output:", json.dumps(voice_data, ensure_ascii=True)) + except Exception as e: + print("before commit to kafka error log: %s, msg: %s" % (e, "")) + try: + self.from_client.commit() + except CommitFailedError: + print('commit error!') + for audio_id, _ in batch_voice_data.items(): + redis_client.set('asr_espnet_' + str(audio_id), 1, ex=24 * 3600) + + return flag + + def split_merge_dict(self, merge_dict): + """ + 拆分merge_dict + :param merge_dict: {"A_00*_**.wav": "123", + "A_**_***.wav": "321", + "B_**_***.wav": "423"} + """ + split_dict = defaultdict(dict) + for wav_file, text in merge_dict.items(): + audio_id = wav_file.split("_")[0] + split_dict[audio_id][wav_file] = text + return split_dict + + def filter_conditions(self, voice_data): + """ + 筛选出不需要补数的数据,并抛出异常; + :param voice_data: + :return: + """ + end_sad = voice_data.get("end_sad", "0000-00-00 00:00:00") + audio_id = voice_data.get("audio_id", None) + body = { + "query": {"bool": {"must": [{"term": {"audio_id.keyword": audio_id}}], "must_not": [], "should": []}}, + "from": 0, "size": 10, "sort": [], "aggs": {}} + if end_sad < "2019-07-11 12:00:00" or end_sad > "2019-07-11 21:00:00": + raise RuntimeError('This one\'s end_sad not in time window !') + query = self.es.search(index="asr_text", + doc_type=self.prop['DOC_TYPE'], + scroll='5m', + timeout='30s', + body=body) + results = query['hits']['hits'] # es查询出的结果第一页 + total = query['hits']['total'] # es查询出的结果总量 + print("audio_id:", audio_id, "search num:", total, ",end_sad", end_sad) + if total > 0: + line = results[0]["_source"] + if line.get("cut_text"): + raise RuntimeError('This id already exists and already asr !') + else: + segments = voice_data.get("segments", {}) + for k, v in segments.items(): + if k in ["gk", "kf"] and len(v) > 0: + break + else: + raise 
RuntimeError('This data segments is null !') + self.comp_num += 1 + + def _copy_wav(self, father_path, voice_data, temp_path): + """ + step2: 根据msg信息从切分好的数据源复制到目标位置 + :param father_path: 切分来源音频存放父目录 + :param voice_data: kafka来源信息中的包含切分音频的信息 + :param temp_path: 音频复制目标目录 + :return: wav_lst: + """ + source = voice_data.get("source", "") + date = voice_data.get("date", "") + audio_id = voice_data.get("audio_id", "") + + value_dict = voice_data.get("segments", {}) + wav_lst = [] # 音频list + + for chn, split_time in value_dict.items(): + for si in split_time: + split_path = "%s_%s_%s_%s.wav" % (audio_id, si.get("start"), si.get("end"), chn) + wav_path = os.path.join(father_path, source, date, audio_id, split_path) + shutil.copy(wav_path, temp_path) + wav_lst.append(wav_path) + return wav_lst + + def gen_sp_wav_and_get_path(self, father_path, wav_temppath, voice_data): + """ + step2: 根据msg信息获取音频,并按提前提供的音频片段开始结束时间,生成切分后的wav + :param father_path: 音频来源目录,结果存储 + :param wav_temppath: kafka来源信息中的包含切分音频的信息 + :return: wav_lst: + """ + wav_lst = [] # wav 音频存储 + source = voice_data["source"] + code = None if source != 'infobird' else 'pcm_s16le' + date = voice_data["date"] + audio_id = voice_data["audio_id"] + wav_father_path = os.path.join(father_path, source, date) # /data/mfs/k8s/speech_pipeline/raw/{source}/{date} + for k, v in voice_data['segments'].items(): + if k in ["gk", "kf"] and len(v) > 0: + wav_name = "%s_%s.wav" % (audio_id, k) + raw_wav_path = os.path.join(wav_father_path, wav_name) + sound = AudioSegment.from_file(raw_wav_path, codec=code, format="wav") + for item in v: + cut_wav_name = "%s_%s_%s_%s.wav" % (audio_id, item['start'], item['end'], k) + save_cut_path = os.path.join(wav_temppath, cut_wav_name) + sp_wav = sound[int(item['start']):int(item['end'])] + if sp_wav.frame_rate != 8000: + sp_wav = sp_wav.set_frame_rate(8000) + sp_wav.export(save_cut_path, format="wav") + wav_lst.append(save_cut_path) + return wav_lst + + def gen_sp_wav_and_get_path_mp(self, father_path, wav_temppath, batch_voice_data): + """ + multiprocess generator + step2-1: 根据msg信息获取音频,并按提前提供的音频片段开始结束时间,生成切分后的wav + :param father_path: 音频来源目录,结果存储 + :param wav_temppath: kafka来源信息中的包含切分音频的信息 + :return: wav_lst: + """ + p = ProcessPoolExecutor(max_workers=self.num_workers) #不填则默认为cpu的个数 + result = [] + wav_lst = [] # wav 音频存储 + for _, voice_data in batch_voice_data.items(): + source = voice_data["source"] + code = None if source != 'infobird' else 'pcm_s16le' + date = voice_data["date"] + audio_id = voice_data["audio_id"] + wav_father_path = os.path.join(father_path, source, date) # /data/mfs/k8s/speech_pipeline/raw/{source}/{date} + for k, v in voice_data['segments'].items(): + if k in ["gk", "kf"] and len(v) > 0: + wav_name = "%s_%s.wav" % (audio_id, k) + raw_wav_path = os.path.join(wav_father_path, wav_name) + sound = AudioSegment.from_file(raw_wav_path, codec=code, format="wav") + for item in v: + # print(item, type(item)) + obj = p.submit(partial(gen_sp_wav_and_get_path_one, wav_temppath, audio_id, sound, item, k)) + result.append(obj) + p.shutdown() + res = [obj.result() for obj in result] + return res + + def _asr_cmd(self, wav_path): + """ + step3: 语音识别, 并获取merge的text + :param hkust_path: hkust的绝对目录,即kaldi的hkust目录 + :param wav_path: 需识别音频存放目录 + :return: + """ + decode_cmd = "./infer.sh {}".format(wav_path) + print(decode_cmd) + os.system(decode_cmd) + ## return merge_dict + + def _kafka_producers(self, voice_data): + """ + step6: 将语音识别好的结果存入kafka中 + :param voice_data: 来源kafka音频结果与语音识别结果结合体,从中获取相关信息,包含识别结果 + 
:return:
+        """
+        flag = False
+        try:
+            audio_id = voice_data.get("audio_id", "")
+            asr_model = voice_data.get("asr_model", "")
+            k = self._create_id_by_input(audio_id + asr_model).encode("utf8")
+            v = json.dumps(voice_data).encode("utf8")
+            self.to_client.send(topic=self.asr_producer_topics, key=k, value=v)
+            flag = True
+        except Exception as e:
+            # logger.error("error: %s, voice_data: %s" % (e, voice_data))
+            print("error: %s, voice_data: %s" % (e, json.dumps(voice_data, ensure_ascii=True)))
+        finally:
+            return flag
+
+    def _create_id_by_input(self, id=""):
+        """
+        Generate a unique md5 id from the input.
+        :param id: input string the digest is derived from
+        :return:
+        """
+        x = str(id).encode("utf8")
+        m = hashlib.md5(x)
+        return m.hexdigest()
+
+
+if __name__ == '__main__':
+    args = get_parser()
+    kafka_host, session_timeout_ms = args.kafka_host, args.session_timeout_ms
+    seg_consumer_topics, seg_auto_offset_reset = args.seg_consumer_topics, args.seg_auto_offset_reset  # consumer params for the segmentation results
+    seg_consumer_groupid, asr_producer_topics = args.seg_consumer_groupid, args.asr_producer_topics  # ASR producer params
+    father_path, hkust_path = args.father_path, args.hkust_path  # audio and model related params
+    num_job = args.num_job  # number of threads
+    is_comp = args.is_comp  # whether to run the backfill filtering flow
+    poll_timeout_ms = args.poll_timeout_ms
+    consumer_gap = args.consumer_gap
+    num_workers = args.num_workers
+
+    # python2 asr_pipline.py --kafka-host 192.168.40.22:9090,192.168.40.19:9090,192.168.40.59:9090,192.168.40.60:9090,192.168.40.61:9090 \
+    #  --seg_consumer_topics sp_vad_topic1 \
+    #  --seg_consumer_groupid asr_group1 \
+    #  --seg-auto-offset-reset largest \
+    #  --father-path /data/mfs/k8s/speech_pipeline/sad \
+    #  --hkust-path /home/app/asr_pipline/kaldi/egs/hkust/s5_iqianjin
+
+    if type(seg_consumer_topics) is str:
+        seg_consumer_topics = [seg_consumer_topics, ]
+    asr = ASR(kafka_servers=kafka_host, seg_consumer_topics=seg_consumer_topics, session_timeout_ms=session_timeout_ms,
+              seg_consumer_groupid=seg_consumer_groupid, seg_auto_offset_reset=seg_auto_offset_reset,
+              asr_producer_topics=asr_producer_topics, num_job=num_job, is_comp=is_comp,
+              poll_timeout_ms=poll_timeout_ms, consumer_gap=consumer_gap, num_workers=num_workers)
+
+    asr.asr_pipline_from_kafka(father_path)
diff --git a/egs/aishell/asr1/asr_pipline.sh b/egs/aishell/asr1/asr_pipline.sh
new file mode 100644
index 00000000000..52af73ac321
--- /dev/null
+++ b/egs/aishell/asr1/asr_pipline.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/bash
+#vim /etc/hosts
+## Entries added by HostAliases.
+#192.168.176.181 daasoffline1.kafka.dc.puhuifinance.com +#192.168.176.182 daasoffline2.kafka.dc.puhuifinance.com +#192.168.176.183 daasoffline3.kafka.dc.puhuifinance.com +#192.168.176.184 daasoffline4.kafka.dc.puhuifinance.com +#192.168.176.185 daasoffline5.kafka.dc.puhuifinance.com + +# test +# 测试需要在代码内修改生产则kafka host +python3 asr_pipline.py \ + --kafka-host daasoffline1.kafka.dc.puhuifinance.com:6667,daasoffline2.kafka.dc.puhuifinance.com:6667,daasoffline3.kafka.dc.puhuifinance.com:6667,daasoffline4.kafka.dc.puhuifinance.com:6667,daasoffline5.kafka.dc.puhuifinance.com:6667 \ + --seg-consumer-topics sp_sad_topic \ + --seg-consumer-groupid sp_sad_asr_group_np_20191028_v3 \ + --session-timeout-ms 60000 \ + --seg-auto-offset-reset smallest \ + --asr-producer-topics asr_topic1_t1 \ + --father-path /data/mfs/k8s/speech_pipeline/raw \ + --hkust-path /home/app/hkust/kaldi/egs/hkust/s5_daihou \ + --num-job 40 +# --is-comp 1 + + +## online +#python2 -u asr_pipline.py --kafka-host daasoffline1.kafka.dc.puhuifinance.com:6667,daasoffline2.kafka.dc.puhuifinance.com:6667,daasoffline3.kafka.dc.puhuifinance.com:6667,daasoffline4.kafka.dc.puhuifinance.com:6667,daasoffline5.kafka.dc.puhuifinance.com:6667 \ +# --seg-consumer-topics sp_sad_topic \ +# --seg-consumer-groupid sp_sad_asr_group \ +# --session-timeout-ms 60000 \ +# --seg-auto-offset-reset largest \ +# --asr-producer-topics sp_asr_topic \ +# --father-path /data/mfs/k8s/speech_pipeline/sad \ +# --hkust-path /home/app/asr_pipline/kaldi/egs/hkust/s5_daihou \ +# --num-job 10 diff --git a/egs/aishell/asr1/local/aishell_data_prep_infer.sh b/egs/aishell/asr1/local/aishell_data_prep_infer.sh index c598ed5fc13..157b5c8dd4c 100755 --- a/egs/aishell/asr1/local/aishell_data_prep_infer.sh +++ b/egs/aishell/asr1/local/aishell_data_prep_infer.sh @@ -12,7 +12,7 @@ if [ $# != 2 ]; then fi aishell_audio_dir=$1 -aishell_text=$2/aishell_transcript_v0.8.txt.0604.relabeled.0923.v2 +aishell_text=$2/aishell_transcript.txt #train_dir=data/local/train #dev_dir=data/local/dev From 5e8c7bf0f345836f6bf549f01fd0821aa41cc7c6 Mon Sep 17 00:00:00 2001 From: nipeng Date: Mon, 28 Oct 2019 19:05:38 +0800 Subject: [PATCH 13/23] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=BE=93=E5=87=BA?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- egs/aishell/asr1/infer.sh | 23 +++++++++++++------ .../asr1/local/aishell_data_prep_infer.sh | 17 ++++++++------ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/egs/aishell/asr1/infer.sh b/egs/aishell/asr1/infer.sh index 731d21832d0..974eb830d31 100755 --- a/egs/aishell/asr1/infer.sh +++ b/egs/aishell/asr1/infer.sh @@ -6,6 +6,15 @@ . ./path.sh || exit 1; . ./cmd.sh || exit 1; +if [ $# != 2 ]; then + echo "Usage: $0 " + exit 1; +else + data=$1 # 输入文件夹。 + flag=$2 # 唯一标识,防止文件覆盖。 +fi + + # general configuration backend=pytorch stage=0 # start from 0 if you need to start from data preparation @@ -37,7 +46,7 @@ n_average=10 #data_url=www.openslr.org/resources/33 # data=/data/nipeng/2019-10-07-aishell -data=/data/nipeng/2019-1001-1020-aishell +# data=/data/nipeng/2019-1001-1020-aishell # exp tag tag="" # tag for managing experiments. @@ -53,7 +62,7 @@ set -o pipefail train_set=train_sp train_dev=dev #recog_set="dev test" -recog_set="infer" +recog_set=infer_${flag} # if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then # echo "stage -1: Data Download" @@ -65,9 +74,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ### Task dependent. 
You have to make data the following preparation part by yourself. ### But you can utilize Kaldi recipes in most cases echo "stage 0: Data preparation" - local/aishell_data_prep_infer.sh ${data}/data_aishell/wav ${data}/data_aishell/transcript + local/aishell_data_prep_infer.sh ${data}/data_aishell/wav ${data}/data_aishell/transcript ${flag} # remove space in text - for x in infer; do + for x in ${recog_set}; do cp data/${x}/text data/${x}/text.org paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \ > data/${x}/text @@ -81,7 +90,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ### Task dependent. You have to design training and dev sets by yourself. ### But you can utilize Kaldi recipes in most cases echo "stage 1: Feature Generation" - fbankdir=fbank + fbankdir=fbank_${flag} # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame # steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ # data/train exp/make_fbank/train ${fbankdir} @@ -90,8 +99,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # data/dev exp/make_fbank/dev ${fbankdir} # utils/fix_data_dir.sh data/dev steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 10 --write_utt2num_frames true \ - data/infer exp/make_fbank/infer ${fbankdir} - utils/fix_data_dir.sh data/infer + data/${recog_set} exp/make_fbank/${recog_set} ${fbankdir} + utils/fix_data_dir.sh data/${recog_set} # # speed-perturbed # utils/perturb_data_dir_speed.sh 0.9 data/train data/temp1 diff --git a/egs/aishell/asr1/local/aishell_data_prep_infer.sh b/egs/aishell/asr1/local/aishell_data_prep_infer.sh index f4b75d102a7..7d5ec817521 100755 --- a/egs/aishell/asr1/local/aishell_data_prep_infer.sh +++ b/egs/aishell/asr1/local/aishell_data_prep_infer.sh @@ -5,24 +5,27 @@ . 
./path.sh || exit 1; -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript" +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript uuid" exit 1; fi aishell_audio_dir=$1 aishell_text=$2/aishell_transcript_v0.8.txt.0604.relabeled.0923.v2 +flag=$3 #train_dir=data/local/train #dev_dir=data/local/dev #test_dir=data/local/test -infer_dir=data/local/infer -tmp_dir=data/local/tmp +raw_infer_dir=data/infer_${flag} +infer_dir=data/local/infer_${flag} +tmp_dir=data/local/tmp_${flag} #mkdir -p $train_dir #mkdir -p $dev_dir #mkdir -p $test_dir +mkdir -p $raw_infer_dir mkdir -p $infer_dir mkdir -p $tmp_dir @@ -56,10 +59,10 @@ for dir in $infer_dir; do utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt done -mkdir -p data/infer +## mkdir -p data/infer_${flag} for f in spk2utt utt2spk wav.scp text; do - cp $infer_dir/$f data/infer/$f || exit 1; + cp $infer_dir/$f ${raw_infer_dir}/$f || exit 1; done echo "$0: AISHELL data preparation succeeded" From aaef5d15f0253ebd5ca7bc804e9174decf9f8a6c Mon Sep 17 00:00:00 2001 From: root Date: Tue, 29 Oct 2019 09:42:53 +0800 Subject: [PATCH 14/23] =?UTF-8?q?=E5=A2=9E=E5=8A=A0conda=E6=BA=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- egs/aishell/asr1/asr_pipline.py | 5 ++++- tools/Makefile | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/egs/aishell/asr1/asr_pipline.py b/egs/aishell/asr1/asr_pipline.py index 50bb04b88e4..a8f9b24b6fb 100644 --- a/egs/aishell/asr1/asr_pipline.py +++ b/egs/aishell/asr1/asr_pipline.py @@ -388,7 +388,10 @@ def _asr_cmd(self, wav_path): :param wav_path: 需识别音频存放目录 :return: """ - decode_cmd = "./infer.sh {}".format(wav_path) + print("==1", wav_path) + flag = wav_path.split("_")[-2] + print(flag) + decode_cmd = "./infer.sh {} {}".format(wav_path, flag) print(decode_cmd) os.system(decode_cmd) ## return merge_dict diff --git a/tools/Makefile b/tools/Makefile index d3c6a38ede3..11f9cd51f01 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -68,14 +68,22 @@ venv: miniconda.sh . venv/bin/activate && conda update -y conda . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/msys2/ . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/ + . venv/bin/activate && conda config --add channels https://mirror.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/ + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/menpo/ + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/ . venv/bin/activate && conda config --set show_channel_urls yes . venv/bin/activate && conda install -y python=$(PYTHON_VERSION) . venv/bin/activate && conda info -a espnet.done: venv . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/msys2/ . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/ + . venv/bin/activate && conda config --add channels https://mirror.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ . 
venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/ + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/menpo/ + . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/ . venv/bin/activate && conda config --set show_channel_urls yes . venv/bin/activate && conda install -y $(CONDA_PYTORCH) -c pytorch . venv/bin/activate && pip install -e .. From 9cb916b6b4da826c59e6cb1e69cbd5748eabaa42 Mon Sep 17 00:00:00 2001 From: nipeng Date: Tue, 29 Oct 2019 09:57:10 +0800 Subject: [PATCH 15/23] =?UTF-8?q?=E5=AE=89=E8=A3=85pytorch=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E6=8C=87=E5=AE=9A=E7=9A=84=E6=B8=85=E5=8D=8E=E6=BA=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index 11f9cd51f01..436085278a6 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -85,7 +85,7 @@ espnet.done: venv . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/menpo/ . venv/bin/activate && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/ . venv/bin/activate && conda config --set show_channel_urls yes - . venv/bin/activate && conda install -y $(CONDA_PYTORCH) -c pytorch + . venv/bin/activate && conda install -y $(CONDA_PYTORCH) pytorch . venv/bin/activate && pip install -e .. touch espnet.done endif From 9e0d678dcec11f1497104ccc274af4ab466af62a Mon Sep 17 00:00:00 2001 From: nipeng Date: Tue, 29 Oct 2019 10:17:24 +0800 Subject: [PATCH 16/23] =?UTF-8?q?=E5=88=A0=E9=99=A4cudatookit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index 436085278a6..43e29c3b995 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -20,13 +20,13 @@ CONDA_URL := https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64. 
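
Two notes on the hunks above. In [PATCH 15/23], dropping `-c pytorch` leaves a bare `pytorch` argument that conda treats as an extra package name rather than a channel; the intent is presumably to resolve pytorch from the tuna mirror channels added earlier. [PATCH 16/23] then disables the nvidia-smi probe so the CPU build is always chosen. The disabled probe, restated as a sketch (the pins stand in for the Makefile's TH_VERSION/CUDA_VERSION variables):

```python
import shutil

# Mirror of the commented-out Makefile logic: pick the CUDA build only when
# nvidia-smi is on PATH, otherwise fall back to the CPU-only package, which
# the patched Makefile now selects unconditionally.
if shutil.which("nvidia-smi"):
    conda_pytorch = "pytorch=$(TH_VERSION) cudatoolkit=$(CUDA_VERSION)"
else:
    conda_pytorch = "pytorch-cpu=$(TH_VERSION)"
print(conda_pytorch)
```
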
GCC_VERSION := $(shell gcc -dumpversion) -ifneq ($(shell which nvidia-smi),) # 'nvcc' found -CONDA_PYTORCH := pytorch=$(TH_VERSION) cudatoolkit=$(CUDA_VERSION) -CUDA_DEPS := cupy.done -else +# ifneq ($(shell which nvidia-smi),) # 'nvcc' found +# CONDA_PYTORCH := pytorch=$(TH_VERSION) cudatoolkit=$(CUDA_VERSION) +# CUDA_DEPS := cupy.done +# else CONDA_PYTORCH := pytorch-cpu=$(TH_VERSION) CUDA_DEPS := -endif +# endif .PHONY: all clean From 625f19e81eb8f6abfe8df9298915e6b0cb047c0a Mon Sep 17 00:00:00 2001 From: nipeng Date: Tue, 29 Oct 2019 15:02:26 +0800 Subject: [PATCH 17/23] add --- egs/aishell/asr1/asr_pipline.py | 148 ++++++++++++-------------------- egs/aishell/asr1/asr_pipline.sh | 4 +- egs/aishell/asr1/infer.sh | 7 +- 3 files changed, 63 insertions(+), 96 deletions(-) mode change 100644 => 100755 egs/aishell/asr1/asr_pipline.sh diff --git a/egs/aishell/asr1/asr_pipline.py b/egs/aishell/asr1/asr_pipline.py index 50bb04b88e4..b88f66eaf64 100644 --- a/egs/aishell/asr1/asr_pipline.py +++ b/egs/aishell/asr1/asr_pipline.py @@ -28,7 +28,6 @@ def norm_aishell_data(indir): - # /tmp/asr_9hmlx3zl outdir = indir + "_norm" for path in os.listdir(indir): fname = path.split(".")[0] @@ -52,10 +51,22 @@ def gen_sp_wav_and_get_path_one(wav_temppath, audio_id, sound, item, k): if sp_wav.frame_rate != 8000: sp_wav = sp_wav.set_frame_rate(8000) sp_wav.export(save_cut_path, format="wav") - print("==3", save_cut_path) return save_cut_path +def load_hyp_file(path): + merge_dict = {} + with codecs.open(path) as f: + for line in f: + l = line.split("(") + if len(l) == 2: + _text, _id = l + text = _text.strip().replace(" ", "") + path = _id.split("-")[0] + ".wav" + merge_dict[path] = text + return merge_dict + + def get_parser(): parser = argparse.ArgumentParser(description='语音识别主函数参数') parser.add_argument("-kh", "--kafka-host", @@ -101,7 +112,7 @@ def get_parser(): help="multiprocess number of workers") parser.add_argument("-cg", "--consumer-gap", - type=int, default=2, + type=int, default=10, help="kafka consumer msg num") parser.add_argument("-ptm", "--poll-timeout-ms", @@ -146,6 +157,7 @@ def __init__(self, kafka_servers, seg_consumer_topics, seg_consumer_groupid, compression_type="gzip", max_request_size=1024 * 1024 * 20) self.asr_producer_topics = asr_producer_topics # ASR生产者topic + self.redis_client = redis.Redis(host='192.168.192.202', port=40029, db=0, password="Q8TYmIwQSHNFbLJ2") self.is_comp = is_comp if is_comp: @@ -175,7 +187,6 @@ def asr_pipline_from_kafka(self, father_path): :param father_path: 切分来源音频存放父目录 :return: """ - # redis_client = redis.Redis(host='192.168.192.202', port=40029, db=0, password="Q8TYmIwQSHNFbLJ2") while True: if not self.from_client: self._get_from_client() @@ -185,9 +196,7 @@ def asr_pipline_from_kafka(self, father_path): msgs = [] for tp, _msgs in tp_msgs.items(): msgs.extend(_msgs) - print(len(msgs)) self.batch_asr_pipline(father_path, msgs) - break def batch_asr_pipline(self, father_path, msgs): """ @@ -207,41 +216,35 @@ def batch_asr_pipline(self, father_path, msgs): batch_voice_data = {} batch_merge_dict = None -# try: - redis_client = redis.Redis(host='192.168.192.202', port=40029, db=0, password="Q8TYmIwQSHNFbLJ2") - for msg in msgs: - if msg is not None: - audio_id = json.loads(msg.value).get('audio_id', '') - print("==2", audio_id) - if not redis_client.get('asr_espnet_' + str(audio_id)): - # step1: 从kafka消息中提取value信息 - voice_data = json.loads(msg.value) - voice_data['start_asr'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - batch_voice_data[audio_id] = 
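
load_hyp_file above recovers per-utterance transcripts from ESPnet's hyp.trn by splitting each line on the first "(" and mapping the utterance id back to a wav name. A regex-based equivalent, shown as a sketch (the example line format is inferred from the surrounding code, not verified against an actual decode):

```python
import codecs
import re

def parse_hyp_trn(path):
    """Parse trn lines of the form "<spaced tokens> (<uttid>)",
    e.g. "你 好 (abc123_0_1500_kf-xxx)" -> {"abc123_0_1500_kf.wav": "你好"}."""
    merge = {}
    with codecs.open(path, encoding="utf-8") as f:
        for line in f:
            m = re.match(r"^(.*)\((\S+)\)\s*$", line)
            if not m:
                continue
            text = m.group(1).strip().replace(" ", "")  # drop token spacing
            wav = m.group(2).split("-")[0] + ".wav"     # uttid prefix -> wav name
            merge[wav] = text
    return merge
```
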
voice_data # 后面继续往 voice_data 追加数据 - # step2:根据msg.value信息获取音频,并按提前提供的音频片段开始结束时间,生成切分后的wav - batch_wav_lst = self.gen_sp_wav_and_get_path_mp(father_path, wav_temppath, batch_voice_data) - # step3: 语音识别, 并获取merge的text - # batch results - print("==4", wav_temppath) - wav_normpath = norm_aishell_data(wav_temppath) - merge_dict = self._asr_cmd(wav_normpath) if batch_wav_lst else {} - #batch_merge_dict = self.split_merge_dict(merge_dict) -# except Exception as e: -# print("bebefore commit to kafka error log: %s, msg: %s" % (e, "")) -# finally: -# # step7: 删除临时音频的文件夹和语音识别结果的临时文件 -# #shutil.rmtree(wav_temppath) -# pass - return + try: + for msg in msgs: + if msg is not None: + audio_id = json.loads(msg.value).get('audio_id', '') + if not redis_client.get('asr_espnet_' + str(audio_id)): + # step1: 从kafka消息中提取value信息 + voice_data = json.loads(msg.value) + voice_data['start_asr_espnet'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + batch_voice_data[audio_id] = voice_data # 后面继续往 voice_data 追加数据 + # step2:根据msg.value信息获取音频,并按提前提供的音频片段开始结束时间,生成切分后的wav + batch_wav_lst = self.gen_sp_wav_and_get_path_mp(father_path, wav_temppath, batch_voice_data) + # step3: 语音识别, 并获取merge的text + # batch results + wav_normpath = norm_aishell_data(wav_temppath) + merge_dict = self._asr_cmd(wav_normpath) if batch_wav_lst else {} + batch_merge_dict = self.split_merge_dict(merge_dict) + except Exception as e: + print("bebefore commit to kafka error log: %s, msg: %s" % (e, "")) + finally: + # step7: 删除临时音频的文件夹和语音识别结果的临时文件 + shutil.rmtree(wav_temppath) + shutil.rmtree(wav_normpath) for audio_id, voice_data in batch_voice_data.items(): # step4: 写入kafka新的topic try: - voice_data["merge_dict"] = batch_merge_dict[audio_id] - voice_data['end_asr'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + voice_data["merge_dict_espnet"] = batch_merge_dict[audio_id] + voice_data['end_asr_espnet'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) flag = self._kafka_producers(voice_data) if voice_data.get("merge_dict", {}) else False - voice_data['step4_end_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - print("asr_output:", json.dumps(voice_data, ensure_ascii=True)) except Exception as e: print("before commit to kafka error log: %s, msg: %s" % (e, "")) try: @@ -249,7 +252,7 @@ def batch_asr_pipline(self, father_path, msgs): except CommitFailedError: print('commit error!') for audio_id, _ in batch_voice_data.items(): - redis_client.set('asr_espnet_' + str(audio_id), 1, ex=24 * 3600) + self.redis_client.set('asr_espnet_' + str(audio_id), 1, ex=24 * 3600) return flag @@ -300,57 +303,6 @@ def filter_conditions(self, voice_data): raise RuntimeError('This data segments is null !') self.comp_num += 1 - def _copy_wav(self, father_path, voice_data, temp_path): - """ - step2: 根据msg信息从切分好的数据源复制到目标位置 - :param father_path: 切分来源音频存放父目录 - :param voice_data: kafka来源信息中的包含切分音频的信息 - :param temp_path: 音频复制目标目录 - :return: wav_lst: - """ - source = voice_data.get("source", "") - date = voice_data.get("date", "") - audio_id = voice_data.get("audio_id", "") - - value_dict = voice_data.get("segments", {}) - wav_lst = [] # 音频list - - for chn, split_time in value_dict.items(): - for si in split_time: - split_path = "%s_%s_%s_%s.wav" % (audio_id, si.get("start"), si.get("end"), chn) - wav_path = os.path.join(father_path, source, date, audio_id, split_path) - shutil.copy(wav_path, temp_path) - wav_lst.append(wav_path) - return wav_lst - - def gen_sp_wav_and_get_path(self, father_path, wav_temppath, voice_data): - """ - step2: 
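
Two details worth noting in the hunk above. First, the Redis get/set pair makes decoding at-most-once per audio_id, with a 24-hour expiry so a stale key cannot block reprocessing forever; isolated as a sketch (connection details are placeholders, not the production values):

```python
import redis

r = redis.Redis(host="localhost", port=6379, db=0)  # placeholder connection

def already_decoded(audio_id):
    return r.get("asr_espnet_" + str(audio_id)) is not None

def mark_decoded(audio_id):
    # ex=24*3600 keeps the dedup key for one day only.
    r.set("asr_espnet_" + str(audio_id), 1, ex=24 * 3600)
```

Second, this revision still gates the producer on voice_data.get("merge_dict", {}) although the result was stored under "merge_dict_espnet"; [PATCH 18/23] later corrects the key.
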
根据msg信息获取音频,并按提前提供的音频片段开始结束时间,生成切分后的wav - :param father_path: 音频来源目录,结果存储 - :param wav_temppath: kafka来源信息中的包含切分音频的信息 - :return: wav_lst: - """ - wav_lst = [] # wav 音频存储 - source = voice_data["source"] - code = None if source != 'infobird' else 'pcm_s16le' - date = voice_data["date"] - audio_id = voice_data["audio_id"] - wav_father_path = os.path.join(father_path, source, date) # /data/mfs/k8s/speech_pipeline/raw/{source}/{date} - for k, v in voice_data['segments'].items(): - if k in ["gk", "kf"] and len(v) > 0: - wav_name = "%s_%s.wav" % (audio_id, k) - raw_wav_path = os.path.join(wav_father_path, wav_name) - sound = AudioSegment.from_file(raw_wav_path, codec=code, format="wav") - for item in v: - cut_wav_name = "%s_%s_%s_%s.wav" % (audio_id, item['start'], item['end'], k) - save_cut_path = os.path.join(wav_temppath, cut_wav_name) - sp_wav = sound[int(item['start']):int(item['end'])] - if sp_wav.frame_rate != 8000: - sp_wav = sp_wav.set_frame_rate(8000) - sp_wav.export(save_cut_path, format="wav") - wav_lst.append(save_cut_path) - return wav_lst - def gen_sp_wav_and_get_path_mp(self, father_path, wav_temppath, batch_voice_data): """ multiprocess generator @@ -388,10 +340,24 @@ def _asr_cmd(self, wav_path): :param wav_path: 需识别音频存放目录 :return: """ - decode_cmd = "./infer.sh {}".format(wav_path) - print(decode_cmd) + flag = wav_path.split("/")[-1].replace("asr", "").replace("norm", "") + decode_cmd = "./infer.sh {} {} {}".format(wav_path, flag, self.num_job) os.system(decode_cmd) - ## return merge_dict + decode_path = "exp/train_sp_pytorch_train/decode_infer_%s_decode_lm/hyp.trn" %flag + merge_dict = load_hyp_file(decode_path) + + ## 删除临时目录 + data_dir = "data/infer_%s" %flag + fbank_dir = "fbank_%s" %flag + dump_dir = "dump/infer_%s" %flag + decode_dir = "exp/train_sp_pytorch_train/decode_infer_%s_decode_lm" %flag + dump_feats_dir = "exp/dump_feats/recog/infer_%s" %flag + make_fbank_dir = "exp/make_fbank/infer_%s" %flag + + for _dir in [data_dir, fbank_dir, dump_dir, dump_feats_dir, make_fbank_dir, decode_dir]: + os.system("rm -rf %s" %_dir) + + return merge_dict def _kafka_producers(self, voice_data): """ diff --git a/egs/aishell/asr1/asr_pipline.sh b/egs/aishell/asr1/asr_pipline.sh old mode 100644 new mode 100755 index 52af73ac321..157d38245ba --- a/egs/aishell/asr1/asr_pipline.sh +++ b/egs/aishell/asr1/asr_pipline.sh @@ -13,12 +13,12 @@ python3 asr_pipline.py \ --kafka-host daasoffline1.kafka.dc.puhuifinance.com:6667,daasoffline2.kafka.dc.puhuifinance.com:6667,daasoffline3.kafka.dc.puhuifinance.com:6667,daasoffline4.kafka.dc.puhuifinance.com:6667,daasoffline5.kafka.dc.puhuifinance.com:6667 \ --seg-consumer-topics sp_sad_topic \ --seg-consumer-groupid sp_sad_asr_group_np_20191028_v3 \ - --session-timeout-ms 60000 \ + --session-timeout-ms 30000 \ --seg-auto-offset-reset smallest \ --asr-producer-topics asr_topic1_t1 \ --father-path /data/mfs/k8s/speech_pipeline/raw \ --hkust-path /home/app/hkust/kaldi/egs/hkust/s5_daihou \ - --num-job 40 + --num-job 10 # --is-comp 1 diff --git a/egs/aishell/asr1/infer.sh b/egs/aishell/asr1/infer.sh index 974eb830d31..3274b4b92df 100755 --- a/egs/aishell/asr1/infer.sh +++ b/egs/aishell/asr1/infer.sh @@ -6,12 +6,13 @@ . ./path.sh || exit 1; . 
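
_asr_cmd above tears down the six flag-suffixed working directories by shelling out to rm -rf. The same cleanup as a pure-Python sketch, where shutil.rmtree with ignore_errors avoids failing on a directory that was never created:

```python
import shutil

def cleanup_run(flag):
    """Remove everything infer.sh created for one flagged run."""
    for template in ("data/infer_%s",
                     "fbank_%s",
                     "dump/infer_%s",
                     "exp/dump_feats/recog/infer_%s",
                     "exp/make_fbank/infer_%s",
                     "exp/train_sp_pytorch_train/decode_infer_%s_decode_lm"):
        shutil.rmtree(template % flag, ignore_errors=True)
```
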
./cmd.sh || exit 1; -if [ $# != 2 ]; then - echo "Usage: $0 " +if [ $# != 3 ]; then + echo "Usage: $0 " exit 1; else data=$1 # 输入文件夹。 flag=$2 # 唯一标识,防止文件覆盖。 + nj_decoder=$3 # decode workers fi @@ -223,7 +224,7 @@ expdir=exp/${expname} if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then echo "stage 5: Decoding" - nj=40 + nj=${nj_decoder} if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]]; then recog_model=model.last${n_average}.avg.best # average_checkpoints.py --backend ${backend} \ From e7fe3a168a002c082cc754f032c0eb76ba63c12c Mon Sep 17 00:00:00 2001 From: nipeng Date: Tue, 29 Oct 2019 16:20:56 +0800 Subject: [PATCH 18/23] =?UTF-8?q?add:=20=E8=BF=87=E6=BB=A4=E5=85=B3?= =?UTF-8?q?=E9=94=AE=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- egs/aishell/asr1/asr_pipline.py | 81 ++++++++++++++++++++++----------- egs/aishell/asr1/asr_pipline.sh | 2 +- 2 files changed, 55 insertions(+), 28 deletions(-) diff --git a/egs/aishell/asr1/asr_pipline.py b/egs/aishell/asr1/asr_pipline.py index b88f66eaf64..69abba444d5 100644 --- a/egs/aishell/asr1/asr_pipline.py +++ b/egs/aishell/asr1/asr_pipline.py @@ -7,6 +7,7 @@ sys.setdefaultencoding("utf-8") import os +import re import time import json import redis @@ -27,8 +28,8 @@ from elasticsearch import Elasticsearch, helpers -def norm_aishell_data(indir): - outdir = indir + "_norm" +def norm_aishell_data(indir, outdir): + os.system("mkdir -p %s" %outdir) for path in os.listdir(indir): fname = path.split(".")[0] os.system("mkdir -p %s/data_aishell/wav/infer/%s" %(outdir, fname)) @@ -41,7 +42,6 @@ def norm_aishell_data(indir): for path in os.listdir(indir): fname = path.split(".")[0] fout.write("%s %s\n" %(fname, "哈哈")) - return outdir def gen_sp_wav_and_get_path_one(wav_temppath, audio_id, sound, item, k): @@ -104,7 +104,7 @@ def get_parser(): help="hkust的绝对目录,即kaldi的hkust目录") parser.add_argument("-nj", "--num-job", - type=int, default=10, + type=int, default=2, help="hkust num job default 10") parser.add_argument("-nw", "--num-workers", @@ -124,6 +124,22 @@ def get_parser(): return args +class WarnKeyword(object): + def __init__(self): + kws = ["狗杂种", "操你妈", "傻逼", "他妈的","你妈逼", + "狗日的","王八蛋", "妈了个逼","婊子", "去你妈", + "我操", "我草","贱人", "被车撞死", "搞死", + "密码给我", "老赖","曝通讯录", "所有联系人", "不要脸", + "去死","要不要脸", "打爆你", + ] + self.p_kw = re.compile("|".join(kws)) + + def process(self, text): + l = self.p_kw.search(text) + rst = True if l else False + return rst + + class ASR(object): def __init__(self, kafka_servers, seg_consumer_topics, seg_consumer_groupid, session_timeout_ms=60000, seg_auto_offset_reset="largest", @@ -158,6 +174,7 @@ def __init__(self, kafka_servers, seg_consumer_topics, seg_consumer_groupid, max_request_size=1024 * 1024 * 20) self.asr_producer_topics = asr_producer_topics # ASR生产者topic self.redis_client = redis.Redis(host='192.168.192.202', port=40029, db=0, password="Q8TYmIwQSHNFbLJ2") + self.kw_client = WarnKeyword() self.is_comp = is_comp if is_comp: @@ -181,6 +198,17 @@ def _get_from_client(self): auto_offset_reset=self.seg_auto_offset_reset) # 消费重置偏移量 self.from_client.subscribe(self.seg_consumer_topics) # 切分的消费者topic + def filter_msg_keyword(self, msgs): + rst = [] + for msg in msgs: + data = json.loads(msg.value) + merge_dict = msg.get("merge_dict") + if merge_dict: + text = ";".join(merge_dict.values()) + if self.kw_client.process(text): + rst.append(msg) + return rst + def asr_pipline_from_kafka(self, father_path): """ 获取kafka的数据流,并进行识别,合并,标点,存入es @@ -196,22 +224,19 @@ 
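
A short usage sketch for the WarnKeyword filter introduced above; the sample sentences are invented, and the import assumes this revision's module name (the file is renamed to asr_pipeline.py in [PATCH 21/23]):

```python
from asr_pipline import WarnKeyword

kw = WarnKeyword()
print(kw.process("您好,请问今天方便还款吗"))  # False: no alarm keyword
print(kw.process("再拖下去就曝通讯录"))        # True: hits "曝通讯录"
```
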
def asr_pipline_from_kafka(self, father_path): msgs = [] for tp, _msgs in tp_msgs.items(): msgs.extend(_msgs) - self.batch_asr_pipline(father_path, msgs) + + ## 从kaldi识别结果中,过滤出包含告警关键词的数据 + msgs = self.filter_msg_keyword(msgs) + + if msgs: + self.batch_asr_pipline(father_path, msgs) + def batch_asr_pipline(self, father_path, msgs): - """ - 单个kafka消息消费: - 1)从kafka消息中提取value信息; - 2)根据msg.value信息从切分好的数据源复制到目标位置; - 3)进行语音识别,放进merge_dict里; - 4)写入kafka新的topic; - 7)删除临时音频的文件夹和语音识别结果的临时文件. - :param father_path: 切分来源音频存放父目录; - :param msg: 从kafka获取的完整消息; - :return: - """ flag = False # flag 是否语音识别成功并存入kafka wav_temppath = tempfile.mkdtemp(prefix="asr_") + wav_normpath = wav_temppath + "_norm" + batch_wav_lst = [] batch_voice_data = {} batch_merge_dict = None @@ -220,33 +245,35 @@ def batch_asr_pipline(self, father_path, msgs): for msg in msgs: if msg is not None: audio_id = json.loads(msg.value).get('audio_id', '') - if not redis_client.get('asr_espnet_' + str(audio_id)): - # step1: 从kafka消息中提取value信息 + if not self.redis_client.get('asr_espnet_' + str(audio_id)): voice_data = json.loads(msg.value) voice_data['start_asr_espnet'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) batch_voice_data[audio_id] = voice_data # 后面继续往 voice_data 追加数据 - # step2:根据msg.value信息获取音频,并按提前提供的音频片段开始结束时间,生成切分后的wav + + # 根据msg.value信息获取音频,并按提前提供的音频片段开始结束时间,生成切分后的wav batch_wav_lst = self.gen_sp_wav_and_get_path_mp(father_path, wav_temppath, batch_voice_data) - # step3: 语音识别, 并获取merge的text - # batch results - wav_normpath = norm_aishell_data(wav_temppath) + # 语音识别, 并获取merge的text + # 整理成espnet需要的数据格式 + norm_aishell_data(wav_temppath, wav_normpath) + # espnet decode merge_dict = self._asr_cmd(wav_normpath) if batch_wav_lst else {} + # merge batch batch_merge_dict = self.split_merge_dict(merge_dict) except Exception as e: - print("bebefore commit to kafka error log: %s, msg: %s" % (e, "")) + print("asr cmd error log: %s, msg: %s" % (e, "")) finally: - # step7: 删除临时音频的文件夹和语音识别结果的临时文件 + # 删除临时音频的文件夹和语音识别结果的临时文件 shutil.rmtree(wav_temppath) shutil.rmtree(wav_normpath) for audio_id, voice_data in batch_voice_data.items(): - # step4: 写入kafka新的topic + # 写入kafka新的topic try: voice_data["merge_dict_espnet"] = batch_merge_dict[audio_id] voice_data['end_asr_espnet'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - flag = self._kafka_producers(voice_data) if voice_data.get("merge_dict", {}) else False + flag = self._kafka_producers(voice_data) if voice_data.get("merge_dict_espnet", {}) else False except Exception as e: - print("before commit to kafka error log: %s, msg: %s" % (e, "")) + print("kafka producer error log: %s, msg: %s" % (e, "")) try: self.from_client.commit() except CommitFailedError: diff --git a/egs/aishell/asr1/asr_pipline.sh b/egs/aishell/asr1/asr_pipline.sh index 157d38245ba..814763c2911 100755 --- a/egs/aishell/asr1/asr_pipline.sh +++ b/egs/aishell/asr1/asr_pipline.sh @@ -11,7 +11,7 @@ # 测试需要在代码内修改生产则kafka host python3 asr_pipline.py \ --kafka-host daasoffline1.kafka.dc.puhuifinance.com:6667,daasoffline2.kafka.dc.puhuifinance.com:6667,daasoffline3.kafka.dc.puhuifinance.com:6667,daasoffline4.kafka.dc.puhuifinance.com:6667,daasoffline5.kafka.dc.puhuifinance.com:6667 \ - --seg-consumer-topics sp_sad_topic \ + --seg-consumer-topics sp_asr_topic \ --seg-consumer-groupid sp_sad_asr_group_np_20191028_v3 \ --session-timeout-ms 30000 \ --seg-auto-offset-reset smallest \ From 1bc353d92beae8fdaf3c9920381bf56c8cf80381 Mon Sep 17 00:00:00 2001 From: nipeng Date: Tue, 29 Oct 2019 17:30:22 +0800 Subject: [PATCH 19/23] 
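
batch_asr_pipline now pairs the mkdtemp scratch directory with a sibling *_norm directory (the aishell-style layout ESPnet expects) and removes both even when decoding raises. The lifecycle, reduced to a sketch; ignore_errors=True would also cover the case where the _norm directory was never created, which is likely why [PATCH 21/23] later swaps these calls for rm -rf:

```python
import shutil
import tempfile

wav_temppath = tempfile.mkdtemp(prefix="asr_")   # cut wav segments land here
wav_normpath = wav_temppath + "_norm"            # aishell layout for espnet
try:
    pass  # cut wavs, build the aishell layout, run ./infer.sh ...
finally:
    shutil.rmtree(wav_temppath, ignore_errors=True)
    shutil.rmtree(wav_normpath, ignore_errors=True)
```
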
add: pip install --- tools/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/Makefile b/tools/Makefile index 43e29c3b995..33c991e07aa 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -59,6 +59,7 @@ espnet.done: venv . venv/bin/activate; pip install -e .. . venv/bin/activate; pip config set global.index-url https://pypi.mirrors.ustc.edu.cn/simple/ . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ torch==$(TH_VERSION) + . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ redis kafka elasticsearch touch espnet.done else miniconda.sh: @@ -87,6 +88,7 @@ espnet.done: venv . venv/bin/activate && conda config --set show_channel_urls yes . venv/bin/activate && conda install -y $(CONDA_PYTORCH) pytorch . venv/bin/activate && pip install -e .. + . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ redis kafka elasticsearch touch espnet.done endif From b836d488336a5cf47fd819f8ccfe8849d707a9b3 Mon Sep 17 00:00:00 2001 From: nipeng Date: Tue, 29 Oct 2019 17:47:26 +0800 Subject: [PATCH 20/23] add: pip index --- tools/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index 33c991e07aa..e1e8cda612b 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -128,7 +128,7 @@ warp-transducer.done: espnet.done chainer_ctc.done: espnet.done rm -rf chainer_ctc git clone https://github.com/jheymann85/chainer_ctc.git - . venv/bin/activate; pip install cython + . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ cython . venv/bin/activate; cd chainer_ctc && chmod +x install_warp-ctc.sh && ./install_warp-ctc.sh ; true . venv/bin/activate; cd chainer_ctc && pip install . touch chainer_ctc.done @@ -147,7 +147,7 @@ mecab.done: espnet.done cd mecab/mecab-ipadic && ./configure --with-charset=utf8 --with-mecab-config=$(PWD)/mecab/bin/mecab-config --prefix=$(PWD)/mecab && $(MAKE) && $(MAKE) install cd mecab && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git cd mecab/mecab-ipadic-neologd && export PATH=$(PWD)/mecab/bin:$(PATH) && ./bin/install-mecab-ipadic-neologd -n -y -p $(PWD)/mecab/mecab-ipadic-neologd - . venv/bin/activate; pip install mojimoji pykakasi + . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ mojimoji pykakasi . venv/bin/activate; if [ `python --version | cut -c 8` -eq 3 ]; then \ if [ ! 
-e swig.done ]; then \ rm -rf swig; \ From 116b79585c0b76ff0062d181462ace1072295736 Mon Sep 17 00:00:00 2001 From: nipeng Date: Wed, 30 Oct 2019 10:14:48 +0800 Subject: [PATCH 21/23] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=98=AF=E5=90=A6?= =?UTF-8?q?=E5=91=BD=E4=B8=AD=E5=91=8A=E8=AD=A6=E5=85=B3=E9=94=AE=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker/infer.sh | 12 +- .../asr1/{asr_pipline.py => asr_pipeline.py} | 133 ++++++------------ .../asr1/{asr_pipline.sh => asr_pipeline.sh} | 5 +- egs/aishell/asr1/test.py | 11 +- tools/Makefile | 2 +- 5 files changed, 58 insertions(+), 105 deletions(-) rename egs/aishell/asr1/{asr_pipline.py => asr_pipeline.py} (76%) rename egs/aishell/asr1/{asr_pipline.sh => asr_pipeline.sh} (96%) mode change 100755 => 100644 diff --git a/docker/infer.sh b/docker/infer.sh index e7b6ec914a1..1e8de03482d 100755 --- a/docker/infer.sh +++ b/docker/infer.sh @@ -24,11 +24,11 @@ do eval ${ext}=$2 frombreak=false break 2 - fi - done + fi + done done if ${frombreak} ; then - echo "bad option $1" + echo "bad option $1" exit 1 fi ;; @@ -47,7 +47,7 @@ fi from_tag="cpu" if [ ! "${docker_gpu}" == "-1" ]; then if [ -z "${docker_cuda}" ]; then - # If the docker_cuda is not set, the program will automatically + # If the docker_cuda is not set, the program will automatically # search the installed version with default configurations (apt) docker_cuda=$( nvcc -V | grep release ) docker_cuda=${docker_cuda#*"release "} @@ -75,7 +75,7 @@ fi if [ ${docker_user} = true ]; then # Build a container with the user account container_tag="${from_tag}-user-${HOME##*/}" - docker_image=$( docker images -q espnet/espnet:${container_tag} ) + docker_image=$( docker images -q espnet/espnet:${container_tag} ) if ! [[ -n ${docker_image} ]]; then echo "Building docker image..." build_args="--build-arg FROM_TAG=${from_tag}" @@ -130,7 +130,7 @@ if [ ! 
-z "${docker_env}" ]; then docker_env=$(echo ${docker_env} | tr "," "\n") for i in ${docker_env[@]} do - this_env="-e $i ${this_env}" + this_env="-e $i ${this_env}" done fi diff --git a/egs/aishell/asr1/asr_pipline.py b/egs/aishell/asr1/asr_pipeline.py similarity index 76% rename from egs/aishell/asr1/asr_pipline.py rename to egs/aishell/asr1/asr_pipeline.py index 69abba444d5..408c8043b48 100644 --- a/egs/aishell/asr1/asr_pipline.py +++ b/egs/aishell/asr1/asr_pipeline.py @@ -11,7 +11,6 @@ import time import json import redis -import jieba import codecs import shutil import hashlib @@ -25,7 +24,6 @@ from pydub import AudioSegment from kafka import KafkaConsumer, KafkaProducer from kafka.errors import CommitFailedError -from elasticsearch import Elasticsearch, helpers def norm_aishell_data(indir, outdir): @@ -104,8 +102,8 @@ def get_parser(): help="hkust的绝对目录,即kaldi的hkust目录") parser.add_argument("-nj", "--num-job", - type=int, default=2, - help="hkust num job default 10") + type=int, default=10, + help="espnet decode num job default 10") parser.add_argument("-nw", "--num-workers", type=int, default=18, @@ -119,12 +117,11 @@ def get_parser(): type=int, default=60000, help="") - parser.add_argument('--is-comp', help='1为补数,0为不进入补数筛选逻辑', default=0, type=int) args = parser.parse_args() return args -class WarnKeyword(object): +class DetectAlarmKeyword(object): def __init__(self): kws = ["狗杂种", "操你妈", "傻逼", "他妈的","你妈逼", "狗日的","王八蛋", "妈了个逼","婊子", "去你妈", @@ -135,15 +132,14 @@ def __init__(self): self.p_kw = re.compile("|".join(kws)) def process(self, text): - l = self.p_kw.search(text) - rst = True if l else False + rst = self.p_kw.findall(text) return rst class ASR(object): def __init__(self, kafka_servers, seg_consumer_topics, seg_consumer_groupid, session_timeout_ms=60000, seg_auto_offset_reset="largest", - asr_producer_topics="asr_topic1", num_job=10, is_comp=0, + asr_producer_topics="asr_topic1", num_job=10, poll_timeout_ms=60000, consumer_gap=None, num_workers=cpu_count()): """ :param kafka_servers: kafka host:port @@ -154,7 +150,6 @@ def __init__(self, kafka_servers, seg_consumer_topics, seg_consumer_groupid, 源码定义: {'smallest': 'earliest', 'largest': 'latest'} :param asr_producer_topics: 语音是被的生产者topic,默认值:asr_topic1 :param num_job: 语音识别的线程数 - :param is_comp: 是否为补数逻辑,是的话会进入时间判断和es数据判断 """ self.kafka_servers = kafka_servers self.seg_consumer_groupid = seg_consumer_groupid @@ -174,20 +169,7 @@ def __init__(self, kafka_servers, seg_consumer_topics, seg_consumer_groupid, max_request_size=1024 * 1024 * 20) self.asr_producer_topics = asr_producer_topics # ASR生产者topic self.redis_client = redis.Redis(host='192.168.192.202', port=40029, db=0, password="Q8TYmIwQSHNFbLJ2") - self.kw_client = WarnKeyword() - - self.is_comp = is_comp - if is_comp: - self.comp_num = 0 - self.prop = {"HOST1": "192.168.40.37", - "HOST2": "192.168.40.38", - "HOST3": "192.168.40.39", - "PORT": "9200", - "DOC_TYPE": "kf_infobird_call"} - - self.es = Elasticsearch(hosts=[{'host': self.prop['HOST1'], 'port': self.prop['PORT']}, - {'host': self.prop['HOST2'], 'port': self.prop['PORT']}, - {'host': self.prop['HOST3'], 'port': self.prop['PORT']}]) + self.kw_client = DetectAlarmKeyword() def _get_from_client(self): # 消费者切分好的音频kafka消费者 @@ -198,15 +180,13 @@ def _get_from_client(self): auto_offset_reset=self.seg_auto_offset_reset) # 消费重置偏移量 self.from_client.subscribe(self.seg_consumer_topics) # 切分的消费者topic - def filter_msg_keyword(self, msgs): - rst = [] - for msg in msgs: - data = json.loads(msg.value) - merge_dict = 
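
Note the behavioural change hidden in the rename above: WarnKeyword.process returned a boolean from re.search, while DetectAlarmKeyword.process returns re.findall's list of matched keywords, which is what lets the producer attach the concrete hits downstream. A two-line demonstration (sample text invented):

```python
import re

p = re.compile("|".join(["打爆你", "曝通讯录"]))
print(bool(p.search("再催就打爆你,然后曝通讯录")))  # True (old behaviour)
print(p.findall("再催就打爆你,然后曝通讯录"))        # ['打爆你', '曝通讯录'] (new)
```
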
msg.get("merge_dict") - if merge_dict: - text = ";".join(merge_dict.values()) - if self.kw_client.process(text): - rst.append(msg) + def check_alarm_keyword(self, merge_dict): + rst = None + if merge_dict: + text = ";".join(merge_dict.values()) + kw_rsp = self.kw_client.process(text) + if kw_rsp: + rst = kw_rsp return rst def asr_pipline_from_kafka(self, father_path): @@ -225,11 +205,7 @@ def asr_pipline_from_kafka(self, father_path): for tp, _msgs in tp_msgs.items(): msgs.extend(_msgs) - ## 从kaldi识别结果中,过滤出包含告警关键词的数据 - msgs = self.filter_msg_keyword(msgs) - - if msgs: - self.batch_asr_pipline(father_path, msgs) + self.batch_asr_pipline(father_path, msgs) def batch_asr_pipline(self, father_path, msgs): @@ -238,7 +214,10 @@ def batch_asr_pipline(self, father_path, msgs): wav_normpath = wav_temppath + "_norm" batch_wav_lst = [] + # 所有数据 batch_voice_data = {} + # 包含报警关键词的数据 + batch_voice_data_imp = {} batch_merge_dict = None try: @@ -247,31 +226,34 @@ def batch_asr_pipline(self, father_path, msgs): audio_id = json.loads(msg.value).get('audio_id', '') if not self.redis_client.get('asr_espnet_' + str(audio_id)): voice_data = json.loads(msg.value) - voice_data['start_asr_espnet'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - batch_voice_data[audio_id] = voice_data # 后面继续往 voice_data 追加数据 - - # 根据msg.value信息获取音频,并按提前提供的音频片段开始结束时间,生成切分后的wav - batch_wav_lst = self.gen_sp_wav_and_get_path_mp(father_path, wav_temppath, batch_voice_data) - # 语音识别, 并获取merge的text - # 整理成espnet需要的数据格式 - norm_aishell_data(wav_temppath, wav_normpath) - # espnet decode - merge_dict = self._asr_cmd(wav_normpath) if batch_wav_lst else {} - # merge batch - batch_merge_dict = self.split_merge_dict(merge_dict) + batch_voice_data[audio_id] = voice_data + if self.check_alarm_keyword(voice_data.get("merge_dict")): + voice_data['start_asr_espnet'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + batch_voice_data_imp[audio_id] = voice_data + + if batch_voice_data_imp: + batch_wav_lst = self.gen_sp_wav_and_get_path_mp(father_path, wav_temppath, batch_voice_data_imp) + norm_aishell_data(wav_temppath, wav_normpath) + merge_dict = self._asr_cmd(wav_normpath) if batch_wav_lst else {} + batch_merge_dict = self.split_merge_dict(merge_dict) except Exception as e: - print("asr cmd error log: %s, msg: %s" % (e, "")) + print("asr cmd error log: %s, msg: %s" % (e, "")) finally: # 删除临时音频的文件夹和语音识别结果的临时文件 - shutil.rmtree(wav_temppath) - shutil.rmtree(wav_normpath) + os.system("rm -rf %s" %wav_temppath) + os.system("rm -rf %s" %wav_normpath) for audio_id, voice_data in batch_voice_data.items(): - # 写入kafka新的topic try: - voice_data["merge_dict_espnet"] = batch_merge_dict[audio_id] - voice_data['end_asr_espnet'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - flag = self._kafka_producers(voice_data) if voice_data.get("merge_dict_espnet", {}) else False + if batch_merge_dict and audio_id in batch_merge_dict: + voice_data["merge_dict_espnet"] = batch_merge_dict[audio_id] + voice_data['end_asr_espnet'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + _alarm_rsp = self.check_alarm_keyword(batch_merge_dict[audio_id]) + if _alarm_rsp: + voice_data["has_alarm_keyword"] = True + voice_data["alarm_keywords"] = json.dumps(_alarm_rsp) + voice_data["asr_model"] = "espnet_20191030" + flag = self._kafka_producers(voice_data) except Exception as e: print("kafka producer error log: %s, msg: %s" % (e, "")) try: @@ -296,40 +278,6 @@ def split_merge_dict(self, merge_dict): split_dict[audio_id][wav_file] = text return split_dict - def 
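
Messages that still hit an alarm keyword after ESPnet decoding are re-published with the extra has_alarm_keyword / alarm_keywords / asr_model fields. A minimal kafka-python producer matching the settings from ASR.__init__ earlier in this patch (gzip compression, 20 MB max request); the broker address, topic, payload, and the JSON serializer are assumptions for illustration:

```python
import json

from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers="localhost:9092",      # placeholder broker
    compression_type="gzip",
    max_request_size=1024 * 1024 * 20,
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode("utf-8"),
)
producer.send("asr_topic1_t1", {"audio_id": "demo", "has_alarm_keyword": True})
producer.flush()
```
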
filter_conditions(self, voice_data): - """ - 筛选出不需要补数的数据,并抛出异常; - :param voice_data: - :return: - """ - end_sad = voice_data.get("end_sad", "0000-00-00 00:00:00") - audio_id = voice_data.get("audio_id", None) - body = { - "query": {"bool": {"must": [{"term": {"audio_id.keyword": audio_id}}], "must_not": [], "should": []}}, - "from": 0, "size": 10, "sort": [], "aggs": {}} - if end_sad < "2019-07-11 12:00:00" or end_sad > "2019-07-11 21:00:00": - raise RuntimeError('This one\'s end_sad not in time window !') - query = self.es.search(index="asr_text", - doc_type=self.prop['DOC_TYPE'], - scroll='5m', - timeout='30s', - body=body) - results = query['hits']['hits'] # es查询出的结果第一页 - total = query['hits']['total'] # es查询出的结果总量 - print("audio_id:", audio_id, "search num:", total, ",end_sad", end_sad) - if total > 0: - line = results[0]["_source"] - if line.get("cut_text"): - raise RuntimeError('This id already exists and already asr !') - else: - segments = voice_data.get("segments", {}) - for k, v in segments.items(): - if k in ["gk", "kf"] and len(v) > 0: - break - else: - raise RuntimeError('This data segments is null !') - self.comp_num += 1 - def gen_sp_wav_and_get_path_mp(self, father_path, wav_temppath, batch_voice_data): """ multiprocess generator @@ -424,7 +372,6 @@ def _create_id_by_input(self, id=""): seg_consumer_groupid, asr_producer_topics = args.seg_consumer_groupid, args.asr_producer_topics # 语音识别生产者参数 father_path, hkust_path = args.father_path, args.hkust_path # 音频与模型相关参数 num_job = args.num_job # 线程数 - is_comp = args.is_comp # 是否走补数过滤流程 poll_timeout_ms = args.poll_timeout_ms consumer_gap = args.consumer_gap num_workers = args.num_workers @@ -440,7 +387,7 @@ def _create_id_by_input(self, id=""): seg_consumer_topics = [seg_consumer_topics, ] asr = ASR(kafka_servers=kafka_host, seg_consumer_topics=seg_consumer_topics, session_timeout_ms=session_timeout_ms, seg_consumer_groupid=seg_consumer_groupid, seg_auto_offset_reset=seg_auto_offset_reset, - asr_producer_topics=asr_producer_topics, num_job=num_job, is_comp=is_comp, + asr_producer_topics=asr_producer_topics, num_job=num_job, poll_timeout_ms=poll_timeout_ms, consumer_gap=consumer_gap, num_workers=num_workers) asr.asr_pipline_from_kafka(father_path) diff --git a/egs/aishell/asr1/asr_pipline.sh b/egs/aishell/asr1/asr_pipeline.sh old mode 100755 new mode 100644 similarity index 96% rename from egs/aishell/asr1/asr_pipline.sh rename to egs/aishell/asr1/asr_pipeline.sh index 814763c2911..56b85396a9e --- a/egs/aishell/asr1/asr_pipline.sh +++ b/egs/aishell/asr1/asr_pipeline.sh @@ -1,4 +1,4 @@ -#!/usr/bin/bash +#!/bin/bash #vim /etc/hosts ## Entries added by HostAliases. 
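
For completeness, the consume side (asr_pipline_from_kafka) reduces to a batch poll followed by a manual commit, so a crash before commit replays the whole batch rather than losing it. A kafka-python sketch with placeholder connection details; disabling auto-commit is an assumption consistent with the explicit self.from_client.commit() in the pipeline:

```python
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    bootstrap_servers="localhost:9092",   # placeholder broker
    group_id="sp_sad_asr_group_np_20191028_v3",
    session_timeout_ms=30000,
    auto_offset_reset="earliest",         # kafka-python's name for "smallest"
    enable_auto_commit=False,
)
consumer.subscribe(["sp_asr_topic"])
while True:
    tp_msgs = consumer.poll(timeout_ms=60000, max_records=10)
    msgs = [m for batch in tp_msgs.values() for m in batch]
    if msgs:
        # ... run batch_asr_pipline(father_path, msgs) here ...
        consumer.commit()
```
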
#192.168.176.181 daasoffline1.kafka.dc.puhuifinance.com @@ -9,7 +9,7 @@ # test # 测试需要在代码内修改生产则kafka host -python3 asr_pipline.py \ +/espnet/tools/venv/bin/python asr_pipeline.py \ --kafka-host daasoffline1.kafka.dc.puhuifinance.com:6667,daasoffline2.kafka.dc.puhuifinance.com:6667,daasoffline3.kafka.dc.puhuifinance.com:6667,daasoffline4.kafka.dc.puhuifinance.com:6667,daasoffline5.kafka.dc.puhuifinance.com:6667 \ --seg-consumer-topics sp_asr_topic \ --seg-consumer-groupid sp_sad_asr_group_np_20191028_v3 \ @@ -19,7 +19,6 @@ python3 asr_pipline.py \ --father-path /data/mfs/k8s/speech_pipeline/raw \ --hkust-path /home/app/hkust/kaldi/egs/hkust/s5_daihou \ --num-job 10 -# --is-comp 1 ## online diff --git a/egs/aishell/asr1/test.py b/egs/aishell/asr1/test.py index bfb1cc33981..336c1c7b4af 100644 --- a/egs/aishell/asr1/test.py +++ b/egs/aishell/asr1/test.py @@ -4,7 +4,6 @@ import codecs - def get_infer_result(): path = "/data/nipeng/TTS/espnet/egs/aishell/asr1/exp/train_sp_pytorch_train/decode_infer_decode_lm/hyp.trn" fout = codecs.open("infer.txt", "w") @@ -16,7 +15,15 @@ def get_infer_result(): aname = aid + ".wav" fout.write("%s\t%s\n" %(aname, text)) +def test2(): + from asr_pipeline import DetectAlarmKeyword + kw_client = DetectAlarmKeyword() + text = "你的是傻逼滚按时发大水发老赖法萨芬" + rst = kw_client.process(text) + print(rst) + if __name__ == "__main__": print("ok") - get_infer_result() + #get_infer_result() + test2() diff --git a/tools/Makefile b/tools/Makefile index e1e8cda612b..a58e6dcaf79 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -59,7 +59,7 @@ espnet.done: venv . venv/bin/activate; pip install -e .. . venv/bin/activate; pip config set global.index-url https://pypi.mirrors.ustc.edu.cn/simple/ . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ torch==$(TH_VERSION) - . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ redis kafka elasticsearch + . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ redis kafka-python elasticsearch pydub touch espnet.done else miniconda.sh: From 0b6cb891967f16cebe1cd39625cd7917023f3a65 Mon Sep 17 00:00:00 2001 From: nipeng Date: Wed, 30 Oct 2019 11:33:26 +0800 Subject: [PATCH 22/23] add: pydub --- tools/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index a58e6dcaf79..48b31117dba 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -59,7 +59,7 @@ espnet.done: venv . venv/bin/activate; pip install -e .. . venv/bin/activate; pip config set global.index-url https://pypi.mirrors.ustc.edu.cn/simple/ . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ torch==$(TH_VERSION) - . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ redis kafka-python elasticsearch pydub + . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ redis kafka-python pydub touch espnet.done else miniconda.sh: @@ -88,7 +88,7 @@ espnet.done: venv . venv/bin/activate && conda config --set show_channel_urls yes . venv/bin/activate && conda install -y $(CONDA_PYTORCH) pytorch . venv/bin/activate && pip install -e .. - . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ redis kafka elasticsearch + . 
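
test2() above exercises the keyword detector end to end. Of the runtime dependencies the Makefile now installs, pydub is the one used earlier in gen_sp_wav_and_get_path_one to cut and resample segments; its core operation, as a sketch with an invented input file:

```python
from pydub import AudioSegment

sound = AudioSegment.from_file("demo.wav", format="wav")  # invented input
clip = sound[1000:2500]           # pydub slices in milliseconds: 1.0s - 2.5s
if clip.frame_rate != 8000:
    clip = clip.set_frame_rate(8000)  # match the --sample-frequency=8000 configs
clip.export("demo_1000_2500_kf.wav", format="wav")
```
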
venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ redis kafka pydub touch espnet.done endif From 0f8a676ac5762ac9f0a41da53b72b57ed124fb7e Mon Sep 17 00:00:00 2001 From: nipeng Date: Wed, 30 Oct 2019 12:18:58 +0800 Subject: [PATCH 23/23] fix: kafka-python --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index 48b31117dba..8bc37784713 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -88,7 +88,7 @@ espnet.done: venv . venv/bin/activate && conda config --set show_channel_urls yes . venv/bin/activate && conda install -y $(CONDA_PYTORCH) pytorch . venv/bin/activate && pip install -e .. - . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ redis kafka pydub + . venv/bin/activate; pip install --index https://pypi.mirrors.ustc.edu.cn/simple/ redis kafka-python pydub touch espnet.done endif
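
[PATCH 23/23] matters more than it looks: on PyPI, "kafka" is an old, unmaintained release of the same client, and both packages install the top-level module kafka, so pinning the wrong name only surfaces at runtime as missing APIs. A quick sanity check that the deps from tools/Makefile import cleanly:

```python
# Run inside the venv after `make` to confirm the runtime deps resolved.
import kafka
import pydub
import redis

print("kafka-python", kafka.__version__)
print("redis", redis.__version__)
print("pydub imported from", pydub.__file__)
```
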