From 6e71c76ba99b22cde395d9daf052a1c8af8a923b Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 26 Dec 2018 14:32:26 -0500 Subject: [PATCH] adding yomdle korean setup --- egs/ifnenit/{v1 => }/README.txt | 0 egs/madcat_ar/{v1 => }/README.txt | 0 egs/yomdle_korean/README.txt | 3 + egs/yomdle_korean/v1/cmd.sh | 12 + egs/yomdle_korean/v1/image | 1 + egs/yomdle_korean/v1/local/augment_data.sh | 36 ++ .../v1/local/chain/compare_wer.sh | 66 ++++ .../v1/local/chain/run_cnn_e2eali.sh | 1 + .../v1/local/chain/run_e2e_cnn.sh | 132 +++++++ .../local/chain/tuning/run_cnn_e2eali_1a.sh | 236 +++++++++++++ .../local/chain/tuning/run_cnn_e2eali_1b.sh | 208 +++++++++++ egs/yomdle_korean/v1/local/check_tools.sh | 43 +++ .../v1/local/extract_features.sh | 48 +++ egs/yomdle_korean/v1/local/normalize_data.py | 18 + egs/yomdle_korean/v1/local/prepare_dict.sh | 26 ++ egs/yomdle_korean/v1/local/prepare_lexicon.py | 35 ++ egs/yomdle_korean/v1/local/process_corpus.py | 30 ++ egs/yomdle_korean/v1/local/process_data.py | 65 ++++ egs/yomdle_korean/v1/local/score.sh | 5 + .../run_cnn_chainali_semisupervised_1a.sh | 327 ++++++++++++++++++ .../run_cnn_chainali_semisupervised_1b.sh | 325 +++++++++++++++++ .../v1/local/semisup/process_data.py | 61 ++++ .../v1/local/semisup/run_semisup.sh | 71 ++++ egs/yomdle_korean/v1/local/train_lm.sh | 127 +++++++ egs/yomdle_korean/v1/local/wer_output_filter | 17 + egs/yomdle_korean/v1/local/yomdle | 1 + egs/yomdle_korean/v1/path.sh | 6 + egs/yomdle_korean/v1/run_end2end.sh | 186 ++++++++++ egs/yomdle_korean/v1/steps | 1 + egs/yomdle_korean/v1/utils | 1 + egs/yomdle_tamil/README.txt | 3 + .../create_line_image_from_page_image.py | 19 +- 32 files changed, 2105 insertions(+), 5 deletions(-) rename egs/ifnenit/{v1 => }/README.txt (100%) rename egs/madcat_ar/{v1 => }/README.txt (100%) create mode 100644 egs/yomdle_korean/README.txt create mode 100755 egs/yomdle_korean/v1/cmd.sh create mode 120000 egs/yomdle_korean/v1/image create mode 100755 egs/yomdle_korean/v1/local/augment_data.sh create mode 100755 egs/yomdle_korean/v1/local/chain/compare_wer.sh create mode 120000 egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh create mode 100755 egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh create mode 100755 egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh create mode 100755 egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh create mode 100755 egs/yomdle_korean/v1/local/check_tools.sh create mode 100755 egs/yomdle_korean/v1/local/extract_features.sh create mode 100755 egs/yomdle_korean/v1/local/normalize_data.py create mode 100755 egs/yomdle_korean/v1/local/prepare_dict.sh create mode 100755 egs/yomdle_korean/v1/local/prepare_lexicon.py create mode 100755 egs/yomdle_korean/v1/local/process_corpus.py create mode 100755 egs/yomdle_korean/v1/local/process_data.py create mode 100755 egs/yomdle_korean/v1/local/score.sh create mode 100755 egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh create mode 100755 egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh create mode 100755 egs/yomdle_korean/v1/local/semisup/process_data.py create mode 100755 egs/yomdle_korean/v1/local/semisup/run_semisup.sh create mode 100755 egs/yomdle_korean/v1/local/train_lm.sh create mode 100755 egs/yomdle_korean/v1/local/wer_output_filter create mode 120000 egs/yomdle_korean/v1/local/yomdle create mode 100755 egs/yomdle_korean/v1/path.sh create mode 100755 egs/yomdle_korean/v1/run_end2end.sh create mode 120000 egs/yomdle_korean/v1/steps create mode 
120000 egs/yomdle_korean/v1/utils create mode 100644 egs/yomdle_tamil/README.txt diff --git a/egs/ifnenit/v1/README.txt b/egs/ifnenit/README.txt similarity index 100% rename from egs/ifnenit/v1/README.txt rename to egs/ifnenit/README.txt diff --git a/egs/madcat_ar/v1/README.txt b/egs/madcat_ar/README.txt similarity index 100% rename from egs/madcat_ar/v1/README.txt rename to egs/madcat_ar/README.txt diff --git a/egs/yomdle_korean/README.txt b/egs/yomdle_korean/README.txt new file mode 100644 index 00000000000..3bf4cc8cd2d --- /dev/null +++ b/egs/yomdle_korean/README.txt @@ -0,0 +1,3 @@ +This directory contains example scripts for OCR on the Yomdle and Slam datasets. +Training is done on the Yomdle dataset and testing is done on Slam. +LM rescoring is also done with extra corpus data obtained from various sources diff --git a/egs/yomdle_korean/v1/cmd.sh b/egs/yomdle_korean/v1/cmd.sh new file mode 100755 index 00000000000..3d69546dfe8 --- /dev/null +++ b/egs/yomdle_korean/v1/cmd.sh @@ -0,0 +1,12 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export cmd="queue.pl" diff --git a/egs/yomdle_korean/v1/image b/egs/yomdle_korean/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/yomdle_korean/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/yomdle_korean/v1/local/augment_data.sh b/egs/yomdle_korean/v1/local/augment_data.sh new file mode 100755 index 00000000000..136bfd24eb2 --- /dev/null +++ b/egs/yomdle_korean/v1/local/augment_data.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +verticle_shift=0 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 + +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp" + +for set in aug1; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --vertical-shift $verticle_shift \ + --fliplr false --augment 'random_scale' $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1 +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/yomdle_korean/v1/local/chain/compare_wer.sh b/egs/yomdle_korean/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..80f31e0f311 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/compare_wer.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh b/egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..fcf59f917c1 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1b.sh \ No newline at end of file diff --git a/egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh b/egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh new file mode 100755 index 00000000000..cea60a221a1 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# score_basic score_nomalized +# WER 13.64 10.6 +# WER (rescored) 13.13 10.2 +# CER 2.99 3.0 +# CER (rescored) 2.88 2.9 +# Final train prob 0.0113 +# Final valid prob 0.0152 +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=48 nj=5..8 num-params=3.0M dim=40->352 combine=0.047->0.047 (over 2) logprob:train/valid[31,47,final]=(0.002,0.008,0.011/0.008,0.013,0.015) + +set -e +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +lang_decode=data/lang +decode_e2e=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.apply-deriv-weights true 
\ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 5 \ + --trainer.optimization.num-jobs-final 8 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..c43d7c669c1 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,236 @@ +#!/bin/bash + +# e2eali_1a is the same as 1a but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh exp/old/chain/cnn_e2eali_1a/ +# System cnn_e2eali_1a +# WER 15.68 +# CER 3.18 +# Final train prob -0.0331 +# Final valid prob -0.0395 + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a/ +# exp/old/chain/cnn_e2eali_1a/: num-iters=33 nj=3..16 num-params=5.2M dim=40->456 combine=-0.035->-0.035 (over 1) xent:train/valid[21,32,final]=(-0.226,-0.175,-0.169/-0.248,-0.202,-0.195) logprob:train/valid[21,32,final]=(-0.039,-0.034,-0.033/-0.046,-0.040,-0.039) + +# Normalize scoring +# WER = 11.7 +# CER = 3.3 + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +decode_chain=false +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ] && $decode_chain; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ] && $decode_chain; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --beam 12 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 + + echo "Done. Date: $(date). Results:" + local/chain/compare_wer.sh $dir +fi diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..8fca9235f46 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -0,0 +1,208 @@ +#!/bin/bash + +# e2eali_1b is the same as e2eali_1a but has fewer CNN layers, smaller +# l2-regularize, more epochs and uses dropout. 
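+# (Explanatory note on the dropout schedule set below; the format follows the standard
+# --trainer.dropout-schedule convention of comma-separated values with optional
+# @data-fraction marks, interpolated linearly over training.)  With
+# dropout_schedule='0,0@0.20,0.2@0.50,0' the dropout proportion is held at 0 for the
+# first 20% of training, ramps up to 0.2 by the 50% point, and then decays back to 0
+# by the end of training.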
+ +#local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b/ +# System cnn_e2eali_1b +# score_basic score_nomalized +# WER 13.01 10.0 +# WER (rescored) 12.69 9.6 +# CER 2.78 3.0 +# CER (rescored) 2.70 2.8 +# Final train prob -0.0568 +# Final valid prob -0.0410 +#steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +#exp/chain/cnn_e2eali_1b: num-iters=67 nj=3..16 num-params=5.2M dim=40->464 combine=-0.052->-0.052 (over 1) xent:train/valid[43,66,final]=(-0.379,-0.319,-0.304/-0.291,-0.234,-0.227) logprob:train/valid[43,66,final]=(-0.069,-0.058,-0.057/-0.046,-0.041,-0.041) +set -e -o pipefail +stage=0 +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=1000 +# we don't need extra left/right context for TDNN systems. +tdnn_dim=550 +# training options +srand=0 +remove_egs=false +lang_decode=data/lang +decode_chain=true +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-dropout-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=16 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi diff --git a/egs/yomdle_korean/v1/local/check_tools.sh b/egs/yomdle_korean/v1/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/yomdle_korean/v1/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/yomdle_korean/v1/local/extract_features.sh b/egs/yomdle_korean/v1/local/extract_features.sh new file mode 100755 index 00000000000..3880ebad3e8 --- /dev/null +++ b/egs/yomdle_korean/v1/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment='no_aug' +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. 
./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment_type $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/yomdle_korean/v1/local/normalize_data.py b/egs/yomdle_korean/v1/local/normalize_data.py new file mode 100755 index 00000000000..fba3e762789 --- /dev/null +++ b/egs/yomdle_korean/v1/local/normalize_data.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string +import unicodedata +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + words = line.strip().split() + uttid = words[0] + transcript = ' '.join(words[1:]) + text_normalized = unicodedata.normalize('NFC', transcript) + output.write(uttid + ' ' + text_normalized + '\n') diff --git a/egs/yomdle_korean/v1/local/prepare_dict.sh b/egs/yomdle_korean/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..22db5ae834d --- /dev/null +++ b/egs/yomdle_korean/v1/local/prepare_dict.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Babak Rekabdar +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +. ./utils/parse_options.sh || exit 1; + +mkdir -p $dir + +local/prepare_lexicon.py $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/yomdle_korean/v1/local/prepare_lexicon.py b/egs/yomdle_korean/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..ec8d43d8335 --- /dev/null +++ b/egs/yomdle_korean/v1/local/prepare_lexicon.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Babak Rekabdar +# 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon for BPE. It gets the set of all words that occur in data/train/text. +# Since this lexicon is based on BPE, it replaces '|' with silence. 
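+# Illustrative example (hypothetical tokens, not taken from the data): NFD
+# normalization decomposes a Hangul syllable into its jamo, so a training-text token
+# '한' would get the lexicon entry '한 ᄒ ᅡ ᆫ', while a token carrying the BPE
+# boundary marker, such as '한|', would map to '한| ᄒ ᅡ ᆫ SIL' ('|' becomes SIL and
+# any '#' characters are dropped).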
+ +import argparse +import os +import unicodedata +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data', 'train', 'text') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + for i in range(1, len(line_vect)): + char_normalized = unicodedata.normalize('NFD', line_vect[i]).replace('\n', '') + characters = list(char_normalized) + characters = " ".join([ 'SIL' if char == '|' else char for char in characters]) + characters = list(characters) + characters = "".join([ '' if char == '#' else char for char in characters]) + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/yomdle_korean/v1/local/process_corpus.py b/egs/yomdle_korean/v1/local/process_corpus.py new file mode 100755 index 00000000000..b39030270b7 --- /dev/null +++ b/egs/yomdle_korean/v1/local/process_corpus.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora +# Apache 2.0 +# This script reads valid phones and removes the lines in the corpus +# which have any other phone. + +import os +import sys, io + +phone_file = os.path.join('data/local/text/cleaned/phones.txt') +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +phone_dict = dict() +with open(phone_file, 'r', encoding='utf-8') as phone_fh: + for line in phone_fh: + line = line.strip().split()[0] + phone_dict[line] = line + +phone_dict[' '] = ' ' +corpus_text = list() +for line in infile: + text = line.strip() + skip_text = False + for phone in text: + if phone not in phone_dict.keys(): + skip_text = True + break + if not skip_text: + output.write(text+ '\n') + diff --git a/egs/yomdle_korean/v1/local/process_data.py b/egs/yomdle_korean/v1/local/process_data.py new file mode 100755 index 00000000000..d7546b0a803 --- /dev/null +++ b/egs/yomdle_korean/v1/local/process_data.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# 2018 Chun Chieh Chang + +""" This script reads the extracted Tamil OCR (yomdle and slam) database files + and creates the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + Eg. local/process_data.py data/download/ data/local/splits/train.txt data/train + + Eg. 
text file: english_phone_books_0001_1 To sum up, then, it would appear that + utt2spk file: english_phone_books_0001_0 english_phone_books_0001 + images.scp file: english_phone_books_0001_0 \ + data/download/truth_line_image/english_phone_books_0001_0.png +""" + +import argparse +import os +import sys +import csv +import itertools +import unicodedata +import re +import string +import unicodedata +parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files") +parser.add_argument('database_path', type=str, help='Path to data') +parser.add_argument('data_split', type=str, help='Path to file that contain datasplits') +parser.add_argument('out_dir', type=str, help='directory to output files') +args = parser.parse_args() + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +with open(args.data_split) as f: + for line in f: + line = line.strip() + image_id = line + image_filename = image_id + '.png' + image_filepath = os.path.join(args.database_path, 'truth_line_image', image_filename) + if not os.path.isfile (image_filepath): + print("File does not exist {}".format(image_filepath)) + continue + line_id = int(line.split('_')[-1]) + csv_filename = '_'.join(line.split('_')[:-1]) + '.csv' + csv_filepath = os.path.join(args.database_path, 'truth_csv', csv_filename) + csv_file = open(csv_filepath, 'r', encoding='utf-8') + for row in csv.reader(csv_file): + if row[1] == image_filename: + text = row[11] + text_vect = text.split() # this is to avoid non-utf-8 spaces + text = " ".join(text_vect) + #text_normalized = unicodedata.normalize('NFD', text).replace('\n', '') + if not text: + continue + text_fh.write(image_id + ' ' + text + '\n') + utt2spk_fh.write(image_id + ' ' + '_'.join(line.split('_')[:-1]) + '\n') + image_fh.write(image_id + ' ' + image_filepath + '\n') diff --git a/egs/yomdle_korean/v1/local/score.sh b/egs/yomdle_korean/v1/local/score.sh new file mode 100755 index 00000000000..31564d25326 --- /dev/null +++ b/egs/yomdle_korean/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh new file mode 100755 index 00000000000..654880fcf59 --- /dev/null +++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh @@ -0,0 +1,327 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# 2018 Ashish Arora +# Apache 2.0 +# This script is semi-supervised recipe with 25k line images of supervised data +# and 22k line images of unsupervised data with naive splitting. +# Based on "Semi-Supervised Training of Acoustic Models using Lattice-Free MMI", +# Vimal Manohar, Hossein Hadian, Daniel Povey, Sanjeev Khudanpur, ICASSP 2018 +# http://www.danielpovey.com/files/2018_icassp_semisupervised_mmi.pdf +# local/semisup/run_semisup.sh shows how to call this. + +# We use 3-gram LM trained on 5M lines of auxilary data. +# This script uses the same tree as that for the seed model. 
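+# For illustration only (local/semisup/run_semisup.sh is the intended caller; the
+# values shown are simply this script's defaults), a direct invocation would look like:
+#   local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh \
+#     --supervised-set train --unsupervised-set train_unsup \
+#     --sup-chain-dir exp/chain/cnn_e2eali_1b \
+#     --sup-lat-dir exp/chain/e2e_train_lats \
+#     --sup-tree-dir exp/chain/tree_e2e \
+#     --exp-root exp/semisup_100k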
+# Unsupervised set: train_unsup (25k tamil line images) +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices +# output-0 and output-1 are for superivsed and unsupervised data respectively. + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b/ exp/semisup_100k/chain/tdnn_semisup_1a/ +# System cnn_e2eali_1b tdnn_semisup_1a +# WER 15.06 13.83 +# CER 3.15 2.83 +# Final train prob -0.0343 0.6103-0.0360 +# Final valid prob -0.0403 0.6054-0.0418 + +# steps/info/chain_dir_info.pl exp/semisup_100k/chain/tdnn_semisup_1a/ +# exp/semisup_100k/chain/tdnn_semisup_1a/: num-iters=58 nj=6..16 num-params=3.7M dim=40->456 combine=0.240->0.240 (over 1) + +# Normalize scoring +#WER = 10.4 +#CER = 2.9 + +set -u -e -o pipefail + +stage=0 # Start from -1 for supervised seed system training +train_stage=-100 +nj=30 +test_nj=30 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} +exp_root=exp/semisup_100k +chain_affix= # affix for chain dir +tdnn_affix=_semisup_1a # affix for semi-supervised chain system + +# Datasets-Expects supervised_set and unsupervised_set +supervised_set=train +unsupervised_set=train_unsup + +# Input seed system +sup_chain_dir=exp/chain/cnn_e2eali_1b # supervised chain system +sup_lat_dir=exp/chain/e2e_train_lats # Seed model options +sup_tree_dir=exp/chain/tree_e2e # tree directory for supervised chain system + +# Semi-supervised options +supervision_weights=1.0,1.0 # Weights for supervised, unsupervised data egs. + # Can be used to scale down the effect of unsupervised data + # by using a smaller scale for it e.g. 1.0,0.3 +lm_weights=3,2 # Weights on phone counts from supervised, unsupervised data for denominator FST creation + +sup_egs_dir= # Supply this to skip supervised egs creation +unsup_egs_dir= # Supply this to skip unsupervised egs creation +unsup_egs_opts= # Extra options to pass to unsupervised egs creation +# Neural network opts +xent_regularize=0.1 +tdnn_dim=450 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +dir=$exp_root/chain$chain_affix/tdnn$tdnn_affix +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + output name=output-0 input=output.affine + output name=output-1 input=output.affine + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_$supervised_set + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$cmd" \ + --left-tolerance 3 --right-tolerance 3 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --frames-overlap-per-eg 0 --constrained false \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 2000000 \ + --cmvn-opts "$cmvn_opts" \ + --generate-egs-scp true \ + data/${supervised_set} $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=340,300,200,100 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=6.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=3 # frame-tolerance for chain training + +unsup_lat_dir=$sup_chain_dir/decode_$unsupervised_set +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_$unsupervised_set + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 2000000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_$unsupervised_set/weights.scp \ + --generate-egs-scp true $unsup_egs_opts \ + data/$unsupervised_set $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. 
+ train_stage=-4 +fi + +chunk_width=340,300,200,100 +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --egs.chunk-width=$chunk_width \ + --cmd "$cmd" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00001 \ + --chain.apply-deriv-weights=true \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=0 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --trainer.frames-per-iter=2000000 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 5 \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs false \ + --feat-dir data/$supervised_set \ + --tree-dir $sup_tree_dir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph +fi + +if [ $stage -le 18 ]; then + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --beam 12 --frames-per-chunk 340 --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi +exit 0; + diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh new file mode 100755 index 00000000000..eb688151665 --- /dev/null +++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh @@ -0,0 +1,325 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# 2018 Ashish Arora +# Apache 2.0 +# This script is semi-supervised recipe with 25k line images of supervised data +# and 22k line images of unsupervised data with naive splitting. +# Based on "Semi-Supervised Training of Acoustic Models using Lattice-Free MMI", +# Vimal Manohar, Hossein Hadian, Daniel Povey, Sanjeev Khudanpur, ICASSP 2018 +# http://www.danielpovey.com/files/2018_icassp_semisupervised_mmi.pdf +# local/semisup/run_semisup.sh shows how to call this. + +# We use 3-gram LM trained on 5M lines of auxilary data. +# This script uses the same tree as that for the seed model. +# Unsupervised set: train_unsup (25k tamil line images) +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices +# output-0 and output-1 are for superivsed and unsupervised data respectively. 
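+# (Clarifying note: the xconfig below adds 'output-0' and 'output-1' nodes that are
+# copies of the main output, and the egs-combination stage
+# (steps/nnet3/chain/multilingual/combine_egs.sh) routes the supervised egs to
+# output-0 and the unsupervised egs to output-1, scaled by $supervision_weights, so
+# the two objectives can be tracked separately during training.)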
+ +# local/chain/compare_wer.sh exp/semisup_100k/chain/tdnn_semisup_1b/ +# System tdnn_semisup_1b +# score_basic score_normalized +# WER 13.73 10.2 +# WER (rescored) 12.80 9.4 +# CER 2.78 2.8 +# CER (rescored) 2.57 2.7 +# Final train prob 0.6138-0.0337 +# Final valid prob 0.6115-0.0399 + +# steps/info/chain_dir_info.pl exp/semisup_100k/chain/tdnn_semisup_1b/ +# exp/semisup_100k/chain/tdnn_semisup_1b/: num-iters=46 nj=6..16 num-params=5.7M dim=40->456 combine=0.239->0.239 (over 1) + +set -u -e -o pipefail +stage=0 # Start from -1 for supervised seed system training +train_stage=-100 +nj=30 +test_nj=30 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} +exp_root=exp/semisup_100k +chain_affix= # affix for chain dir +tdnn_affix=_semisup_1b # affix for semi-supervised chain system + +# Datasets-Expects supervised_set and unsupervised_set +supervised_set=train +unsupervised_set=train_unsup + +# Input seed system +sup_chain_dir=exp/chain/cnn_e2eali_1b # supervised chain system +sup_lat_dir=exp/chain/e2e_train_lats # Seed model options +sup_tree_dir=exp/chain/tree_e2e # tree directory for supervised chain system + +# Semi-supervised options +supervision_weights=1.0,1.0 # Weights for supervised, unsupervised data egs. + # Can be used to scale down the effect of unsupervised data + # by using a smaller scale for it e.g. 1.0,0.3 +lm_weights=3,2 # Weights on phone counts from supervised, unsupervised data for denominator FST creation + +sup_egs_dir= # Supply this to skip supervised egs creation +unsup_egs_dir= # Supply this to skip unsupervised egs creation +unsup_egs_opts= # Extra options to pass to unsupervised egs creation +# Neural network opts +xent_regularize=0.1 +tdnn_dim=550 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +dropout_schedule='0,0@0.20,0.2@0.50,0' +dir=$exp_root/chain$chain_affix/tdnn$tdnn_affix +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-dropout-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + output name=output-0 input=output.affine + output name=output-1 input=output.affine + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_$supervised_set + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$cmd" \ + --left-tolerance 3 --right-tolerance 3 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --frames-overlap-per-eg 0 --constrained false \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 2000000 \ + --cmvn-opts "$cmvn_opts" \ + --generate-egs-scp true \ + data/${supervised_set} $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=340,300,200,100 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=6.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=3 # frame-tolerance for chain training + +unsup_lat_dir=$sup_chain_dir/decode_$unsupervised_set +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_$unsupervised_set + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 2000000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_$unsupervised_set/weights.scp \ + --generate-egs-scp true $unsup_egs_opts \ + data/$unsupervised_set $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. 
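+  # (The weighted den.fst was created in an earlier stage of this script from
+  # the supervised and unsupervised phone counts -- see $lm_weights above --
+  # so starting train.py from stage -4 avoids rebuilding it.)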
+  train_stage=-4
+fi
+
+chunk_width=340,300,200,100
+if [ $stage -le 15 ]; then
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --egs.dir "$comb_egs_dir" \
+    --egs.chunk-width=$chunk_width \
+    --cmd "$cmd" \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00001 \
+    --chain.apply-deriv-weights=true \
+    --chain.frame-subsampling-factor=$frame_subsampling_factor \
+    --chain.alignment-subsampling-factor=1 \
+    --chain.left-tolerance 3 \
+    --chain.right-tolerance 3 \
+    --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \
+    --trainer.srand=0 \
+    --trainer.optimization.shrink-value=1.0 \
+    --trainer.num-chunk-per-minibatch=32,16 \
+    --trainer.optimization.momentum=0.0 \
+    --trainer.frames-per-iter=2000000 \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs 16 \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.optimization.num-jobs-initial 6 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --egs.opts="--frames-overlap-per-eg 0 --constrained false" \
+    --cleanup.remove-egs false \
+    --feat-dir data/$supervised_set \
+    --tree-dir $sup_tree_dir \
+    --lat-dir $sup_lat_dir \
+    --dir $dir || exit 1;
+
+fi
+
+if [ $stage -le 17 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph
+fi
+
+if [ $stage -le 18 ]; then
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --beam 12 --frames-per-chunk 340 --nj $nj --cmd "$cmd" \
+    $dir/graph data/test $dir/decode_test
+
+  steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
+    data/test $dir/decode_test{,_rescored} || exit 1
+fi
+exit 0;
+
diff --git a/egs/yomdle_korean/v1/local/semisup/process_data.py b/egs/yomdle_korean/v1/local/semisup/process_data.py
new file mode 100755
index 00000000000..94ad770ec2d
--- /dev/null
+++ b/egs/yomdle_korean/v1/local/semisup/process_data.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3

+# Copyright  2018  Ashish Arora
+#            2018  Chun Chieh Chang
+
+""" This script reads the SLAM boxed Korean OCR dataset and creates the utt2spk
+    and images.scp files. Since the boxed data has no transcripts, the text file
+    it writes contains only the placeholder word 'semisup'. It is a separate
+    script because the data handled by local/process_data.py contains empty
+    transcripts that must be removed, or they would break the BPE step.
+
+    Eg. local/semisup/process_data.py data/download/ data/local/splits/train_unsup.txt
+        data/train_unsup
+
+    Eg. utt2spk file: english_phone_books_0001_0 english_phone_books_0001
+        images.scp file: english_phone_books_0001_0 \
+                         data/download/truth_line_image/english_phone_books_0001_0.png
+"""
+import argparse
+import os
+import sys
+import csv
+import itertools
+import unicodedata
+import re
+import string
+parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files")
+parser.add_argument('database_path', type=str, help='Path to data')
+parser.add_argument('data_split', type=str, help='Path to file that contains the data splits')
+parser.add_argument('out_dir', type=str, help='directory to output files')
+args = parser.parse_args()
+
+### main ###
+print("Processing '{}' data...".format(args.out_dir))
+
+utt2spk_file = os.path.join(args.out_dir, 'utt2spk')
+utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8')
+image_file = os.path.join(args.out_dir, 'images.scp')
+image_fh = open(image_file, 'w', encoding='utf-8')
+text_file = os.path.join(args.out_dir, 'text')
+text_fh = open(text_file, 'w', encoding='utf-8')
+
+with open(args.data_split) as f:
+    for line in f:
+        line = line.strip()
+        image_id = line
+        image_filename = image_id + '.png'
+        image_filepath = os.path.join(args.database_path, 'truth_line_image', image_filename)
+        if not os.path.isfile(image_filepath):
+            print("File does not exist {}".format(image_filepath))
+            continue
+        line_id = int(line.split('_')[-1])
+        csv_filename = '_'.join(line.split('_')[:-1]) + '.csv'
+        csv_filepath = os.path.join(args.database_path, 'truth_csv', csv_filename)
+        csv_file = open(csv_filepath, 'r', encoding='utf-8')
+        for row in csv.reader(csv_file):
+            if row[1] == image_filename:
+                text = 'semisup'
+                text_fh.write(image_id + ' ' + text + '\n')
+                utt2spk_fh.write(image_id + ' ' + '_'.join(line.split('_')[:-1]) + '\n')
+                image_fh.write(image_id + ' ' + image_filepath + '\n')
diff --git a/egs/yomdle_korean/v1/local/semisup/run_semisup.sh b/egs/yomdle_korean/v1/local/semisup/run_semisup.sh
new file mode 100755
index 00000000000..5e20f50c99e
--- /dev/null
+++ b/egs/yomdle_korean/v1/local/semisup/run_semisup.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+# Copyright 2017  Vimal Manohar
+#           2018  Ashish Arora
+# Apache 2.0
+
+# This script demonstrates semi-supervised training using 25k line images of
+# supervised data and 22k line images of unsupervised data.
+# We assume the supervised data is in data/train and the unsupervised data
+# is in data/train_unsup.
+# For LM training, we also use extra Korean corpus text (see local/train_lm.sh).
+
+set -e
+set -o pipefail
+stage=0
+nj=30
+exp_root=exp/semisup_56k
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+mkdir -p data/train_unsup/data
+if [ $stage -le 0 ]; then
+  echo "stage 0: Processing train unsupervised data...$(date)"
+  local/semisup/process_data.py data/download/ \
+    data/local/splits/train_unsup.txt \
+    data/train_unsup
+  image/fix_data_dir.sh data/train_unsup
+fi
+
+if [ $stage -le 1 ]; then
+  echo "stage 1: Obtaining image groups. calling get_image2num_frames..."
+  image/get_image2num_frames.py --feat-dim 40 data/train_unsup
+  image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train_unsup
+  echo "Extracting features and calling compute_cmvn_stats: $(date) "
+  local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train_unsup
+  steps/compute_cmvn_stats.sh data/train_unsup || exit 1;
+  image/fix_data_dir.sh data/train_unsup
+fi
+
+for f in data/train/utt2spk data/train_unsup/utt2spk \
+  data/train/text; do
+  if [ ! -f $f ]; then
+    echo "$0: Could not find $f"
+    exit 1;
+  fi
+done
+
+# Prepare the combined semi-supervised train set.
+if [ $stage -le 1 ]; then
+  utils/combine_data.sh data/semisup100k_250k \
+    data/train data/train_unsup || exit 1
+fi
+
+###############################################################################
+# Semi-supervised training using 25k line images of supervised data and
+# 22k line images of unsupervised data. We use the tree, lattices
+# and seed chain system from the previous stage.
+###############################################################################
+if [ $stage -le 2 ]; then
+  local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh \
+    --supervised-set train \
+    --unsupervised-set train_unsup \
+    --sup-chain-dir exp/chain/cnn_e2eali_1b_ep16_7cnn \
+    --sup-lat-dir exp/chain/e2e_train_lats \
+    --sup-tree-dir exp/chain/tree_e2e \
+    --chain-affix "" \
+    --tdnn-affix _semisup_ep16_7cnn \
+    --stage 15 --train_stage 9 \
+    --exp-root $exp_root || exit 1
+fi
diff --git a/egs/yomdle_korean/v1/local/train_lm.sh b/egs/yomdle_korean/v1/local/train_lm.sh
new file mode 100755
index 00000000000..c73c42fb7dc
--- /dev/null
+++ b/egs/yomdle_korean/v1/local/train_lm.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Ashish Arora
+#           2017  Hossein Hadian
+# Apache 2.0
+#
+# This script trains an LM on the training transcriptions and corpus text.
+# It is based on the example scripts distributed with PocoLM.
+
+# It checks that PocoLM is installed and exits with installation instructions if it is not.
+
+set -e
+stage=0
+dir=data/local/local_lm
+order=6
+echo "$0 $@"  # Print the command line for logging
+. ./utils/parse_options.sh || exit 1;
+
+lm_dir=${dir}/data
+
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+  cd $KALDI_ROOT/tools || exit 1;
+  if [ -d pocolm ]; then
+    echo Not installing the pocolm toolkit since it is already there.
+  else
+    echo "$0: Please install the PocoLM toolkit with: "
+    echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+    exit 1;
+  fi
+) || exit 1;
+
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters,
+# un-comment the following line and change the numbers to appropriate values.
+# You can find the values in the output log of train_lm.py.
+# These example metaparameters are for a 4-gram model (with min-counts)
+# trained with train_lm.py.
+# The dev perplexity should be close to that of the non-bypassed model.
+#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.031,0.860,0.678,0.194,0.037,0.006,0.928,0.712,0.454,0.220,0.926,0.844,0.749,0.358,0.966,0.879,0.783,0.544,0.966,0.826,0.674,0.450"
+# Note: to use these example parameters, you may need to remove the .done files
+# so that make_lm_dir.py is called again and trains only a 3-gram model.
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  # use the validation data as the dev set.
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
+
+  cat data/local/text/cleaned/bpe_val.txt > ${dir}/data/text/dev.txt
+  # use the training data as an additional data source.
+  # we can later fold the dev data into this.
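+  # At the end of this stage ${dir}/data/text holds three data sources:
+  #   dev.txt          - BPE-encoded validation text (pocolm treats 'dev' as the dev set)
+  #   train.txt        - training transcriptions (created just below)
+  #   corpus_text.txt  - BPE-encoded external corpus text (created just below)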
+ cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + cat data/local/text/cleaned/bpe_corpus.txt > ${dir}/data/text/corpus_text.txt + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from train and corpus text + cat ${dir}/data/text/{train,corpus_text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 10 million n-grams for a big LM for rescoring purposes. + size=10000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + #[perplexity = 22.0613098868] over 151116.0 words + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 2 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=2000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + #[perplexity = 23.4801171202] over 151116.0 words + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/yomdle_korean/v1/local/wer_output_filter b/egs/yomdle_korean/v1/local/wer_output_filter new file mode 100755 index 00000000000..59e364e0231 --- /dev/null +++ b/egs/yomdle_korean/v1/local/wer_output_filter @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. 
It is used in scoring + +import sys, io +import string +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/yomdle_korean/v1/local/yomdle b/egs/yomdle_korean/v1/local/yomdle new file mode 120000 index 00000000000..2c4544c1399 --- /dev/null +++ b/egs/yomdle_korean/v1/local/yomdle @@ -0,0 +1 @@ +../../../yomdle_tamil/v1/local/yomdle/ \ No newline at end of file diff --git a/egs/yomdle_korean/v1/path.sh b/egs/yomdle_korean/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/yomdle_korean/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/yomdle_korean/v1/run_end2end.sh b/egs/yomdle_korean/v1/run_end2end.sh new file mode 100755 index 00000000000..65f5beb4b08 --- /dev/null +++ b/egs/yomdle_korean/v1/run_end2end.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2018 Hossein Hadian +# Ashish Arora +# Jonathan Chang +# Apache 2.0 + +set -e +stage=0 +nj=30 + +language_main=Korean +slam_dir=/export/corpora5/slam/SLAM/ +yomdle_dir=/export/corpora5/slam/YOMDLE/ +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/ko/ +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +./local/check_tools.sh +# Start from stage=-2 for data preparation. This stage stores line images, +# csv files and splits{train,test,train_unsup} data/download/truth_line_image, +# data/download/truth_csv and data/local/splits respectively. +if [ $stage -le -2 ]; then + echo "$(date): preparing data, obtaining line images and csv files..." + local/yomdle/create_download_dir.sh --language_main $language_main \ + --slam_dir $slam_dir --yomdle_dir $yomdle_dir +fi + +if [ $stage -le -1 ]; then + echo "$(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/ko.txt + head -20000 data/local/text/ko.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/ko.txt > data/local/text/cleaned/corpus.txt +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 0 ]; then + echo "$0 stage 0: Processing train and test data.$(date)" + echo " creating text, images.scp, utt2spk and spk2utt" + #local/prepare_data.sh data/download/ + for set in train test; do + local/process_data.py data/download/ \ + data/local/splits/${set}.txt data/${set} + image/fix_data_dir.sh data/${set} + done +fi + +if [ $stage -le 1 ]; then + echo "$(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in train test; do + echo "$(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; + done + image/fix_data_dir.sh data/train +fi + +if [ $stage -le 3 ]; then + echo "$(date) stage 3: BPE preparation" + # getting non-silence phones. 
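+  # Every distinct character in the training transcripts becomes a "phone".
+  # As a made-up example, if data/train/text contained only the line
+  #   utt_001 안녕하세요 kaldi
+  # the snippet below would print the unique characters
+  #   안 녕 하 세 요 k a l d i
+  # one per line.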
+ cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; + +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/text/cleaned/phones.txt + + cut -d' ' -f2- data/train/text > data/local/text/cleaned/train.txt + + echo "learning BPE..." + # it is currently learned with only training text but we can also use all corpus text + # to learn BPE. phones are added so that one isolated occurance of every phone exists. + cat data/local/text/cleaned/phones.txt data/local/text/cleaned/train.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$(date) stage 4: applying BPE..." + echo "applying BPE on train, test text..." + for set in test train; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "applying BPE to corpus text..." + cat data/local/text/cleaned/corpus.txt | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/local/text/cleaned/bpe_corpus.txt + cat data/local/text/cleaned/val.txt | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/local/text/cleaned/bpe_val.txt +fi + +if [ $stage -le 5 ]; then + echo "$(date) stage 5: Preparing dictionary and lang..." + local/prepare_dict.sh --dir data/local/dict + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 4 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 6 ]; then + echo "$(date) stage 6: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 7 ]; then + echo "$(date) stage 7: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +chunk_width='340,300,200,100' +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if [ $stage -le 8 ]; then + echo "$(date) stage 8: Building a tree and training a regular chain model using the e2e alignments..." + local/chain/run_cnn_e2eali.sh --chunk_width $chunk_width +fi + +if [ $stage -le 9 ]; then + echo "$(date) stage 9: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 10 ] && $decode_e2e; then + echo "$(date) stage 10: decoding end2end setup..." 
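+  # Decoding follows the usual chain-model convention: --acwt 1.0 because
+  # chain models are decoded with an acoustic scale of 1.0, and
+  # --post-decode-acwt 10.0 scales up the acoustic scores in the output
+  # lattices so that the language-model weights used by the scoring scripts
+  # are in the same range as for conventional models.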
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; + + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" --beam 12 \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1 + + echo "Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +fi + +if [ $stage -le 11 ] && $decode_chain; then + echo "$(date) stage 11: decoding chain alignment setup..." + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" --beam 12 \ + exp/chain/cnn_e2eali_1a/graph data/test exp/chain/cnn_e2eali_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/cnn_e2eali_1a/decode_test{,_rescored} || exit 1 + + echo "Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +fi diff --git a/egs/yomdle_korean/v1/steps b/egs/yomdle_korean/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/yomdle_korean/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/yomdle_korean/v1/utils b/egs/yomdle_korean/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/yomdle_korean/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/yomdle_tamil/README.txt b/egs/yomdle_tamil/README.txt new file mode 100644 index 00000000000..0f295e5ae5f --- /dev/null +++ b/egs/yomdle_tamil/README.txt @@ -0,0 +1,3 @@ +This directory contains example scripts for OCR on the Yomdle and Slam datasets. +Training is done on the Yomdle dataset and testing is done on Slam. +LM rescoring is also done with extra corpus data obtained from various sources. 
diff --git a/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py b/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py
index 8f03be874e7..dd4bf536692 100755
--- a/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py
+++ b/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py
@@ -29,7 +29,8 @@
 from scipy.spatial import ConvexHull
 from PIL import Image
 from scipy.misc import toimage
-
+from pathlib import Path
+from glob import glob
 parser = argparse.ArgumentParser(description="Creates line images from page image")
 parser.add_argument('image_dir', type=str, help='Path to full page images')
 parser.add_argument('csv_dir', type=str, help='Path to csv files')
@@ -321,10 +322,18 @@ def update_minimum_bounding_box_input(bounding_box_input):
 ### main ###
 globvar = 0
 text_fh = open(args.output_file, 'w', encoding='utf-8')
-for filename in sorted(os.listdir(args.csv_dir)):
-    with open(os.path.join(args.csv_dir, filename), 'r', encoding='utf-8') as f:
-        image_file = os.path.join(args.image_dir, filename.split('.')[0] + args.ext)
-        im = Image.open(image_file).convert('L')
+file_list = list(Path(args.csv_dir).rglob("*.[cC][sS][vV]"))
+for filename in sorted(file_list):
+    filename = str(filename)
+    with open(filename, 'r', encoding='utf-8') as f:
+        base_name = os.path.basename(filename)
+        image_file = os.path.join(args.image_dir, base_name.split('.')[0] + args.ext)
+        try:
+            im = Image.open(image_file).convert('L')
+        except Exception:
+            print("Error: cannot open image " + image_file)
+            globvar += 1
+            continue
+        im = pad_image(im)
         for row in itertools.islice(csv.reader(f), 1, None):
             points = []