From 6e71c76ba99b22cde395d9daf052a1c8af8a923b Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 26 Dec 2018 14:32:26 -0500 Subject: [PATCH] adding yomdle korean setup --- egs/ifnenit/{v1 => }/README.txt | 0 egs/madcat_ar/{v1 => }/README.txt | 0 egs/yomdle_korean/README.txt | 3 + egs/yomdle_korean/v1/cmd.sh | 12 + egs/yomdle_korean/v1/image | 1 + egs/yomdle_korean/v1/local/augment_data.sh | 36 ++ .../v1/local/chain/compare_wer.sh | 66 ++++ .../v1/local/chain/run_cnn_e2eali.sh | 1 + .../v1/local/chain/run_e2e_cnn.sh | 132 +++++++ .../local/chain/tuning/run_cnn_e2eali_1a.sh | 236 +++++++++++++ .../local/chain/tuning/run_cnn_e2eali_1b.sh | 208 +++++++++++ egs/yomdle_korean/v1/local/check_tools.sh | 43 +++ .../v1/local/extract_features.sh | 48 +++ egs/yomdle_korean/v1/local/normalize_data.py | 18 + egs/yomdle_korean/v1/local/prepare_dict.sh | 26 ++ egs/yomdle_korean/v1/local/prepare_lexicon.py | 35 ++ egs/yomdle_korean/v1/local/process_corpus.py | 30 ++ egs/yomdle_korean/v1/local/process_data.py | 65 ++++ egs/yomdle_korean/v1/local/score.sh | 5 + .../run_cnn_chainali_semisupervised_1a.sh | 327 ++++++++++++++++++ .../run_cnn_chainali_semisupervised_1b.sh | 325 +++++++++++++++++ .../v1/local/semisup/process_data.py | 61 ++++ .../v1/local/semisup/run_semisup.sh | 71 ++++ egs/yomdle_korean/v1/local/train_lm.sh | 127 +++++++ egs/yomdle_korean/v1/local/wer_output_filter | 17 + egs/yomdle_korean/v1/local/yomdle | 1 + egs/yomdle_korean/v1/path.sh | 6 + egs/yomdle_korean/v1/run_end2end.sh | 186 ++++++++++ egs/yomdle_korean/v1/steps | 1 + egs/yomdle_korean/v1/utils | 1 + egs/yomdle_tamil/README.txt | 3 + .../create_line_image_from_page_image.py | 19 +- 32 files changed, 2105 insertions(+), 5 deletions(-) rename egs/ifnenit/{v1 => }/README.txt (100%) rename egs/madcat_ar/{v1 => }/README.txt (100%) create mode 100644 egs/yomdle_korean/README.txt create mode 100755 egs/yomdle_korean/v1/cmd.sh create mode 120000 egs/yomdle_korean/v1/image create mode 100755 egs/yomdle_korean/v1/local/augment_data.sh create mode 100755 egs/yomdle_korean/v1/local/chain/compare_wer.sh create mode 120000 egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh create mode 100755 egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh create mode 100755 egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh create mode 100755 egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh create mode 100755 egs/yomdle_korean/v1/local/check_tools.sh create mode 100755 egs/yomdle_korean/v1/local/extract_features.sh create mode 100755 egs/yomdle_korean/v1/local/normalize_data.py create mode 100755 egs/yomdle_korean/v1/local/prepare_dict.sh create mode 100755 egs/yomdle_korean/v1/local/prepare_lexicon.py create mode 100755 egs/yomdle_korean/v1/local/process_corpus.py create mode 100755 egs/yomdle_korean/v1/local/process_data.py create mode 100755 egs/yomdle_korean/v1/local/score.sh create mode 100755 egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh create mode 100755 egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh create mode 100755 egs/yomdle_korean/v1/local/semisup/process_data.py create mode 100755 egs/yomdle_korean/v1/local/semisup/run_semisup.sh create mode 100755 egs/yomdle_korean/v1/local/train_lm.sh create mode 100755 egs/yomdle_korean/v1/local/wer_output_filter create mode 120000 egs/yomdle_korean/v1/local/yomdle create mode 100755 egs/yomdle_korean/v1/path.sh create mode 100755 egs/yomdle_korean/v1/run_end2end.sh create mode 120000 egs/yomdle_korean/v1/steps create mode 
120000 egs/yomdle_korean/v1/utils create mode 100644 egs/yomdle_tamil/README.txt diff --git a/egs/ifnenit/v1/README.txt b/egs/ifnenit/README.txt similarity index 100% rename from egs/ifnenit/v1/README.txt rename to egs/ifnenit/README.txt diff --git a/egs/madcat_ar/v1/README.txt b/egs/madcat_ar/README.txt similarity index 100% rename from egs/madcat_ar/v1/README.txt rename to egs/madcat_ar/README.txt diff --git a/egs/yomdle_korean/README.txt b/egs/yomdle_korean/README.txt new file mode 100644 index 00000000000..3bf4cc8cd2d --- /dev/null +++ b/egs/yomdle_korean/README.txt @@ -0,0 +1,3 @@ +This directory contains example scripts for OCR on the Yomdle and Slam datasets. +Training is done on the Yomdle dataset and testing is done on Slam. +LM rescoring is also done with extra corpus data obtained from various sources diff --git a/egs/yomdle_korean/v1/cmd.sh b/egs/yomdle_korean/v1/cmd.sh new file mode 100755 index 00000000000..3d69546dfe8 --- /dev/null +++ b/egs/yomdle_korean/v1/cmd.sh @@ -0,0 +1,12 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export cmd="queue.pl" diff --git a/egs/yomdle_korean/v1/image b/egs/yomdle_korean/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/yomdle_korean/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/yomdle_korean/v1/local/augment_data.sh b/egs/yomdle_korean/v1/local/augment_data.sh new file mode 100755 index 00000000000..136bfd24eb2 --- /dev/null +++ b/egs/yomdle_korean/v1/local/augment_data.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +verticle_shift=0 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 + +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp" + +for set in aug1; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --vertical-shift $verticle_shift \ + --fliplr false --augment 'random_scale' $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1 +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/yomdle_korean/v1/local/chain/compare_wer.sh b/egs/yomdle_korean/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..80f31e0f311 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/compare_wer.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh b/egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..fcf59f917c1 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1b.sh \ No newline at end of file diff --git a/egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh b/egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh new file mode 100755 index 00000000000..cea60a221a1 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# score_basic score_nomalized +# WER 13.64 10.6 +# WER (rescored) 13.13 10.2 +# CER 2.99 3.0 +# CER (rescored) 2.88 2.9 +# Final train prob 0.0113 +# Final valid prob 0.0152 +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=48 nj=5..8 num-params=3.0M dim=40->352 combine=0.047->0.047 (over 2) logprob:train/valid[31,47,final]=(0.002,0.008,0.011/0.008,0.013,0.015) + +set -e +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +lang_decode=data/lang +decode_e2e=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.apply-deriv-weights true 
\ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 5 \ + --trainer.optimization.num-jobs-final 8 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..c43d7c669c1 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,236 @@ +#!/bin/bash + +# e2eali_1a is the same as 1a but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh exp/old/chain/cnn_e2eali_1a/ +# System cnn_e2eali_1a +# WER 15.68 +# CER 3.18 +# Final train prob -0.0331 +# Final valid prob -0.0395 + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a/ +# exp/old/chain/cnn_e2eali_1a/: num-iters=33 nj=3..16 num-params=5.2M dim=40->456 combine=-0.035->-0.035 (over 1) xent:train/valid[21,32,final]=(-0.226,-0.175,-0.169/-0.248,-0.202,-0.195) logprob:train/valid[21,32,final]=(-0.039,-0.034,-0.033/-0.046,-0.040,-0.039) + +# Normalize scoring +# WER = 11.7 +# CER = 3.3 + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +decode_chain=false +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ] && $decode_chain; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ] && $decode_chain; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --beam 12 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 + + echo "Done. Date: $(date). Results:" + local/chain/compare_wer.sh $dir +fi diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..8fca9235f46 --- /dev/null +++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -0,0 +1,208 @@ +#!/bin/bash + +# e2eali_1b is the same as e2eali_1a but has fewer CNN layers, smaller +# l2-regularize, more epochs and uses dropout. 
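+# (Explanatory note on the dropout schedule set below; the format follows the standard
+# --trainer.dropout-schedule convention of comma-separated values with optional
+# @data-fraction marks, interpolated linearly over training.)  With
+# dropout_schedule='0,0@0.20,0.2@0.50,0' the dropout proportion is held at 0 for the
+# first 20% of training, ramps up to 0.2 by the 50% point, and then decays back to 0
+# by the end of training.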
+ +#local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b/ +# System cnn_e2eali_1b +# score_basic score_nomalized +# WER 13.01 10.0 +# WER (rescored) 12.69 9.6 +# CER 2.78 3.0 +# CER (rescored) 2.70 2.8 +# Final train prob -0.0568 +# Final valid prob -0.0410 +#steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +#exp/chain/cnn_e2eali_1b: num-iters=67 nj=3..16 num-params=5.2M dim=40->464 combine=-0.052->-0.052 (over 1) xent:train/valid[43,66,final]=(-0.379,-0.319,-0.304/-0.291,-0.234,-0.227) logprob:train/valid[43,66,final]=(-0.069,-0.058,-0.057/-0.046,-0.041,-0.041) +set -e -o pipefail +stage=0 +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=1000 +# we don't need extra left/right context for TDNN systems. +tdnn_dim=550 +# training options +srand=0 +remove_egs=false +lang_decode=data/lang +decode_chain=true +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-dropout-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=16 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi diff --git a/egs/yomdle_korean/v1/local/check_tools.sh b/egs/yomdle_korean/v1/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/yomdle_korean/v1/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/yomdle_korean/v1/local/extract_features.sh b/egs/yomdle_korean/v1/local/extract_features.sh new file mode 100755 index 00000000000..3880ebad3e8 --- /dev/null +++ b/egs/yomdle_korean/v1/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment='no_aug' +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. 
./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment_type $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/yomdle_korean/v1/local/normalize_data.py b/egs/yomdle_korean/v1/local/normalize_data.py new file mode 100755 index 00000000000..fba3e762789 --- /dev/null +++ b/egs/yomdle_korean/v1/local/normalize_data.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string +import unicodedata +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + words = line.strip().split() + uttid = words[0] + transcript = ' '.join(words[1:]) + text_normalized = unicodedata.normalize('NFC', transcript) + output.write(uttid + ' ' + text_normalized + '\n') diff --git a/egs/yomdle_korean/v1/local/prepare_dict.sh b/egs/yomdle_korean/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..22db5ae834d --- /dev/null +++ b/egs/yomdle_korean/v1/local/prepare_dict.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Babak Rekabdar +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +. ./utils/parse_options.sh || exit 1; + +mkdir -p $dir + +local/prepare_lexicon.py $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/yomdle_korean/v1/local/prepare_lexicon.py b/egs/yomdle_korean/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..ec8d43d8335 --- /dev/null +++ b/egs/yomdle_korean/v1/local/prepare_lexicon.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Babak Rekabdar +# 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon for BPE. It gets the set of all words that occur in data/train/text. +# Since this lexicon is based on BPE, it replaces '|' with silence. 
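+# Illustrative example (hypothetical tokens, not taken from the data): NFD
+# normalization decomposes a Hangul syllable into its jamo, so a training-text token
+# '한' would get the lexicon entry '한 ᄒ ᅡ ᆫ', while a token carrying the BPE
+# boundary marker, such as '한|', would map to '한| ᄒ ᅡ ᆫ SIL' ('|' becomes SIL and
+# any '#' characters are dropped).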
+ +import argparse +import os +import unicodedata +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data', 'train', 'text') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + for i in range(1, len(line_vect)): + char_normalized = unicodedata.normalize('NFD', line_vect[i]).replace('\n', '') + characters = list(char_normalized) + characters = " ".join([ 'SIL' if char == '|' else char for char in characters]) + characters = list(characters) + characters = "".join([ '' if char == '#' else char for char in characters]) + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/yomdle_korean/v1/local/process_corpus.py b/egs/yomdle_korean/v1/local/process_corpus.py new file mode 100755 index 00000000000..b39030270b7 --- /dev/null +++ b/egs/yomdle_korean/v1/local/process_corpus.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora +# Apache 2.0 +# This script reads valid phones and removes the lines in the corpus +# which have any other phone. + +import os +import sys, io + +phone_file = os.path.join('data/local/text/cleaned/phones.txt') +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +phone_dict = dict() +with open(phone_file, 'r', encoding='utf-8') as phone_fh: + for line in phone_fh: + line = line.strip().split()[0] + phone_dict[line] = line + +phone_dict[' '] = ' ' +corpus_text = list() +for line in infile: + text = line.strip() + skip_text = False + for phone in text: + if phone not in phone_dict.keys(): + skip_text = True + break + if not skip_text: + output.write(text+ '\n') + diff --git a/egs/yomdle_korean/v1/local/process_data.py b/egs/yomdle_korean/v1/local/process_data.py new file mode 100755 index 00000000000..d7546b0a803 --- /dev/null +++ b/egs/yomdle_korean/v1/local/process_data.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# 2018 Chun Chieh Chang + +""" This script reads the extracted Tamil OCR (yomdle and slam) database files + and creates the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + Eg. local/process_data.py data/download/ data/local/splits/train.txt data/train + + Eg. 
text file: english_phone_books_0001_1 To sum up, then, it would appear that + utt2spk file: english_phone_books_0001_0 english_phone_books_0001 + images.scp file: english_phone_books_0001_0 \ + data/download/truth_line_image/english_phone_books_0001_0.png +""" + +import argparse +import os +import sys +import csv +import itertools +import unicodedata +import re +import string +import unicodedata +parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files") +parser.add_argument('database_path', type=str, help='Path to data') +parser.add_argument('data_split', type=str, help='Path to file that contain datasplits') +parser.add_argument('out_dir', type=str, help='directory to output files') +args = parser.parse_args() + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +with open(args.data_split) as f: + for line in f: + line = line.strip() + image_id = line + image_filename = image_id + '.png' + image_filepath = os.path.join(args.database_path, 'truth_line_image', image_filename) + if not os.path.isfile (image_filepath): + print("File does not exist {}".format(image_filepath)) + continue + line_id = int(line.split('_')[-1]) + csv_filename = '_'.join(line.split('_')[:-1]) + '.csv' + csv_filepath = os.path.join(args.database_path, 'truth_csv', csv_filename) + csv_file = open(csv_filepath, 'r', encoding='utf-8') + for row in csv.reader(csv_file): + if row[1] == image_filename: + text = row[11] + text_vect = text.split() # this is to avoid non-utf-8 spaces + text = " ".join(text_vect) + #text_normalized = unicodedata.normalize('NFD', text).replace('\n', '') + if not text: + continue + text_fh.write(image_id + ' ' + text + '\n') + utt2spk_fh.write(image_id + ' ' + '_'.join(line.split('_')[:-1]) + '\n') + image_fh.write(image_id + ' ' + image_filepath + '\n') diff --git a/egs/yomdle_korean/v1/local/score.sh b/egs/yomdle_korean/v1/local/score.sh new file mode 100755 index 00000000000..31564d25326 --- /dev/null +++ b/egs/yomdle_korean/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh new file mode 100755 index 00000000000..654880fcf59 --- /dev/null +++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh @@ -0,0 +1,327 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# 2018 Ashish Arora +# Apache 2.0 +# This script is semi-supervised recipe with 25k line images of supervised data +# and 22k line images of unsupervised data with naive splitting. +# Based on "Semi-Supervised Training of Acoustic Models using Lattice-Free MMI", +# Vimal Manohar, Hossein Hadian, Daniel Povey, Sanjeev Khudanpur, ICASSP 2018 +# http://www.danielpovey.com/files/2018_icassp_semisupervised_mmi.pdf +# local/semisup/run_semisup.sh shows how to call this. + +# We use 3-gram LM trained on 5M lines of auxilary data. +# This script uses the same tree as that for the seed model. 
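+# For illustration only (local/semisup/run_semisup.sh is the intended caller; the
+# values shown are simply this script's defaults), a direct invocation would look like:
+#   local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh \
+#     --supervised-set train --unsupervised-set train_unsup \
+#     --sup-chain-dir exp/chain/cnn_e2eali_1b \
+#     --sup-lat-dir exp/chain/e2e_train_lats \
+#     --sup-tree-dir exp/chain/tree_e2e \
+#     --exp-root exp/semisup_100k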
+# Unsupervised set: train_unsup (25k tamil line images) +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices +# output-0 and output-1 are for superivsed and unsupervised data respectively. + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b/ exp/semisup_100k/chain/tdnn_semisup_1a/ +# System cnn_e2eali_1b tdnn_semisup_1a +# WER 15.06 13.83 +# CER 3.15 2.83 +# Final train prob -0.0343 0.6103-0.0360 +# Final valid prob -0.0403 0.6054-0.0418 + +# steps/info/chain_dir_info.pl exp/semisup_100k/chain/tdnn_semisup_1a/ +# exp/semisup_100k/chain/tdnn_semisup_1a/: num-iters=58 nj=6..16 num-params=3.7M dim=40->456 combine=0.240->0.240 (over 1) + +# Normalize scoring +#WER = 10.4 +#CER = 2.9 + +set -u -e -o pipefail + +stage=0 # Start from -1 for supervised seed system training +train_stage=-100 +nj=30 +test_nj=30 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} +exp_root=exp/semisup_100k +chain_affix= # affix for chain dir +tdnn_affix=_semisup_1a # affix for semi-supervised chain system + +# Datasets-Expects supervised_set and unsupervised_set +supervised_set=train +unsupervised_set=train_unsup + +# Input seed system +sup_chain_dir=exp/chain/cnn_e2eali_1b # supervised chain system +sup_lat_dir=exp/chain/e2e_train_lats # Seed model options +sup_tree_dir=exp/chain/tree_e2e # tree directory for supervised chain system + +# Semi-supervised options +supervision_weights=1.0,1.0 # Weights for supervised, unsupervised data egs. + # Can be used to scale down the effect of unsupervised data + # by using a smaller scale for it e.g. 1.0,0.3 +lm_weights=3,2 # Weights on phone counts from supervised, unsupervised data for denominator FST creation + +sup_egs_dir= # Supply this to skip supervised egs creation +unsup_egs_dir= # Supply this to skip unsupervised egs creation +unsup_egs_opts= # Extra options to pass to unsupervised egs creation +# Neural network opts +xent_regularize=0.1 +tdnn_dim=450 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +dir=$exp_root/chain$chain_affix/tdnn$tdnn_affix +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + output name=output-0 input=output.affine + output name=output-1 input=output.affine + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_$supervised_set + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$cmd" \ + --left-tolerance 3 --right-tolerance 3 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --frames-overlap-per-eg 0 --constrained false \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 2000000 \ + --cmvn-opts "$cmvn_opts" \ + --generate-egs-scp true \ + data/${supervised_set} $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=340,300,200,100 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=6.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=3 # frame-tolerance for chain training + +unsup_lat_dir=$sup_chain_dir/decode_$unsupervised_set +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_$unsupervised_set + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 2000000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_$unsupervised_set/weights.scp \ + --generate-egs-scp true $unsup_egs_opts \ + data/$unsupervised_set $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. 
+ train_stage=-4 +fi + +chunk_width=340,300,200,100 +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --egs.chunk-width=$chunk_width \ + --cmd "$cmd" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00001 \ + --chain.apply-deriv-weights=true \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=0 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --trainer.frames-per-iter=2000000 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 5 \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs false \ + --feat-dir data/$supervised_set \ + --tree-dir $sup_tree_dir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph +fi + +if [ $stage -le 18 ]; then + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --beam 12 --frames-per-chunk 340 --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi +exit 0; + diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh new file mode 100755 index 00000000000..eb688151665 --- /dev/null +++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh @@ -0,0 +1,325 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# 2018 Ashish Arora +# Apache 2.0 +# This script is semi-supervised recipe with 25k line images of supervised data +# and 22k line images of unsupervised data with naive splitting. +# Based on "Semi-Supervised Training of Acoustic Models using Lattice-Free MMI", +# Vimal Manohar, Hossein Hadian, Daniel Povey, Sanjeev Khudanpur, ICASSP 2018 +# http://www.danielpovey.com/files/2018_icassp_semisupervised_mmi.pdf +# local/semisup/run_semisup.sh shows how to call this. + +# We use 3-gram LM trained on 5M lines of auxilary data. +# This script uses the same tree as that for the seed model. +# Unsupervised set: train_unsup (25k tamil line images) +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices +# output-0 and output-1 are for superivsed and unsupervised data respectively. 
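+# (Clarifying note: the xconfig below adds 'output-0' and 'output-1' nodes that are
+# copies of the main output, and the egs-combination stage
+# (steps/nnet3/chain/multilingual/combine_egs.sh) routes the supervised egs to
+# output-0 and the unsupervised egs to output-1, scaled by $supervision_weights, so
+# the two objectives can be tracked separately during training.)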
+ +# local/chain/compare_wer.sh exp/semisup_100k/chain/tdnn_semisup_1b/ +# System tdnn_semisup_1b +# score_basic score_normalized +# WER 13.73 10.2 +# WER (rescored) 12.80 9.4 +# CER 2.78 2.8 +# CER (rescored) 2.57 2.7 +# Final train prob 0.6138-0.0337 +# Final valid prob 0.6115-0.0399 + +# steps/info/chain_dir_info.pl exp/semisup_100k/chain/tdnn_semisup_1b/ +# exp/semisup_100k/chain/tdnn_semisup_1b/: num-iters=46 nj=6..16 num-params=5.7M dim=40->456 combine=0.239->0.239 (over 1) + +set -u -e -o pipefail +stage=0 # Start from -1 for supervised seed system training +train_stage=-100 +nj=30 +test_nj=30 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} +exp_root=exp/semisup_100k +chain_affix= # affix for chain dir +tdnn_affix=_semisup_1b # affix for semi-supervised chain system + +# Datasets-Expects supervised_set and unsupervised_set +supervised_set=train +unsupervised_set=train_unsup + +# Input seed system +sup_chain_dir=exp/chain/cnn_e2eali_1b # supervised chain system +sup_lat_dir=exp/chain/e2e_train_lats # Seed model options +sup_tree_dir=exp/chain/tree_e2e # tree directory for supervised chain system + +# Semi-supervised options +supervision_weights=1.0,1.0 # Weights for supervised, unsupervised data egs. + # Can be used to scale down the effect of unsupervised data + # by using a smaller scale for it e.g. 1.0,0.3 +lm_weights=3,2 # Weights on phone counts from supervised, unsupervised data for denominator FST creation + +sup_egs_dir= # Supply this to skip supervised egs creation +unsup_egs_dir= # Supply this to skip unsupervised egs creation +unsup_egs_opts= # Extra options to pass to unsupervised egs creation +# Neural network opts +xent_regularize=0.1 +tdnn_dim=550 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +dropout_schedule='0,0@0.20,0.2@0.50,0' +dir=$exp_root/chain$chain_affix/tdnn$tdnn_affix +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-dropout-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + output name=output-0 input=output.affine + output name=output-1 input=output.affine + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_$supervised_set + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$cmd" \ + --left-tolerance 3 --right-tolerance 3 \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --frames-overlap-per-eg 0 --constrained false \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 2000000 \ + --cmvn-opts "$cmvn_opts" \ + --generate-egs-scp true \ + data/${supervised_set} $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=340,300,200,100 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=6.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=3 # frame-tolerance for chain training + +unsup_lat_dir=$sup_chain_dir/decode_$unsupervised_set +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_$unsupervised_set + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 2000000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_$unsupervised_set/weights.scp \ + --generate-egs-scp true $unsup_egs_opts \ + data/$unsupervised_set $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$cmd" \ + --block-size 64 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. 
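+  # (The weighted den.fst was created in an earlier stage of this script from
+  # the supervised and unsupervised phone counts -- see $lm_weights above --
+  # so starting train.py from stage -4 avoids rebuilding it.)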
+  train_stage=-4
+fi
+
+chunk_width=340,300,200,100
+if [ $stage -le 15 ]; then
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --egs.dir "$comb_egs_dir" \
+    --egs.chunk-width=$chunk_width \
+    --cmd "$cmd" \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00001 \
+    --chain.apply-deriv-weights=true \
+    --chain.frame-subsampling-factor=$frame_subsampling_factor \
+    --chain.alignment-subsampling-factor=1 \
+    --chain.left-tolerance 3 \
+    --chain.right-tolerance 3 \
+    --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \
+    --trainer.srand=0 \
+    --trainer.optimization.shrink-value=1.0 \
+    --trainer.num-chunk-per-minibatch=32,16 \
+    --trainer.optimization.momentum=0.0 \
+    --trainer.frames-per-iter=2000000 \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs 16 \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.optimization.num-jobs-initial 6 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --egs.opts="--frames-overlap-per-eg 0 --constrained false" \
+    --cleanup.remove-egs false \
+    --feat-dir data/$supervised_set \
+    --tree-dir $sup_tree_dir \
+    --lat-dir $sup_lat_dir \
+    --dir $dir || exit 1;
+
+fi
+
+if [ $stage -le 17 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph
+fi
+
+if [ $stage -le 18 ]; then
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --beam 12 --frames-per-chunk 340 --nj $nj --cmd "$cmd" \
+    $dir/graph data/test $dir/decode_test
+
+  steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
+    data/test $dir/decode_test{,_rescored} || exit 1
+fi
+exit 0;
+
diff --git a/egs/yomdle_korean/v1/local/semisup/process_data.py b/egs/yomdle_korean/v1/local/semisup/process_data.py
new file mode 100755
index 00000000000..94ad770ec2d
--- /dev/null
+++ b/egs/yomdle_korean/v1/local/semisup/process_data.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3

+# Copyright  2018  Ashish Arora
+#            2018  Chun Chieh Chang
+
+""" This script reads the SLAM boxed Korean OCR dataset and creates the utt2spk
+    and images.scp files. Since the boxed data has no transcripts, the text file
+    it writes contains only the placeholder word 'semisup'. It is a separate
+    script because the data handled by local/process_data.py contains empty
+    transcripts that must be removed, or they would break the BPE step.
+
+    Eg. local/semisup/process_data.py data/download/ data/local/splits/train_unsup.txt
+        data/train_unsup
+
+    Eg. utt2spk file: english_phone_books_0001_0 english_phone_books_0001
+        images.scp file: english_phone_books_0001_0 \
+                         data/download/truth_line_image/english_phone_books_0001_0.png
+"""
+import argparse
+import os
+import sys
+import csv
+import itertools
+import unicodedata
+import re
+import string
+parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files")
+parser.add_argument('database_path', type=str, help='Path to data')
+parser.add_argument('data_split', type=str, help='Path to file that contains the data splits')
+parser.add_argument('out_dir', type=str, help='directory to output files')
+args = parser.parse_args()
+
+### main ###
+print("Processing '{}' data...".format(args.out_dir))
+
+utt2spk_file = os.path.join(args.out_dir, 'utt2spk')
+utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8')
+image_file = os.path.join(args.out_dir, 'images.scp')
+image_fh = open(image_file, 'w', encoding='utf-8')
+text_file = os.path.join(args.out_dir, 'text')
+text_fh = open(text_file, 'w', encoding='utf-8')
+
+with open(args.data_split) as f:
+    for line in f:
+        line = line.strip()
+        image_id = line
+        image_filename = image_id + '.png'
+        image_filepath = os.path.join(args.database_path, 'truth_line_image', image_filename)
+        if not os.path.isfile(image_filepath):
+            print("File does not exist {}".format(image_filepath))
+            continue
+        line_id = int(line.split('_')[-1])
+        csv_filename = '_'.join(line.split('_')[:-1]) + '.csv'
+        csv_filepath = os.path.join(args.database_path, 'truth_csv', csv_filename)
+        csv_file = open(csv_filepath, 'r', encoding='utf-8')
+        for row in csv.reader(csv_file):
+            if row[1] == image_filename:
+                text = 'semisup'
+                text_fh.write(image_id + ' ' + text + '\n')
+                utt2spk_fh.write(image_id + ' ' + '_'.join(line.split('_')[:-1]) + '\n')
+                image_fh.write(image_id + ' ' + image_filepath + '\n')
diff --git a/egs/yomdle_korean/v1/local/semisup/run_semisup.sh b/egs/yomdle_korean/v1/local/semisup/run_semisup.sh
new file mode 100755
index 00000000000..5e20f50c99e
--- /dev/null
+++ b/egs/yomdle_korean/v1/local/semisup/run_semisup.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+# Copyright 2017  Vimal Manohar
+#           2018  Ashish Arora
+# Apache 2.0
+
+# This script demonstrates semi-supervised training using 25k line images of
+# supervised data and 22k line images of unsupervised data.
+# We assume the supervised data is in data/train and the unsupervised data
+# is in data/train_unsup.
+# For LM training, we also use extra Korean corpus text (see local/train_lm.sh).
+
+set -e
+set -o pipefail
+stage=0
+nj=30
+exp_root=exp/semisup_56k
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+mkdir -p data/train_unsup/data
+if [ $stage -le 0 ]; then
+  echo "stage 0: Processing train unsupervised data...$(date)"
+  local/semisup/process_data.py data/download/ \
+    data/local/splits/train_unsup.txt \
+    data/train_unsup
+  image/fix_data_dir.sh data/train_unsup
+fi
+
+if [ $stage -le 1 ]; then
+  echo "stage 1: Obtaining image groups. calling get_image2num_frames..."
+  image/get_image2num_frames.py --feat-dim 40 data/train_unsup
+  image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train_unsup
+  echo "Extracting features and calling compute_cmvn_stats: $(date) "
+  local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train_unsup
+  steps/compute_cmvn_stats.sh data/train_unsup || exit 1;
+  image/fix_data_dir.sh data/train_unsup
+fi
+
+for f in data/train/utt2spk data/train_unsup/utt2spk \
+  data/train/text; do
+  if [ ! -f $f ]; then
+    echo "$0: Could not find $f"
+    exit 1;
+  fi
+done
+
+# Prepare the combined semi-supervised train set.
+if [ $stage -le 1 ]; then
+  utils/combine_data.sh data/semisup100k_250k \
+    data/train data/train_unsup || exit 1
+fi
+
+###############################################################################
+# Semi-supervised training using 25k line images of supervised data and
+# 22k line images of unsupervised data. We use the tree, lattices
+# and seed chain system from the previous stage.
+###############################################################################
+if [ $stage -le 2 ]; then
+  local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh \
+    --supervised-set train \
+    --unsupervised-set train_unsup \
+    --sup-chain-dir exp/chain/cnn_e2eali_1b_ep16_7cnn \
+    --sup-lat-dir exp/chain/e2e_train_lats \
+    --sup-tree-dir exp/chain/tree_e2e \
+    --chain-affix "" \
+    --tdnn-affix _semisup_ep16_7cnn \
+    --stage 15 --train_stage 9 \
+    --exp-root $exp_root || exit 1
+fi
diff --git a/egs/yomdle_korean/v1/local/train_lm.sh b/egs/yomdle_korean/v1/local/train_lm.sh
new file mode 100755
index 00000000000..c73c42fb7dc
--- /dev/null
+++ b/egs/yomdle_korean/v1/local/train_lm.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Ashish Arora
+#           2017  Hossein Hadian
+# Apache 2.0
+#
+# This script trains an LM on the training transcriptions and corpus text.
+# It is based on the example scripts distributed with PocoLM.
+
+# It checks that PocoLM is installed and exits with installation instructions if it is not.
+
+set -e
+stage=0
+dir=data/local/local_lm
+order=6
+echo "$0 $@"  # Print the command line for logging
+. ./utils/parse_options.sh || exit 1;
+
+lm_dir=${dir}/data
+
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+  cd $KALDI_ROOT/tools || exit 1;
+  if [ -d pocolm ]; then
+    echo Not installing the pocolm toolkit since it is already there.
+  else
+    echo "$0: Please install the PocoLM toolkit with: "
+    echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+    exit 1;
+  fi
+) || exit 1;
+
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters,
+# un-comment the following line and change the numbers to appropriate values.
+# You can find the values in the output log of train_lm.py.
+# These example metaparameters are for a 4-gram model (with min-counts)
+# trained with train_lm.py.
+# The dev perplexity should be close to that of the non-bypassed model.
+#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.031,0.860,0.678,0.194,0.037,0.006,0.928,0.712,0.454,0.220,0.926,0.844,0.749,0.358,0.966,0.879,0.783,0.544,0.966,0.826,0.674,0.450"
+# Note: to use these example parameters, you may need to remove the .done files
+# so that make_lm_dir.py is called again and trains only a 3-gram model.
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  # use the validation data as the dev set.
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
+
+  cat data/local/text/cleaned/bpe_val.txt > ${dir}/data/text/dev.txt
+  # use the training data as an additional data source.
+  # we can later fold the dev data into this.
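+  # At the end of this stage ${dir}/data/text holds three data sources:
+  #   dev.txt          - BPE-encoded validation text (pocolm treats 'dev' as the dev set)
+  #   train.txt        - training transcriptions (created just below)
+  #   corpus_text.txt  - BPE-encoded external corpus text (created just below)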
+ cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + cat data/local/text/cleaned/bpe_corpus.txt > ${dir}/data/text/corpus_text.txt + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from train and corpus text + cat ${dir}/data/text/{train,corpus_text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 10 million n-grams for a big LM for rescoring purposes. + size=10000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + #[perplexity = 22.0613098868] over 151116.0 words + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 2 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=2000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + #[perplexity = 23.4801171202] over 151116.0 words + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/yomdle_korean/v1/local/wer_output_filter b/egs/yomdle_korean/v1/local/wer_output_filter new file mode 100755 index 00000000000..59e364e0231 --- /dev/null +++ b/egs/yomdle_korean/v1/local/wer_output_filter @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. 
It is used in scoring + +import sys, io +import string +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/yomdle_korean/v1/local/yomdle b/egs/yomdle_korean/v1/local/yomdle new file mode 120000 index 00000000000..2c4544c1399 --- /dev/null +++ b/egs/yomdle_korean/v1/local/yomdle @@ -0,0 +1 @@ +../../../yomdle_tamil/v1/local/yomdle/ \ No newline at end of file diff --git a/egs/yomdle_korean/v1/path.sh b/egs/yomdle_korean/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/yomdle_korean/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/yomdle_korean/v1/run_end2end.sh b/egs/yomdle_korean/v1/run_end2end.sh new file mode 100755 index 00000000000..65f5beb4b08 --- /dev/null +++ b/egs/yomdle_korean/v1/run_end2end.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2018 Hossein Hadian +# Ashish Arora +# Jonathan Chang +# Apache 2.0 + +set -e +stage=0 +nj=30 + +language_main=Korean +slam_dir=/export/corpora5/slam/SLAM/ +yomdle_dir=/export/corpora5/slam/YOMDLE/ +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/ko/ +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +./local/check_tools.sh +# Start from stage=-2 for data preparation. This stage stores line images, +# csv files and splits{train,test,train_unsup} data/download/truth_line_image, +# data/download/truth_csv and data/local/splits respectively. +if [ $stage -le -2 ]; then + echo "$(date): preparing data, obtaining line images and csv files..." + local/yomdle/create_download_dir.sh --language_main $language_main \ + --slam_dir $slam_dir --yomdle_dir $yomdle_dir +fi + +if [ $stage -le -1 ]; then + echo "$(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/ko.txt + head -20000 data/local/text/ko.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/ko.txt > data/local/text/cleaned/corpus.txt +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 0 ]; then + echo "$0 stage 0: Processing train and test data.$(date)" + echo " creating text, images.scp, utt2spk and spk2utt" + #local/prepare_data.sh data/download/ + for set in train test; do + local/process_data.py data/download/ \ + data/local/splits/${set}.txt data/${set} + image/fix_data_dir.sh data/${set} + done +fi + +if [ $stage -le 1 ]; then + echo "$(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in train test; do + echo "$(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; + done + image/fix_data_dir.sh data/train +fi + +if [ $stage -le 3 ]; then + echo "$(date) stage 3: BPE preparation" + # getting non-silence phones. 
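+  # Every distinct character in the training transcripts becomes a "phone".
+  # As a made-up example, if data/train/text contained only the line
+  #   utt_001 안녕하세요 kaldi
+  # the snippet below would print the unique characters
+  #   안 녕 하 세 요 k a l d i
+  # one per line.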
+ cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; + +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/text/cleaned/phones.txt + + cut -d' ' -f2- data/train/text > data/local/text/cleaned/train.txt + + echo "learning BPE..." + # it is currently learned with only training text but we can also use all corpus text + # to learn BPE. phones are added so that one isolated occurance of every phone exists. + cat data/local/text/cleaned/phones.txt data/local/text/cleaned/train.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$(date) stage 4: applying BPE..." + echo "applying BPE on train, test text..." + for set in test train; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "applying BPE to corpus text..." + cat data/local/text/cleaned/corpus.txt | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/local/text/cleaned/bpe_corpus.txt + cat data/local/text/cleaned/val.txt | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/local/text/cleaned/bpe_val.txt +fi + +if [ $stage -le 5 ]; then + echo "$(date) stage 5: Preparing dictionary and lang..." + local/prepare_dict.sh --dir data/local/dict + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 4 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 6 ]; then + echo "$(date) stage 6: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 7 ]; then + echo "$(date) stage 7: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +chunk_width='340,300,200,100' +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if [ $stage -le 8 ]; then + echo "$(date) stage 8: Building a tree and training a regular chain model using the e2e alignments..." + local/chain/run_cnn_e2eali.sh --chunk_width $chunk_width +fi + +if [ $stage -le 9 ]; then + echo "$(date) stage 9: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 10 ] && $decode_e2e; then + echo "$(date) stage 10: decoding end2end setup..." 
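+  # Decoding follows the usual chain-model convention: --acwt 1.0 because
+  # chain models are decoded with an acoustic scale of 1.0, and
+  # --post-decode-acwt 10.0 scales up the acoustic scores in the output
+  # lattices so that the language-model weights used by the scoring scripts
+  # are in the same range as for conventional models.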
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; + + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" --beam 12 \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1 + + echo "Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +fi + +if [ $stage -le 11 ] && $decode_chain; then + echo "$(date) stage 11: decoding chain alignment setup..." + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" --beam 12 \ + exp/chain/cnn_e2eali_1a/graph data/test exp/chain/cnn_e2eali_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/cnn_e2eali_1a/decode_test{,_rescored} || exit 1 + + echo "Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +fi diff --git a/egs/yomdle_korean/v1/steps b/egs/yomdle_korean/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/yomdle_korean/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/yomdle_korean/v1/utils b/egs/yomdle_korean/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/yomdle_korean/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/yomdle_tamil/README.txt b/egs/yomdle_tamil/README.txt new file mode 100644 index 00000000000..0f295e5ae5f --- /dev/null +++ b/egs/yomdle_tamil/README.txt @@ -0,0 +1,3 @@ +This directory contains example scripts for OCR on the Yomdle and Slam datasets. +Training is done on the Yomdle dataset and testing is done on Slam. +LM rescoring is also done with extra corpus data obtained from various sources. 
diff --git a/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py b/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py
index 8f03be874e7..dd4bf536692 100755
--- a/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py
+++ b/egs/yomdle_tamil/v1/local/yomdle/create_line_image_from_page_image.py
@@ -29,7 +29,8 @@
 from scipy.spatial import ConvexHull
 from PIL import Image
 from scipy.misc import toimage
-
+from pathlib import Path
+from glob import glob
 parser = argparse.ArgumentParser(description="Creates line images from page image")
 parser.add_argument('image_dir', type=str, help='Path to full page images')
 parser.add_argument('csv_dir', type=str, help='Path to csv files')
@@ -321,10 +322,18 @@ def update_minimum_bounding_box_input(bounding_box_input):
 ### main ###
 globvar = 0
 text_fh = open(args.output_file, 'w', encoding='utf-8')
-for filename in sorted(os.listdir(args.csv_dir)):
-    with open(os.path.join(args.csv_dir, filename), 'r', encoding='utf-8') as f:
-        image_file = os.path.join(args.image_dir, filename.split('.')[0] + args.ext)
-        im = Image.open(image_file).convert('L')
+file_list = list(Path(args.csv_dir).rglob("*.[cC][sS][vV]"))
+for filename in sorted(file_list):
+    filename = str(filename)
+    with open(filename, 'r', encoding='utf-8') as f:
+        base_name = os.path.basename(filename)
+        image_file = os.path.join(args.image_dir, base_name.split('.')[0] + args.ext)
+        try:
+            im = Image.open(image_file).convert('L')
+        except Exception:
+            print("Error: cannot open image " + image_file)
+            globvar += 1
+            continue
+        im = pad_image(im)
         for row in itertools.islice(csv.reader(f), 1, None):
             points = []