JST_LOGO.JPG NICT_LOGO.JPG KYOTO-U_LOGO.JPG

WAT 2017

The 4th Workshop on Asian Translation
Baseline Systems
Tree-to-String SMT
for the EJ and CJ subtasks

[BASELINE SYSTEMS TOP] | [SETUP] | [TRAINING LANGUAGE MODEL] | [TRAINING TRANSLATION MODEL] | [TUNING] | [TRANSLATING] | [DETOKENIZE THE OUTPUT]

Setup

(Here, ${LANG_F} is the source language and ${LANG_E} is the target language; "en" and "ja" are example values.)
LANG_F=en
LANG_E=ja
CORPUS_LM=../corpus.tok/train
(For the EJ subtask of ASPEC, use "CORPUS_LM=../corpus.tok/train-all" instead.)
CORPUS=../corpus.tree/train-clean
DEV_F=../corpus.tree/dev.${LANG_F}
DEV_E=../corpus.tok/dev.${LANG_E}
TEST=../corpus.tree/test.${LANG_F}
REF=../corpus.tok/test.${LANG_E}
LM_ORDER=5
JOBS=16

MOSES_SCRIPT=${path}/mosesdecoder-RELEASE-2.1.1/scripts
MOSES_BIN_DIR=${path}/mosesdecoder-RELEASE-2.1.1/bin
EXT_BIN_DIR=${path}/giza-pp/bin

WORK_DIR=work.${LANG_F}-${LANG_E}
TRAINING_DIR=${WORK_DIR}/training
MODEL_DIR=${WORK_DIR}/training/model

mkdir tree2stringModel
cd tree2stringModel/
mkdir -p ${TRAINING_DIR}/lm
Back to top

Training Language Model

LM_FILE=`pwd`/${TRAINING_DIR}/lm/lm.${LANG_E}.arpa.gz

${MOSES_BIN_DIR}/lmplz --order ${LM_ORDER} -S 80% -T /tmp < ${CORPUS_LM}.${LANG_E} | gzip > ${LM_FILE}
Back to top

Training Translation Model

${MOSES_SCRIPT}/training/train-model.perl \
  --root-dir `pwd`/${TRAINING_DIR} \
  --model-dir `pwd`/${MODEL_DIR} \
  --corpus ${CORPUS} \
  --external-bin-dir ${EXT_BIN_DIR} \
  --f ${LANG_F} \
  --e ${LANG_E} \
  --parallel \
  --alignment grow-diag-final-and \
  --score-options "--GoodTuring" \
  --hierarchical \
  --glue-grammar \
  --lm 0:${LM_ORDER}:${LM_FILE}:8 \
  --source-syntax \
  --extract-options "--MaxSpan 1000 --MinHoleSource 1 --MinWords 0 --NonTermConsecSource --AllowOnlyUnalignedWords" \
  --cores ${JOBS} \
  --sort-buffer-size 10G \
  --parallel \
  >& ${TRAINING_DIR}/training_TM.log
Back to top

Tuning

mkdir -p ${WORK_DIR}/tuning

${MOSES_SCRIPT}/training/mert-moses.pl \
  ${DEV_F} \
  ${DEV_E} \
  ${MOSES_BIN_DIR}/moses_chart \
  `pwd`/${MODEL_DIR}/moses.ini \
  --mertdir ${MOSES_BIN_DIR} \
  --working-dir `pwd`/${WORK_DIR}/tuning/mert \
  --threads ${JOBS} \
  --no-filter-phrase-table \
  --decoder-flags "-threads ${JOBS} -max-chart-span 1000" \
  --inputtype 3 \
  --predictable-seeds \
  >& ${WORK_DIR}/tuning/mert.log


  • Insert weights into the configuration file:
    perl ${MOSES_SCRIPT}/ems/support/substitute-weights.perl \
      ${MODEL_DIR}/moses.ini \
      ${WORK_DIR}/tuning/mert/moses.ini \
      ${MODEL_DIR}/moses-tuned.ini

    Back to top

    Translating

    OUTPUT_DIR=${WORK_DIR}/output
    mkdir ${OUTPUT_DIR}
    outfile=${OUTPUT_DIR}/test.out

    ${MOSES_BIN_DIR}/moses_chart -config ${MODEL_DIR}/moses-tuned.ini -max-chart-span 1000 -threads ${JOBS} -inputtype 3 < ${TEST} > ${outfile} 2> ${outfile}.log

    Back to top

    Recase the output

  • For Indonesian:
    ${MOSES_SCRIPT}/recaser/detruecase.perl < ${outfile} > ${outfile}.tok
    Back to top

    Detokenize the output

  • For Japanese:
    cat ${outfile} | \
      perl -Mencoding=utf8 -pe 's/([^A-Za-zA-Za-z]) +/${1}/g; s/ +([^A-Za-zA-Za-z])/${1}/g; ' \
      > ${outfile}.detok

  • For Indonesian:
    ${MOSES_SCRIPT}/tokenizer/detokenizer.perl -l en < ${outfile}.tok > ${outfile}.detok

    Back to top

    JST (Japan Science and Technology Agency)
    NICT (National Institute of Information and Communications Technology)
    Kyoto University
    Last Modified: 2017-07-21