#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER  Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# bash script to mine for bitexts in the BUCC corpus

if [ -z ${LASER+x} ] ; then
  echo "Please set the environment variable 'LASER'"
  exit 1
fi

# general config
bucc="bucc2018"
data="."
xdir=${data}/downloaded   # tar files as distributed by the BUCC evaluation
ddir=${data}/${bucc}      # raw texts of BUCC
edir=${data}/embed        # normalized texts and embeddings
langs=("fr" "de" "ru" "zh")
ltrg="en"                 # English is always the 2nd language

# encoder
model_dir="${LASER}/models"
encoder="${model_dir}/bilstm.93langs.2018-12-26.pt"
bpe_codes="${model_dir}/93langs.fcodes"

###################################################################
#
# Extract files with labels and texts from the BUCC corpus
#
###################################################################

GetData () {
  fn1=$1; fn2=$2; lang=$3
  outf="${edir}/${bucc}.${lang}-${ltrg}.${fn2}"
  for ll in ${ltrg} ${lang} ; do
    inf="${ddir}/${fn1}.${ll}"
    if [ ! -f ${outf}.txt.${ll} ] ; then
      echo " - extract files ${outf} in ${ll}"
      cat ${inf} | cut -f1 > ${outf}.id.${ll}
      cat ${inf} | cut -f2 > ${outf}.txt.${ll}
    fi
  done
}

ExtractBUCC () {
  slang=$1
  tlang=${ltrg}
  pushd ${data} > /dev/null
  if [ ! -d ${ddir}/${slang}-${tlang} ] ; then
    for tf in ${xdir}/${bucc}-${slang}-${tlang}.*.tar.bz2 ; do
      echo " - extract from tar `basename ${tf}`"
      tar jxf ${tf}
    done
  fi
  GetData "${slang}-${tlang}/${slang}-${tlang}.sample" "dev" ${slang}
  GetData "${slang}-${tlang}/${slang}-${tlang}.training" "train" ${slang}
  GetData "${slang}-${tlang}/${slang}-${tlang}.test" "test" ${slang}
  popd > /dev/null
}

###################################################################
#
# Tokenize and Embed
#
###################################################################

Embed () {
  ll=$2
  txt="$1.txt.${ll}"
  enc="$1.enc.${ll}"
  if [ ! -s ${enc} ] ; then
    cat ${txt} | python3 ${LASER}/source/embed.py \
      --encoder ${encoder} \
      --token-lang ${ll} \
      --bpe-codes ${bpe_codes} \
      --output ${enc} \
      --verbose
  fi
}
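# Note: embed.py stores the sentence embeddings as a raw binary file of
# float32 vectors; for this 93-language BiLSTM encoder each vector has
# 1024 dimensions. An optional sanity check of an encoded file (sketch
# only; hypothetical example path, assumes numpy is installed):
#
#   python3 -c "import numpy as np; \
#     e = np.fromfile('embed/bucc2018.de-en.train.enc.de', np.float32); \
#     print(e.reshape(-1, 1024).shape)"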
###################################################################
#
# Mine for bitexts
#
###################################################################

Mine () {
  bn=$1
  l1=$2
  l2=$3
  cand="${bn}.candidates.tsv"
  if [ ! -s ${cand} ] ; then
    python3 ${LASER}/source/mine_bitexts.py \
      ${bn}.txt.${l1} ${bn}.txt.${l2} \
      --src-lang ${l1} --trg-lang ${l2} \
      --src-embeddings ${bn}.enc.${l1} --trg-embeddings ${bn}.enc.${l2} \
      --unify --mode mine --retrieval max --margin ratio -k 4 \
      --output ${cand} \
      --verbose --gpu
  fi
}

###################################################################
#
# Main loop
#
###################################################################

echo -e "\nProcessing BUCC data in ${data}"

# create output directories
for d in ${ddir} ${edir} ; do
  mkdir -p ${d}
done

for lsrc in ${langs[@]} ; do
  ExtractBUCC ${lsrc}

  # tokenize and embed the training texts
  bname="${bucc}.${lsrc}-${ltrg}"
  part="${bname}.train"
  Embed ${edir}/${part} ${lsrc}
  Embed ${edir}/${part} ${ltrg}

  # mine for bitexts in the training data
  Mine ${edir}/${part} ${lsrc} ${ltrg}

  # optimize the threshold on the BUCC training data and the provided gold alignments
  if [ ! -s ${part}.log ] ; then
    python3 bucc.py \
      --src-lang ${lsrc} --trg-lang ${ltrg} \
      --bucc-texts ${edir}/${part}.txt \
      --bucc-ids ${edir}/${part}.id \
      --candidates ${edir}/${part}.candidates.tsv \
      --gold ${ddir}/${lsrc}-${ltrg}/${lsrc}-${ltrg}.training.gold \
      --verbose \
      | tee ${part}.log
  fi

  # tokenize and embed the test texts
  part="${bname}.test"
  Embed ${edir}/${part} ${lsrc}
  Embed ${edir}/${part} ${ltrg}

  # mine for bitexts in the test data
  Mine ${edir}/${part} ${lsrc} ${ltrg}

  # extract test bitexts with the threshold optimized on train
  th=`grep 'best threshold' ${bname}.train.log | sed -e 's/[=:]/ /g' | awk '{print $4}'`
  extracted="${edir}/${part}.extracted.tsv"
  if [ ! -s ${extracted} ] ; then
    python3 bucc.py \
      --src-lang ${lsrc} --trg-lang ${ltrg} \
      --bucc-texts ${edir}/${part}.txt \
      --bucc-ids ${edir}/${part}.id \
      --candidates ${edir}/${part}.candidates.tsv \
      --threshold ${th} --output ${extracted} \
      --verbose
  fi
done

# Bonus: mine bitexts for all language pairs which do not involve English,
# using a (conservative) threshold of 1.1.
# All texts are expected to be already tokenized and embedded by the main loop above.
th=1.1
for lsrc in ${langs[@]} ; do
  for ltrg in ${langs[@]} ; do
    if [ ${lsrc} != "en" -a ${ltrg} != "en" -a ${lsrc} != ${ltrg} ] ; then
      bitext="${bucc}.${lsrc}-${ltrg}.train.extracted.th${th}.csv"
      if [ ! -s ${bitext} ] ; then
        echo "Extracting bitexts for ${lsrc}-${ltrg}"
        python3 ${LASER}/source/mine_bitexts.py \
          ${edir}/${bucc}.${lsrc}-en.train.txt.${lsrc} \
          ${edir}/${bucc}.${ltrg}-en.train.txt.${ltrg} \
          --src-lang ${lsrc} --trg-lang ${ltrg} \
          --src-embeddings ${edir}/${bucc}.${lsrc}-en.train.enc.${lsrc} \
          --trg-embeddings ${edir}/${bucc}.${ltrg}-en.train.enc.${ltrg} \
          --unify --mode mine --retrieval max --margin ratio -k 4 \
          --output ${bitext} --threshold ${th} \
          --verbose --gpu
      fi
    fi
  done
done
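# Note: the mined files are expected to contain one sentence pair per line,
# with the margin score in the first column, i.e. "score<TAB>source<TAB>target"
# (based on mine_bitexts.py's mine mode; verify with your LASER version).
# To eyeball the highest-scoring pairs of a mined file, e.g. for fr-de:
#
#   sort -t$'\t' -k1,1gr bucc2018.fr-de.train.extracted.th1.1.csv | head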