Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

prepare-iwslt14.sh 2.9 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
  #!/usr/bin/env bash
  #
  # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
  #
  # Prepares the IWSLT14 de-en corpus for training. Steps, in order:
  #   1. Clone the Moses (tokenizer scripts) and Subword NMT (BPE) repositories
  #      into the current directory, unless they are already present.
  #   2. Download de-en.tgz into orig/ and unpack it there.
  #   3. Strip markup from the training files, tokenize, run Moses
  #      clean-corpus-n.perl (-ratio 1.5, bounds 1 and 175) and lowercase.
  #   4. Extract the <seg> lines of the IWSLT14.TED*.xml files, tokenize and
  #      lowercase them.
  #   5. Split: every 23rd training line goes to valid, the rest to train;
  #      the dev2010/dev2012/tst2010/tst2011/tst2012 sets are concatenated
  #      into test.
  #   6. Learn one BPE code on source+target training text and apply it to
  #      train/valid/test.
  #
  # Output: iwslt14.tokenized.de-en/{train,valid,test}.{de,en} and
  #         iwslt14.tokenized.de-en/code. Intermediate files live in
  #         iwslt14.tokenized.de-en/tmp/.
  #
  # Exits non-zero as soon as a required step fails.

  # -e: stop at the first failing step instead of continuing on broken data.
  # -u: treat a misspelled/unset variable as an error.
  # pipefail is deliberately NOT enabled: grep exits 1 when it selects no
  # lines, which is a legitimate outcome in the filtering pipelines below.
  set -eu

  # Cloning into an existing directory fails, so skip it on re-runs.
  if [ ! -d mosesdecoder ]; then
    echo 'Cloning Moses github repository (for tokenization scripts)...'
    git clone https://github.com/moses-smt/mosesdecoder.git
  fi

  if [ ! -d subword-nmt ]; then
    echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
    git clone https://github.com/rsennrich/subword-nmt.git
  fi

  SCRIPTS=mosesdecoder/scripts
  TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
  LC=$SCRIPTS/tokenizer/lowercase.perl
  CLEAN=$SCRIPTS/training/clean-corpus-n.perl
  BPEROOT=subword-nmt
  BPE_TOKENS=10000

  URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz"
  GZ=de-en.tgz

  if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts." >&2
    exit 1
  fi

  src=de
  tgt=en
  lang=de-en
  prep=iwslt14.tokenized.de-en
  tmp=$prep/tmp
  orig=orig

  mkdir -p "$orig" "$tmp" "$prep"

  echo "Downloading data from ${URL}..."
  # Run wget in a subshell so the script's own working directory never
  # changes. Its exit status is ignored on purpose: success is judged by the
  # presence of the archive, so a tarball placed in orig/ by hand still works.
  (cd "$orig" && wget "$URL") || true

  if [ -f "$orig/$GZ" ]; then
    echo "Data successfully downloaded."
  else
    echo "Data not successfully downloaded." >&2
    exit 1
  fi

  # Unpack next to the archive; the subshell again keeps the cwd untouched.
  (cd "$orig" && tar zxvf "$GZ")

  echo "pre-processing train data..."
  for l in "$src" "$tgt"; do
    f=train.tags.$lang.$l
    tok=train.tags.$lang.tok.$l

    # Drop the <url>/<talkid>/<keywords> lines, remove the <title> and
    # <description> tags (keeping their text), then tokenize.
    grep -v -e '<url>' -e '<talkid>' -e '<keywords>' "$orig/$lang/$f" \
      | sed \
          -e 's/<title>//g' \
          -e 's/<\/title>//g' \
          -e 's/<description>//g' \
          -e 's/<\/description>//g' \
      | perl "$TOKENIZER" -threads 8 -l "$l" > "$tmp/$tok"
    echo ""
  done

  perl "$CLEAN" -ratio 1.5 "$tmp/train.tags.$lang.tok" "$src" "$tgt" \
    "$tmp/train.tags.$lang.clean" 1 175

  for l in "$src" "$tgt"; do
    perl "$LC" < "$tmp/train.tags.$lang.clean.$l" > "$tmp/train.tags.$lang.$l"
  done

  echo "pre-processing valid/test data..."
  for l in "$src" "$tgt"; do
    # Glob directly instead of parsing `ls` output.
    for o in "$orig/$lang"/IWSLT14.TED*."$l".xml; do
      # An unmatched glob stays literal; skip it rather than grep a
      # non-existent file.
      [ -e "$o" ] || continue

      fname=${o##*/}        # strip the directory part
      f=$tmp/${fname%.*}    # strip the trailing .xml
      echo "$o" "$f"

      # Keep only the <seg> lines, unwrap them, turn the typographic
      # apostrophe into a plain one, then tokenize and lowercase.
      grep '<seg id' "$o" \
        | sed \
            -e 's/<seg id="[0-9]*">\s*//g' \
            -e 's/\s*<\/seg>\s*//g' \
            -e "s/’/'/g" \
        | perl "$TOKENIZER" -threads 8 -l "$l" \
        | perl "$LC" > "$f"
      echo ""
    done
  done

  echo "creating train, valid, test..."
  for l in "$src" "$tgt"; do
    # Every 23rd line is held out as validation data; the rest is training.
    awk 'NR % 23 == 0' "$tmp/train.tags.$lang.$l" > "$tmp/valid.$l"
    awk 'NR % 23 != 0' "$tmp/train.tags.$lang.$l" > "$tmp/train.$l"

    cat "$tmp/IWSLT14.TED.dev2010.$lang.$l" \
        "$tmp/IWSLT14.TEDX.dev2012.$lang.$l" \
        "$tmp/IWSLT14.TED.tst2010.$lang.$l" \
        "$tmp/IWSLT14.TED.tst2011.$lang.$l" \
        "$tmp/IWSLT14.TED.tst2012.$lang.$l" \
        > "$tmp/test.$l"
  done

  TRAIN=$tmp/train.en-de
  BPE_CODE=$prep/code

  # The BPE code is learned on source and target training text together;
  # `>` truncates any file left over from a previous run.
  cat "$tmp/train.$src" "$tmp/train.$tgt" > "$TRAIN"

  echo "learn_bpe.py on ${TRAIN}..."
  python "$BPEROOT/learn_bpe.py" -s "$BPE_TOKENS" < "$TRAIN" > "$BPE_CODE"

  for l in "$src" "$tgt"; do
    for f in "train.$l" "valid.$l" "test.$l"; do
      echo "apply_bpe.py to ${f}..."
      python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" < "$tmp/$f" > "$prep/$f"
    done
  done
Tip!

Press p or ← to see the previous file, or n or → to see the next file.

Comments

Loading...