1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
- # Copyright (c) 2017-present, Facebook, Inc.
- # All rights reserved.
- #
- # This source code is licensed under the license found in the LICENSE file in
- # the root directory of this source tree. An additional grant of patent rights
- # can be found in the PATENTS file in the same directory.
- from collections import Counter
- import re
- import torch
# Matches any run of one or more whitespace characters. Written as a raw
# string so that ``\s`` reaches the regex engine instead of being parsed as
# an (invalid) string escape sequence.
SPACE_NORMALIZER = re.compile(r"\s+")


def tokenize_line(line):
    """Split a line of text into whitespace-separated tokens.

    Every run of whitespace is first collapsed to a single space and the
    result is stripped, so leading/trailing whitespace (including the
    trailing newline of a line read from a file) never yields empty tokens.

    Args:
        line: the text to tokenize.

    Returns:
        A list of token strings; empty when ``line`` holds only whitespace.
    """
    line = SPACE_NORMALIZER.sub(" ", line)
    line = line.strip()
    return line.split()
class Tokenizer:
    """Static helpers that convert text into tensors of vocabulary indices.

    Every method takes a vocabulary object through the ``dict`` parameter.
    The members used here are ``add_symbol``, ``index``, ``eos_word``,
    ``eos_index``, ``unk_index`` and ``unk_word``.
    """

    @staticmethod
    def add_file_to_dictionary(filename, dict, tokenize):
        """Add every token of a text file to ``dict``.

        Args:
            filename: path of the text file to read.
            dict: vocabulary object; ``add_symbol`` is called once per token
                and once more with ``dict.eos_word`` after each line.
            tokenize: callable mapping one line of text to its tokens.
        """
        # Decode explicitly as UTF-8 so the resulting vocabulary does not
        # depend on the platform's default locale encoding.
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                for word in tokenize(line):
                    dict.add_symbol(word)
                dict.add_symbol(dict.eos_word)

    @staticmethod
    def binarize(filename, dict, consumer, tokenize=tokenize_line,
                 append_eos=True, reverse_order=False):
        """Convert each line of a file to an index tensor and hand it on.

        Lines are converted with ``Tokenizer.tokenize`` using
        ``add_if_not_exist=False``, so ``dict`` is only queried, not grown.

        Args:
            filename: path of the text file to read.
            dict: vocabulary object used to look up token indices.
            consumer: callable invoked with the index tensor of each line.
            tokenize: callable mapping one line of text to its tokens.
            append_eos: forwarded to ``Tokenizer.tokenize``.
            reverse_order: forwarded to ``Tokenizer.tokenize``.

        Returns:
            A dict with the keys ``'nseq'`` (number of lines processed),
            ``'ntok'`` (total length of all produced tensors), ``'nunk'``
            (number of token occurrences mapped to ``dict.unk_index``) and
            ``'replaced'`` (number of distinct tokens mapped to it).
        """
        nseq, ntok = 0, 0
        replaced = Counter()

        def replaced_consumer(word, idx):
            # Count words that fell back to the unknown index; the literal
            # unknown word itself is not a replacement.
            if idx == dict.unk_index and word != dict.unk_word:
                replaced[word] += 1

        # Decode explicitly as UTF-8 so the output does not depend on the
        # platform's default locale encoding.
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                ids = Tokenizer.tokenize(
                    line=line,
                    dict=dict,
                    tokenize=tokenize,
                    add_if_not_exist=False,
                    consumer=replaced_consumer,
                    append_eos=append_eos,
                    reverse_order=reverse_order,
                )
                nseq += 1
                consumer(ids)
                ntok += len(ids)

        return {
            'nseq': nseq,
            'nunk': sum(replaced.values()),
            'ntok': ntok,
            'replaced': len(replaced),
        }

    @staticmethod
    def tokenize(line, dict, tokenize=tokenize_line, add_if_not_exist=True,
                 consumer=None, append_eos=True, reverse_order=False):
        """Convert one line of text to a tensor of vocabulary indices.

        Args:
            line: the text to convert.
            dict: vocabulary object used to map tokens to indices.
            tokenize: callable mapping ``line`` to a sequence of tokens.
            add_if_not_exist: when true, indices come from
                ``dict.add_symbol(word)``; otherwise from ``dict.index(word)``.
            consumer: optional callable invoked as ``consumer(word, idx)``
                for every token, in output order.
            append_eos: when true, ``dict.eos_index`` is appended as the
                final element.
            reverse_order: when true, the tokens are reversed before being
                converted; an appended EOS index still comes last.

        Returns:
            A one-dimensional ``torch.IntTensor`` holding one index per
            token, plus the EOS index when ``append_eos`` is true.
        """
        words = tokenize(line)
        if reverse_order:
            words = list(reversed(words))

        lookup = dict.add_symbol if add_if_not_exist else dict.index

        # Collect indices in a plain list and build the tensor once, rather
        # than writing into a preallocated tensor element by element.
        ids = []
        for word in words:
            idx = lookup(word)
            if consumer is not None:
                consumer(word, idx)
            ids.append(idx)
        if append_eos:
            ids.append(dict.eos_index)
        return torch.IntTensor(ids)
|