tokenizer.py

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

from collections import Counter
import re

import torch

# Collapse any run of whitespace into a single space (raw string avoids an
# invalid-escape warning on newer Pythons).
SPACE_NORMALIZER = re.compile(r"\s+")


def tokenize_line(line):
    """Split a line into whitespace-delimited tokens."""
    line = SPACE_NORMALIZER.sub(" ", line)
    line = line.strip()
    return line.split()


class Tokenizer:

    @staticmethod
    def add_file_to_dictionary(filename, dict, tokenize):
        """Add every token in `filename`, plus one EOS per line, to `dict`."""
        with open(filename, 'r') as f:
            for line in f:
                for word in tokenize(line):
                    dict.add_symbol(word)
                dict.add_symbol(dict.eos_word)

    @staticmethod
    def binarize(filename, dict, consumer, tokenize=tokenize_line,
                 append_eos=True, reverse_order=False):
        """Convert each line of `filename` to a tensor of token ids and pass it
        to `consumer`, while counting how many tokens fell back to <unk>."""
        nseq, ntok = 0, 0
        replaced = Counter()

        def replaced_consumer(word, idx):
            # Track words that mapped to the unknown index (excluding a
            # literal unk token appearing in the input).
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with open(filename, 'r') as f:
            for line in f:
                ids = Tokenizer.tokenize(
                    line=line,
                    dict=dict,
                    tokenize=tokenize,
                    add_if_not_exist=False,
                    consumer=replaced_consumer,
                    append_eos=append_eos,
                    reverse_order=reverse_order,
                )
                nseq += 1
                consumer(ids)
                ntok += len(ids)
        return {'nseq': nseq, 'nunk': sum(replaced.values()),
                'ntok': ntok, 'replaced': len(replaced)}

    @staticmethod
    def tokenize(line, dict, tokenize=tokenize_line, add_if_not_exist=True,
                 consumer=None, append_eos=True, reverse_order=False):
        """Map a line of text to an IntTensor of token ids using `dict`."""
        words = tokenize(line)
        if reverse_order:
            words = list(reversed(words))
        nwords = len(words)
        # Reserve one extra slot for the EOS id when requested.
        ids = torch.IntTensor(nwords + 1 if append_eos else nwords)

        for i, word in enumerate(words):
            if add_if_not_exist:
                idx = dict.add_symbol(word)
            else:
                idx = dict.index(word)
            if consumer is not None:
                consumer(word, idx)
            ids[i] = idx
        if append_eos:
            ids[nwords] = dict.eos_index
        return ids
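
Usage note: the sketch below is a minimal, hypothetical example of driving Tokenizer.tokenize. The Dictionary class here is a stand-in written for illustration, not part of this file; it implements only what the code above expects a dictionary to provide (add_symbol, index, unk_index, unk_word, eos_word, eos_index).

# Hypothetical stand-in for the dictionary object Tokenizer expects
# (fairseq supplies a real Dictionary class with this interface).
class Dictionary:
    def __init__(self):
        self.unk_word, self.eos_word = '<unk>', '</s>'
        self.symbols = []
        self.indices = {}
        self.unk_index = self.add_symbol(self.unk_word)
        self.eos_index = self.add_symbol(self.eos_word)

    def add_symbol(self, word):
        # Assign the next free id to unseen words; return the existing id otherwise.
        if word not in self.indices:
            self.indices[word] = len(self.symbols)
            self.symbols.append(word)
        return self.indices[word]

    def index(self, word):
        # Unknown words fall back to unk_index.
        return self.indices.get(word, self.unk_index)


d = Dictionary()
# Default add_if_not_exist=True grows the vocabulary as it tokenizes.
print(Tokenizer.tokenize("hello   world\n", d))
# -> tensor([2, 3, 1], dtype=torch.int32)  (ids for hello, world, then EOS)

# With add_if_not_exist=False, unseen words map to unk_index instead.
print(Tokenizer.tokenize("hello unseen", d, add_if_not_exist=False))
# -> tensor([2, 0, 1], dtype=torch.int32)

Tokenizer.binarize follows the same pattern over a whole file, calling its consumer callback once per line with the resulting id tensor and returning corpus statistics (nseq, ntok, nunk, replaced).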